net/ipv6/ip6_output.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  *      IPv6 output functions
   4  *      Linux INET6 implementation
   5  *
   6  *      Authors:
   7  *      Pedro Roque             <roque@di.fc.ul.pt>
   8  *
   9  *      Based on linux/net/ipv4/ip_output.c
  10  *
  11  *      Changes:
  12  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
  13  *                              extension headers are implemented.
  14  *                              route changes now work.
  15  *                              ip6_forward does not confuse sniffers.
  16  *                              etc.
  17  *
  18  *      H. von Brand    :       Added missing #include <linux/string.h>
  19  *      Imran Patel     :       frag id should be in NBO
  20  *      Kazunori MIYAZAWA @USAGI
  21  *                      :       add ip6_append_data and related functions
  22  *                              for datagram xmit
  23  */
  24
  25 #include <linux/errno.h>
  26 #include <linux/kernel.h>
  27 #include <linux/string.h>
  28 #include <linux/socket.h>
  29 #include <linux/net.h>
  30 #include <linux/netdevice.h>
  31 #include <linux/if_arp.h>
  32 #include <linux/in6.h>
  33 #include <linux/tcp.h>
  34 #include <linux/route.h>
  35 #include <linux/module.h>
  36 #include <linux/slab.h>
  37
  38 #include <linux/bpf-cgroup.h>
  39 #include <linux/netfilter.h>
  40 #include <linux/netfilter_ipv6.h>
  41
  42 #include <net/sock.h>
  43 #include <net/snmp.h>
  44
  45 #include <net/ipv6.h>
  46 #include <net/ndisc.h>
  47 #include <net/protocol.h>
  48 #include <net/ip6_route.h>
  49 #include <net/addrconf.h>
  50 #include <net/rawv6.h>
  51 #include <net/icmp.h>
  52 #include <net/xfrm.h>
  53 #include <net/checksum.h>
  54 #include <linux/mroute6.h>
  55 #include <net/l3mdev.h>
  56 #include <net/lwtunnel.h>
  57 #include <net/ip_tunnels.h>
  58
  59 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
  60 {
  61         struct dst_entry *dst = skb_dst(skb);
  62         struct net_device *dev = dst->dev;
  63         struct inet6_dev *idev = ip6_dst_idev(dst);
  64         unsigned int hh_len = LL_RESERVED_SPACE(dev);
  65         const struct in6_addr *daddr, *nexthop;
  66         struct ipv6hdr *hdr;
  67         struct neighbour *neigh;
  68         int ret;
  69
  70         /* Be paranoid, rather than too clever. */
  71         if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
  72                 skb = skb_expand_head(skb, hh_len);
  73                 if (!skb) {
  74                         IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
  75                         return -ENOMEM;
  76                 }
  77         }
  78
  79         hdr = ipv6_hdr(skb);
  80         daddr = &hdr->daddr;
  81         if (ipv6_addr_is_multicast(daddr)) {
  82                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
  83                     ((mroute6_is_socket(net, skb) &&
  84                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
  85                      ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
  86                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
  87
  88                         /* Do not check for IFF_ALLMULTI; multicast routing
  89                            is not supported in any case.
  90                          */
  91                         if (newskb)
  92                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
  93                                         net, sk, newskb, NULL, newskb->dev,
  94                                         dev_loopback_xmit);
  95
  96                         if (hdr->hop_limit == 0) {
  97                                 IP6_INC_STATS(net, idev,
  98                                               IPSTATS_MIB_OUTDISCARDS);
  99                                 kfree_skb(skb);
 100                                 return 0;
 101                         }
 102                 }
 103
 104                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
 105                 if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
 106                     !(dev->flags & IFF_LOOPBACK)) {
 107                         kfree_skb(skb);
 108                         return 0;
 109                 }
 110         }
 111
 112         if (lwtunnel_xmit_redirect(dst->lwtstate)) {
 113                 int res = lwtunnel_xmit(skb);
 114
 115                 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
 116                         return res;
 117         }
 118
 119         rcu_read_lock_bh();
 120         nexthop = rt6_nexthop((struct rt6_info *)dst, daddr);
 121         neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
 122         if (unlikely(!neigh))
 123                 neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
 124         if (!IS_ERR(neigh)) {
 125                 sock_confirm_neigh(skb, neigh);
 126                 ret = neigh_output(neigh, skb, false);
 127                 rcu_read_unlock_bh();
 128                 return ret;
 129         }
 130         rcu_read_unlock_bh();
 131
 132         IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
 133         kfree_skb(skb);
 134         return -EINVAL;
 135 }
 136
 137 static int
 138 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
 139                                     struct sk_buff *skb, unsigned int mtu)
 140 {
 141         struct sk_buff *segs, *nskb;
 142         netdev_features_t features;
 143         int ret = 0;
 144
 145         /* Please see corresponding comment in ip_finish_output_gso
 146          * describing the cases where GSO segment length exceeds the
 147          * egress MTU.
 148          */
 149         features = netif_skb_features(skb);
 150         segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
 151         if (IS_ERR_OR_NULL(segs)) {
 152                 kfree_skb(skb);
 153                 return -ENOMEM;
 154         }
 155
 156         consume_skb(skb);
 157
 158         skb_list_walk_safe(segs, segs, nskb) {
 159                 int err;
 160
 161                 skb_mark_not_on_list(segs);
 162                 err = ip6_fragment(net, sk, segs, ip6_finish_output2);
 163                 if (err && ret == 0)
 164                         ret = err;
 165         }
 166
 167         return ret;
 168 }
 169
 170 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 171 {
 172         unsigned int mtu;
 173
 174 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
 175         /* Policy lookup after SNAT yielded a new policy */
 176         if (skb_dst(skb)->xfrm) {
 177                 IP6CB(skb)->flags |= IP6SKB_REROUTED;
 178                 return dst_output(net, sk, skb);
 179         }
 180 #endif
 181
 182         mtu = ip6_skb_dst_mtu(skb);
 183         if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))
 184                 return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
 185
 186         if ((skb->len > mtu && !skb_is_gso(skb)) ||
 187             dst_allfrag(skb_dst(skb)) ||
 188             (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
 189                 return ip6_fragment(net, sk, skb, ip6_finish_output2);
 190         else
 191                 return ip6_finish_output2(net, sk, skb);
 192 }
 193
 194 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 195 {
 196         int ret;
 197
 198         ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
 199         switch (ret) {
 200         case NET_XMIT_SUCCESS:
 201                 return __ip6_finish_output(net, sk, skb);
 202         case NET_XMIT_CN:
 203                 return __ip6_finish_output(net, sk, skb) ? : ret;
 204         default:
 205                 kfree_skb(skb);
 206                 return ret;
 207         }
 208 }
 209
 210 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 211 {
 212         struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
 213         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 214
 215         skb->protocol = htons(ETH_P_IPV6);
 216         skb->dev = dev;
 217
 218         if (unlikely(idev->cnf.disable_ipv6)) {
 219                 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
 220                 kfree_skb(skb);
 221                 return 0;
 222         }
 223
 224         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
 225                             net, sk, skb, indev, dev,
 226                             ip6_finish_output,
 227                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 228 }
 229 EXPORT_SYMBOL(ip6_output);
 230
 231 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
 232 {
 233         if (!np->autoflowlabel_set)
 234                 return ip6_default_np_autolabel(net);
 235         else
 236                 return np->autoflowlabel;
 237 }
 238
 239 /*
 240  * xmit an sk_buff (used by TCP, SCTP and DCCP)
 241  * Note : socket lock is not held for SYNACK packets, but might be modified
 242  * by calls to skb_set_owner_w() and ipv6_local_error(),
 243  * which are using proper atomic operations or spinlocks.
 244  */
 245 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 246              __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
 247 {
 248         struct net *net = sock_net(sk);
 249         const struct ipv6_pinfo *np = inet6_sk(sk);
 250         struct in6_addr *first_hop = &fl6->daddr;
 251         struct dst_entry *dst = skb_dst(skb);
 252         struct net_device *dev = dst->dev;
 253         struct inet6_dev *idev = ip6_dst_idev(dst);
 254         unsigned int head_room;
 255         struct ipv6hdr *hdr;
 256         u8  proto = fl6->flowi6_proto;
 257         int seg_len = skb->len;
 258         int hlimit = -1;
 259         u32 mtu;
 260
 261         head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dev);
 262         if (opt)
 263                 head_room += opt->opt_nflen + opt->opt_flen;
 264
 265         if (unlikely(head_room > skb_headroom(skb))) {
 266                 skb = skb_expand_head(skb, head_room);
 267                 if (!skb) {
 268                         IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
 269                         return -ENOBUFS;
 270                 }
 271         }
 272
 273         if (opt) {
 274                 seg_len += opt->opt_nflen + opt->opt_flen;
 275
 276                 if (opt->opt_flen)
 277                         ipv6_push_frag_opts(skb, opt, &proto);
 278
 279                 if (opt->opt_nflen)
 280                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
 281                                              &fl6->saddr);
 282         }
 283
 284         skb_push(skb, sizeof(struct ipv6hdr));
 285         skb_reset_network_header(skb);
 286         hdr = ipv6_hdr(skb);
 287
 288         /*
 289          *      Fill in the IPv6 header
 290          */
 291         if (np)
 292                 hlimit = np->hop_limit;
 293         if (hlimit < 0)
 294                 hlimit = ip6_dst_hoplimit(dst);
 295
 296         ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
 297                                 ip6_autoflowlabel(net, np), fl6));
 298
 299         hdr->payload_len = htons(seg_len);
 300         hdr->nexthdr = proto;
 301         hdr->hop_limit = hlimit;
 302
 303         hdr->saddr = fl6->saddr;
 304         hdr->daddr = *first_hop;
 305
 306         skb->protocol = htons(ETH_P_IPV6);
 307         skb->priority = priority;
 308         skb->mark = mark;
 309
 310         mtu = dst_mtu(dst);
 311         if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
 312                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
 313
 314                 /* if egress device is enslaved to an L3 master device pass the
 315                  * skb to its handler for processing
 316                  */
 317                 skb = l3mdev_ip6_out((struct sock *)sk, skb);
 318                 if (unlikely(!skb))
 319                         return 0;
 320
 321                 /* hooks should never assume socket lock is held.
 322                  * we promote our socket to non const
 323                  */
 324                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
 325                                net, (struct sock *)sk, skb, NULL, dev,
 326                                dst_output);
 327         }
 328
 329         skb->dev = dev;
 330         /* ipv6_local_error() does not require socket lock,
 331          * we promote our socket to non const
 332          */
 333         ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
 334
 335         IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
 336         kfree_skb(skb);
 337         return -EMSGSIZE;
 338 }
 339 EXPORT_SYMBOL(ip6_xmit);
 340
 341 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 342 {
 343         struct ip6_ra_chain *ra;
 344         struct sock *last = NULL;
 345
 346         read_lock(&ip6_ra_lock);
 347         for (ra = ip6_ra_chain; ra; ra = ra->next) {
 348                 struct sock *sk = ra->sk;
 349                 if (sk && ra->sel == sel &&
 350                     (!sk->sk_bound_dev_if ||
 351                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
 352                         struct ipv6_pinfo *np = inet6_sk(sk);
 353
 354                         if (np && np->rtalert_isolate &&
 355                             !net_eq(sock_net(sk), dev_net(skb->dev))) {
 356                                 continue;
 357                         }
 358                         if (last) {
 359                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 360                                 if (skb2)
 361                                         rawv6_rcv(last, skb2);
 362                         }
 363                         last = sk;
 364                 }
 365         }
 366
 367         if (last) {
 368                 rawv6_rcv(last, skb);
 369                 read_unlock(&ip6_ra_lock);
 370                 return 1;
 371         }
 372         read_unlock(&ip6_ra_lock);
 373         return 0;
 374 }
 375
 376 static int ip6_forward_proxy_check(struct sk_buff *skb)
 377 {
 378         struct ipv6hdr *hdr = ipv6_hdr(skb);
 379         u8 nexthdr = hdr->nexthdr;
 380         __be16 frag_off;
 381         int offset;
 382
 383         if (ipv6_ext_hdr(nexthdr)) {
 384                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
 385                 if (offset < 0)
 386                         return 0;
 387         } else
 388                 offset = sizeof(struct ipv6hdr);
 389
 390         if (nexthdr == IPPROTO_ICMPV6) {
 391                 struct icmp6hdr *icmp6;
 392
 393                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
 394                                          offset + 1 - skb->data)))
 395                         return 0;
 396
 397                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 398
 399                 switch (icmp6->icmp6_type) {
 400                 case NDISC_ROUTER_SOLICITATION:
 401                 case NDISC_ROUTER_ADVERTISEMENT:
 402                 case NDISC_NEIGHBOUR_SOLICITATION:
 403                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
 404                 case NDISC_REDIRECT:
 405                         /* For reaction involving unicast neighbor discovery
 406                          * message destined to the proxied address, pass it to
 407                          * input function.
 408                          */
 409                         return 1;
 410                 default:
 411                         break;
 412                 }
 413         }
 414
 415         /*
 416          * The proxying router can't forward traffic sent to a link-local
 417          * address, so signal the sender and discard the packet. This
 418          * behavior is clarified by the MIPv6 specification.
 419          */
 420         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 421                 dst_link_failure(skb);
 422                 return -1;
 423         }
 424
 425         return 0;
 426 }
 427
 428 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
 429                                      struct sk_buff *skb)
 430 {
 431         struct dst_entry *dst = skb_dst(skb);
 432
 433         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 434         __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
 435
 436 #ifdef CONFIG_NET_SWITCHDEV
 437         if (skb->offload_l3_fwd_mark) {
 438                 consume_skb(skb);
 439                 return 0;
 440         }
 441 #endif
 442
 443         skb->tstamp = 0;
 444         return dst_output(net, sk, skb);
 445 }
 446
 447 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
 448 {
 449         if (skb->len <= mtu)
 450                 return false;
 451
 452         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
 453         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
 454                 return true;
 455
 456         if (skb->ignore_df)
 457                 return false;
 458
 459         if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
 460                 return false;
 461
 462         return true;
 463 }
 464
 465 int ip6_forward(struct sk_buff *skb)
 466 {
 467         struct dst_entry *dst = skb_dst(skb);
 468         struct ipv6hdr *hdr = ipv6_hdr(skb);
 469         struct inet6_skb_parm *opt = IP6CB(skb);
 470         struct net *net = dev_net(dst->dev);
 471         struct inet6_dev *idev;
 472         u32 mtu;
 473
 474         idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
 475         if (net->ipv6.devconf_all->forwarding == 0)
 476                 goto error;
 477
 478         if (skb->pkt_type != PACKET_HOST)
 479                 goto drop;
 480
 481         if (unlikely(skb->sk))
 482                 goto drop;
 483
 484         if (skb_warn_if_lro(skb))
 485                 goto drop;
 486
 487         if (!net->ipv6.devconf_all->disable_policy &&
 488             !idev->cnf.disable_policy &&
 489             !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 490                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 491                 goto drop;
 492         }
 493
 494         skb_forward_csum(skb);
 495
 496         /*
 497          *      We DO NOT make any processing on
 498          *      RA packets, pushing them to user level AS IS
 499          *      without ane WARRANTY that application will be able
 500          *      to interpret them. The reason is that we
 501          *      cannot make anything clever here.
 502          *
 503          *      We are not end-node, so that if packet contains
 504          *      AH/ESP, we cannot make anything.
 505          *      Defragmentation also would be mistake, RA packets
 506          *      cannot be fragmented, because there is no warranty
 507          *      that different fragments will go along one path. --ANK
 508          */
 509         if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
 510                 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
 511                         return 0;
 512         }
 513
 514         /*
 515          *      check and decrement ttl
 516          */
 517         if (hdr->hop_limit <= 1) {
 518                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
 519                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
 520
 521                 kfree_skb(skb);
 522                 return -ETIMEDOUT;
 523         }
 524
 525         /* XXX: idev->cnf.proxy_ndp? */
 526         if (net->ipv6.devconf_all->proxy_ndp &&
 527             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
 528                 int proxied = ip6_forward_proxy_check(skb);
 529                 if (proxied > 0) {
 530                         hdr->hop_limit--;
 531                         return ip6_input(skb);
 532                 } else if (proxied < 0) {
 533                         __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 534                         goto drop;
 535                 }
 536         }
 537
 538         if (!xfrm6_route_forward(skb)) {
 539                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 540                 goto drop;
 541         }
 542         dst = skb_dst(skb);
 543
 544         /* IPv6 specs say nothing about it, but it is clear that we cannot
 545            send redirects to source routed frames.
 546            We don't send redirects to frames decapsulated from IPsec.
 547          */
 548         if (IP6CB(skb)->iif == dst->dev->ifindex &&
 549             opt->srcrt == 0 && !skb_sec_path(skb)) {
 550                 struct in6_addr *target = NULL;
 551                 struct inet_peer *peer;
 552                 struct rt6_info *rt;
 553
 554                 /*
 555                  *      incoming and outgoing devices are the same
 556                  *      send a redirect.
 557                  */
 558
 559                 rt = (struct rt6_info *) dst;
 560                 if (rt->rt6i_flags & RTF_GATEWAY)
 561                         target = &rt->rt6i_gateway;
 562                 else
 563                         target = &hdr->daddr;
 564
 565                 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
 566
 567                 /* Limit redirects both by destination (here)
 568                    and by source (inside ndisc_send_redirect)
 569                  */
 570                 if (inet_peer_xrlim_allow(peer, 1*HZ))
 571                         ndisc_send_redirect(skb, target);
 572                 if (peer)
 573                         inet_putpeer(peer);
 574         } else {
 575                 int addrtype = ipv6_addr_type(&hdr->saddr);
 576
 577                 /* This check is security critical. */
 578                 if (addrtype == IPV6_ADDR_ANY ||
 579                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
 580                         goto error;
 581                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
 582                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
 583                                     ICMPV6_NOT_NEIGHBOUR, 0);
 584                         goto error;
 585                 }
 586         }
 587
 588         mtu = ip6_dst_mtu_maybe_forward(dst, true);
 589         if (mtu < IPV6_MIN_MTU)
 590                 mtu = IPV6_MIN_MTU;
 591
 592         if (ip6_pkt_too_big(skb, mtu)) {
 593                 /* Again, force OUTPUT device used as source address */
 594                 skb->dev = dst->dev;
 595                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 596                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
 597                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 598                                 IPSTATS_MIB_FRAGFAILS);
 599                 kfree_skb(skb);
 600                 return -EMSGSIZE;
 601         }
 602
 603         if (skb_cow(skb, dst->dev->hard_header_len)) {
 604                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 605                                 IPSTATS_MIB_OUTDISCARDS);
 606                 goto drop;
 607         }
 608
 609         hdr = ipv6_hdr(skb);
 610
 611         /* Mangling hops number delayed to point after skb COW */
 612
 613         hdr->hop_limit--;
 614
 615         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
 616                        net, NULL, skb, skb->dev, dst->dev,
 617                        ip6_forward_finish);
 618
 619 error:
 620         __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
 621 drop:
 622         kfree_skb(skb);
 623         return -EINVAL;
 624 }
 625
 626 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 627 {
 628         to->pkt_type = from->pkt_type;
 629         to->priority = from->priority;
 630         to->protocol = from->protocol;
 631         skb_dst_drop(to);
 632         skb_dst_set(to, dst_clone(skb_dst(from)));
 633         to->dev = from->dev;
 634         to->mark = from->mark;
 635
 636         skb_copy_hash(to, from);
 637
 638 #ifdef CONFIG_NET_SCHED
 639         to->tc_index = from->tc_index;
 640 #endif
 641         nf_copy(to, from);
 642         skb_ext_copy(to, from);
 643         skb_copy_secmark(to, from);
 644 }
 645
 646 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
 647                       u8 nexthdr, __be32 frag_id,
 648                       struct ip6_fraglist_iter *iter)
 649 {
 650         unsigned int first_len;
 651         struct frag_hdr *fh;
 652
 653         /* BUILD HEADER */
 654         *prevhdr = NEXTHDR_FRAGMENT;
 655         iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 656         if (!iter->tmp_hdr)
 657                 return -ENOMEM;
 658
 659         iter->frag = skb_shinfo(skb)->frag_list;
 660         skb_frag_list_init(skb);
 661
 662         iter->offset = 0;
 663         iter->hlen = hlen;
 664         iter->frag_id = frag_id;
 665         iter->nexthdr = nexthdr;
 666
 667         __skb_pull(skb, hlen);
 668         fh = __skb_push(skb, sizeof(struct frag_hdr));
 669         __skb_push(skb, hlen);
 670         skb_reset_network_header(skb);
 671         memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
 672
 673         fh->nexthdr = nexthdr;
 674         fh->reserved = 0;
 675         fh->frag_off = htons(IP6_MF);
 676         fh->identification = frag_id;
 677
 678         first_len = skb_pagelen(skb);
 679         skb->data_len = first_len - skb_headlen(skb);
 680         skb->len = first_len;
 681         ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
 682
 683         return 0;
 684 }
 685 EXPORT_SYMBOL(ip6_fraglist_init);
 686
 687 void ip6_fraglist_prepare(struct sk_buff *skb,
 688                           struct ip6_fraglist_iter *iter)
 689 {
 690         struct sk_buff *frag = iter->frag;
 691         unsigned int hlen = iter->hlen;
 692         struct frag_hdr *fh;
 693
 694         frag->ip_summed = CHECKSUM_NONE;
 695         skb_reset_transport_header(frag);
 696         fh = __skb_push(frag, sizeof(struct frag_hdr));
 697         __skb_push(frag, hlen);
 698         skb_reset_network_header(frag);
 699         memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
 700         iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
 701         fh->nexthdr = iter->nexthdr;
 702         fh->reserved = 0;
 703         fh->frag_off = htons(iter->offset);
 704         if (frag->next)
 705                 fh->frag_off |= htons(IP6_MF);
 706         fh->identification = iter->frag_id;
 707         ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
 708         ip6_copy_metadata(frag, skb);
 709 }
 710 EXPORT_SYMBOL(ip6_fraglist_prepare);
 711
 712 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
 713                    unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
 714                    u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
 715 {
 716         state->prevhdr = prevhdr;
 717         state->nexthdr = nexthdr;
 718         state->frag_id = frag_id;
 719
 720         state->hlen = hlen;
 721         state->mtu = mtu;
 722
 723         state->left = skb->len - hlen;  /* Space per frame */
 724         state->ptr = hlen;              /* Where to start from */
 725
 726         state->hroom = hdr_room;
 727         state->troom = needed_tailroom;
 728
 729         state->offset = 0;
 730 }
 731 EXPORT_SYMBOL(ip6_frag_init);
 732
 733 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
 734 {
 735         u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
 736         struct sk_buff *frag;
 737         struct frag_hdr *fh;
 738         unsigned int len;
 739
 740         len = state->left;
 741         /* IF: it doesn't fit, use 'mtu' - the data space left */
 742         if (len > state->mtu)
 743                 len = state->mtu;
 744         /* IF: we are not sending up to and including the packet end
 745            then align the next start on an eight byte boundary */
 746         if (len < state->left)
 747                 len &= ~7;
 748
 749         /* Allocate buffer */
 750         frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
 751                          state->hroom + state->troom, GFP_ATOMIC);
 752         if (!frag)
 753                 return ERR_PTR(-ENOMEM);
 754
 755         /*
 756          *      Set up data on packet
 757          */
 758
 759         ip6_copy_metadata(frag, skb);
 760         skb_reserve(frag, state->hroom);
 761         skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
 762         skb_reset_network_header(frag);
 763         fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
 764         frag->transport_header = (frag->network_header + state->hlen +
 765                                   sizeof(struct frag_hdr));
 766
 767         /*
 768          *      Charge the memory for the fragment to any owner
 769          *      it might possess
 770          */
 771         if (skb->sk)
 772                 skb_set_owner_w(frag, skb->sk);
 773
 774         /*
 775          *      Copy the packet header into the new buffer.
 776          */
 777         skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
 778
 779         fragnexthdr_offset = skb_network_header(frag);
 780         fragnexthdr_offset += prevhdr - skb_network_header(skb);
 781         *fragnexthdr_offset = NEXTHDR_FRAGMENT;
 782
 783         /*
 784          *      Build fragment header.
 785          */
 786         fh->nexthdr = state->nexthdr;
 787         fh->reserved = 0;
 788         fh->identification = state->frag_id;
 789
 790         /*
 791          *      Copy a block of the IP datagram.
 792          */
 793         BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
 794                              len));
 795         state->left -= len;
 796
 797         fh->frag_off = htons(state->offset);
 798         if (state->left > 0)
 799                 fh->frag_off |= htons(IP6_MF);
 800         ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
 801
 802         state->ptr += len;
 803         state->offset += len;
 804
 805         return frag;
 806 }
 807 EXPORT_SYMBOL(ip6_frag_next);
 808
 809 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 810                  int (*output)(struct net *, struct sock *, struct sk_buff *))
 811 {
 812         struct sk_buff *frag;
 813         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
 814         struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
 815                                 inet6_sk(skb->sk) : NULL;
 816         struct ip6_frag_state state;
 817         unsigned int mtu, hlen, nexthdr_offset;
 818         ktime_t tstamp = skb->tstamp;
 819         int hroom, err = 0;
 820         __be32 frag_id;
 821         u8 *prevhdr, nexthdr = 0;
 822
 823         err = ip6_find_1stfragopt(skb, &prevhdr);
 824         if (err < 0)
 825                 goto fail;
 826         hlen = err;
 827         nexthdr = *prevhdr;
 828         nexthdr_offset = prevhdr - skb_network_header(skb);
 829
 830         mtu = ip6_skb_dst_mtu(skb);
 831
 832         /* We must not fragment if the socket is set to force MTU discovery
 833          * or if the skb it not generated by a local socket.
 834          */
 835         if (unlikely(!skb->ignore_df && skb->len > mtu))
 836                 goto fail_toobig;
 837
 838         if (IP6CB(skb)->frag_max_size) {
 839                 if (IP6CB(skb)->frag_max_size > mtu)
 840                         goto fail_toobig;
 841
 842                 /* don't send fragments larger than what we received */
 843                 mtu = IP6CB(skb)->frag_max_size;
 844                 if (mtu < IPV6_MIN_MTU)
 845                         mtu = IPV6_MIN_MTU;
 846         }
 847
 848         if (np && np->frag_size < mtu) {
 849                 if (np->frag_size)
 850                         mtu = np->frag_size;
 851         }
 852         if (mtu < hlen + sizeof(struct frag_hdr) + 8)
 853                 goto fail_toobig;
 854         mtu -= hlen + sizeof(struct frag_hdr);
 855
 856         frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
 857                                     &ipv6_hdr(skb)->saddr);
 858
 859         if (skb->ip_summed == CHECKSUM_PARTIAL &&
 860             (err = skb_checksum_help(skb)))
 861                 goto fail;
 862
 863         prevhdr = skb_network_header(skb) + nexthdr_offset;
 864         hroom = LL_RESERVED_SPACE(rt->dst.dev);
 865         if (skb_has_frag_list(skb)) {
 866                 unsigned int first_len = skb_pagelen(skb);
 867                 struct ip6_fraglist_iter iter;
 868                 struct sk_buff *frag2;
 869
 870                 if (first_len - hlen > mtu ||
 871                     ((first_len - hlen) & 7) ||
 872                     skb_cloned(skb) ||
 873                     skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
 874                         goto slow_path;
 875
 876                 skb_walk_frags(skb, frag) {
 877                         /* Correct geometry. */
 878                         if (frag->len > mtu ||
 879                             ((frag->len & 7) && frag->next) ||
 880                             skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
 881                                 goto slow_path_clean;
 882
 883                         /* Partially cloned skb? */
 884                         if (skb_shared(frag))
 885                                 goto slow_path_clean;
 886
 887                         BUG_ON(frag->sk);
 888                         if (skb->sk) {
 889                                 frag->sk = skb->sk;
 890                                 frag->destructor = sock_wfree;
 891                         }
 892                         skb->truesize -= frag->truesize;
 893                 }
 894
 895                 err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
 896                                         &iter);
 897                 if (err < 0)
 898                         goto fail;
 899
 900                 for (;;) {
 901                         /* Prepare header of the next frame,
 902                          * before previous one went down. */
 903                         if (iter.frag)
 904                                 ip6_fraglist_prepare(skb, &iter);
 905
 906                         skb->tstamp = tstamp;
 907                         err = output(net, sk, skb);
 908                         if (!err)
 909                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 910                                               IPSTATS_MIB_FRAGCREATES);
 911
 912                         if (err || !iter.frag)
 913                                 break;
 914
 915                         skb = ip6_fraglist_next(&iter);
 916                 }
 917
 918                 kfree(iter.tmp_hdr);
 919
 920                 if (err == 0) {
 921                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 922                                       IPSTATS_MIB_FRAGOKS);
 923                         return 0;
 924                 }
 925
 926                 kfree_skb_list(iter.frag);
 927
 928                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 929                               IPSTATS_MIB_FRAGFAILS);
 930                 return err;
 931
 932 slow_path_clean:
 933                 skb_walk_frags(skb, frag2) {
 934                         if (frag2 == frag)
 935                                 break;
 936                         frag2->sk = NULL;
 937                         frag2->destructor = NULL;
 938                         skb->truesize += frag2->truesize;
 939                 }
 940         }
 941
 942 slow_path:
 943         /*
 944          *      Fragment the datagram.
 945          */
 946
 947         ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
 948                       LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
 949                       &state);
 950
 951         /*
 952          *      Keep copying data until we run out.
 953          */
 954
 955         while (state.left > 0) {
 956                 frag = ip6_frag_next(skb, &state);
 957                 if (IS_ERR(frag)) {
 958                         err = PTR_ERR(frag);
 959                         goto fail;
 960                 }
 961
 962                 /*
 963                  *      Put this fragment into the sending queue.
 964                  */
 965                 frag->tstamp = tstamp;
 966                 err = output(net, sk, frag);
 967                 if (err)
 968                         goto fail;
 969
 970                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 971                               IPSTATS_MIB_FRAGCREATES);
 972         }
 973         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 974                       IPSTATS_MIB_FRAGOKS);
 975         consume_skb(skb);
 976         return err;
 977
 978 fail_toobig:
 979         if (skb->sk && dst_allfrag(skb_dst(skb)))
 980                 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
 981
 982         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 983         err = -EMSGSIZE;
 984
 985 fail:
 986         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 987                       IPSTATS_MIB_FRAGFAILS);
 988         kfree_skb(skb);
 989         return err;
 990 }
 991
 992 static inline int ip6_rt_check(const struct rt6key *rt_key,
 993                                const struct in6_addr *fl_addr,
 994                                const struct in6_addr *addr_cache)
 995 {
 996         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
 997                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
 998 }
 999
1000 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1001                                           struct dst_entry *dst,
1002                                           const struct flowi6 *fl6)
1003 {
1004         struct ipv6_pinfo *np = inet6_sk(sk);
1005         struct rt6_info *rt;
1006
1007         if (!dst)
1008                 goto out;
1009
1010         if (dst->ops->family != AF_INET6) {
1011                 dst_release(dst);
1012                 return NULL;
1013         }
1014
1015         rt = (struct rt6_info *)dst;
1016         /* Yes, checking route validity in not connected
1017          * case is not very simple. Take into account,
1018          * that we do not support routing by source, TOS,
1019          * and MSG_DONTROUTE            --ANK (980726)
1020          *
1021          * 1. ip6_rt_check(): If route was host route,
1022          *    check that cached destination is current.
1023          *    If it is network route, we still may
1024          *    check its validity using saved pointer
1025          *    to the last used address: daddr_cache.
1026          *    We do not want to save whole address now,
1027          *    (because main consumer of this service
1028          *    is tcp, which has not this problem),
1029          *    so that the last trick works only on connected
1030          *    sockets.
1031          * 2. oif also should be the same.
1032          */
1033         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1034 #ifdef CONFIG_IPV6_SUBTREES
1035             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1036 #endif
1037            (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
1038               (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
1039                 dst_release(dst);
1040                 dst = NULL;
1041         }
1042
1043 out:
1044         return dst;
1045 }
1046
/* Common tail of the ip6_dst_lookup*() helpers.
 *
 * Resolves a route for @fl6 into *@dst.  When @fl6->saddr is unspecified
 * a source address is chosen with ip6_route_get_saddr() and written back
 * into @fl6.  Both callers visible in this file pass *@dst == NULL.
 *
 * Returns 0 with *@dst set on success.  On failure returns a negative
 * errno, releases *@dst and sets it to NULL; -ENETUNREACH additionally
 * bumps IPSTATS_MIB_OUTNOROUTES.  A v4-mapped source combined with a
 * destination that is neither v4-mapped nor unspecified is rejected with
 * -EAFNOSUPPORT.
 */
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
                               struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        struct neighbour *n;
        struct rt6_info *rt;
#endif
        int err;
        int flags = 0;

        /* The correct way to handle this would be to do
         * ip6_route_get_saddr, and then ip6_route_output; however,
         * the route-specific preferred source forces the
         * ip6_route_output call _before_ ip6_route_get_saddr.
         *
         * In source specific routing (no src=any default route),
         * ip6_route_output will fail given src=any saddr, though, so
         * that's why we try it again later.
         */
        if (ipv6_addr_any(&fl6->saddr)) {
                struct fib6_info *from;
                struct rt6_info *rt;

                *dst = ip6_route_output(net, sk, fl6);
                /* Only a successful lookup provides a route to pick from. */
                rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;

                /* rt->from is read with rcu_dereference(), hence the lock. */
                rcu_read_lock();
                from = rt ? rcu_dereference(rt->from) : NULL;
                err = ip6_route_get_saddr(net, from, &fl6->daddr,
                                          sk ? inet6_sk(sk)->srcprefs : 0,
                                          &fl6->saddr);
                rcu_read_unlock();

                if (err)
                        goto out_err_release;

                /* If we had an erroneous initial result, pretend it
                 * never existed and let the SA-enabled version take
                 * over.
                 */
                if ((*dst)->error) {
                        dst_release(*dst);
                        *dst = NULL;
                }

                /* Only affects the retry below, which runs if *dst was
                 * cleared above.
                 */
                if (fl6->flowi6_oif)
                        flags |= RT6_LOOKUP_F_IFACE;
        }

        if (!*dst)
                *dst = ip6_route_output_flags(net, sk, fl6, flags);

        err = (*dst)->error;
        if (err)
                goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        /*
         * Here if the dst entry we've looked up
         * has a neighbour entry that is in the INCOMPLETE
         * state and the src address from the flow is
         * marked as OPTIMISTIC, we release the found
         * dst entry and replace it instead with the
         * dst entry of the nexthop router
         */
        rt = (struct rt6_info *) *dst;
        rcu_read_lock_bh();
        n = __ipv6_neigh_lookup_noref(rt->dst.dev,
                                      rt6_nexthop(rt, &fl6->daddr));
        /* err is only a local marker here: it is cleared or overwritten
         * before any path that returns it.
         */
        err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
        rcu_read_unlock_bh();

        if (err) {
                struct inet6_ifaddr *ifp;
                struct flowi6 fl_gw6;
                int redirect;

                ifp = ipv6_get_ifaddr(net, &fl6->saddr,
                                      (*dst)->dev, 1);

                redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
                if (ifp)
                        in6_ifa_put(ifp);

                if (redirect) {
                        /*
                         * We need to get the dst entry for the
                         * default router instead
                         */
                        dst_release(*dst);
                        /* Same flow, but with an all-zero destination. */
                        memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
                        memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
                        *dst = ip6_route_output(net, sk, &fl_gw6);
                        err = (*dst)->error;
                        if (err)
                                goto out_err_release;
                }
        }
#endif
        if (ipv6_addr_v4mapped(&fl6->saddr) &&
            !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
                err = -EAFNOSUPPORT;
                goto out_err_release;
        }

        return 0;

out_err_release:
        dst_release(*dst);
        *dst = NULL;

        if (err == -ENETUNREACH)
                IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
        return err;
}
1162
/**
 *      ip6_dst_lookup - perform route lookup on flow
 *      @net: Network namespace to perform lookup in
 *      @sk: socket which provides route info
 *      @dst: pointer to dst_entry * for result; any value it holds on
 *            entry is overwritten with NULL before the lookup
 *      @fl6: flow to lookup
 *
 *      This function performs a route lookup on the given flow by
 *      delegating to ip6_dst_lookup_tail().
 *
 *      It returns zero on success with *@dst set, or a standard errno
 *      code on error, in which case *@dst is NULL.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
                   struct flowi6 *fl6)
{
        /* Discard whatever the caller left in *dst before the lookup. */
        *dst = NULL;
        return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1181
1182 /**
1183  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1184  *      @net: Network namespace to perform lookup in
1185  *      @sk: socket which provides route info
1186  *      @fl6: flow to lookup
1187  *      @final_dst: final destination address for ipsec lookup
1188  *
1189  *      This function performs a route lookup on the given flow.
1190  *
1191  *      It returns a valid dst pointer on success, or a pointer encoded
1192  *      error code.
1193  */
1194 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1195                                       const struct in6_addr *final_dst)
1196 {
1197         struct dst_entry *dst = NULL;
1198         int err;
1199
1200         err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1201         if (err)
1202                 return ERR_PTR(err);
1203         if (final_dst)
1204                 fl6->daddr = *final_dst;
1205
1206         return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1207 }
1208 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1209
1210 /**
1211  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1212  *      @sk: socket which provides the dst cache and route info
1213  *      @fl6: flow to lookup
1214  *      @final_dst: final destination address for ipsec lookup
1215  *      @connected: whether @sk is connected or not
1216  *
1217  *      This function performs a route lookup on the given flow with the
1218  *      possibility of using the cached route in the socket if it is valid.
1219  *      It will take the socket dst lock when operating on the dst cache.
1220  *      As a result, this function can only be used in process context.
1221  *
1222  *      In addition, for a connected socket, cache the dst in the socket
1223  *      if the current cache is not valid.
1224  *
1225  *      It returns a valid dst pointer on success, or a pointer encoded
1226  *      error code.
1227  */
1228 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1229                                          const struct in6_addr *final_dst,
1230                                          bool connected)
1231 {
1232         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1233
1234         dst = ip6_sk_dst_check(sk, dst, fl6);
1235         if (dst)
1236                 return dst;
1237
1238         dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1239         if (connected && !IS_ERR(dst))
1240                 ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1241
1242         return dst;
1243 }
1244 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1245
1246 /**
1247  *      ip6_dst_lookup_tunnel - perform route lookup on tunnel
1248  *      @skb: Packet for which lookup is done
1249  *      @dev: Tunnel device
1250  *      @net: Network namespace of tunnel device
1251  *      @sock: Socket which provides route info
1252  *      @saddr: Memory to store the src ip address
1253  *      @info: Tunnel information
1254  *      @protocol: IP protocol
1255  *      @use_cache: Flag to enable cache usage
1256  *      This function performs a route lookup on a tunnel
1257  *
1258  *      It returns a valid dst pointer and stores src address to be used in
1259  *      tunnel in param saddr on success, else a pointer encoded error code.
1260  */
1261
1262 struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
1263                                         struct net_device *dev,
1264                                         struct net *net,
1265                                         struct socket *sock,
1266                                         struct in6_addr *saddr,
1267                                         const struct ip_tunnel_info *info,
1268                                         u8 protocol,
1269                                         bool use_cache)
1270 {
1271         struct dst_entry *dst = NULL;
1272 #ifdef CONFIG_DST_CACHE
1273         struct dst_cache *dst_cache;
1274 #endif
1275         struct flowi6 fl6;
1276         __u8 prio;
1277
1278 #ifdef CONFIG_DST_CACHE
1279         dst_cache = (struct dst_cache *)&info->dst_cache;
1280         if (use_cache) {
1281                 dst = dst_cache_get_ip6(dst_cache, saddr);
1282                 if (dst)
1283                         return dst;
1284         }
1285 #endif
1286         memset(&fl6, 0, sizeof(fl6));
1287         fl6.flowi6_mark = skb->mark;
1288         fl6.flowi6_proto = protocol;
1289         fl6.daddr = info->key.u.ipv6.dst;
1290         fl6.saddr = info->key.u.ipv6.src;
1291         prio = info->key.tos;
1292         fl6.flowlabel = ip6_make_flowinfo(RT_TOS(prio),
1293                                           info->key.label);
1294
1295         dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
1296                                               NULL);
1297         if (IS_ERR(dst)) {
1298                 netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
1299                 return ERR_PTR(-ENETUNREACH);
1300         }
1301         if (dst->dev == dev) { /* is this necessary? */
1302                 netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
1303                 dst_release(dst);
1304                 return ERR_PTR(-ELOOP);
1305         }
1306 #ifdef CONFIG_DST_CACHE
1307         if (use_cache)
1308                 dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
1309 #endif
1310         *saddr = fl6.saddr;
1311         return dst;
1312 }
1313 EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
1314
1315 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1316                                                gfp_t gfp)
1317 {
1318         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1319 }
1320
1321 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1322                                                 gfp_t gfp)
1323 {
1324         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1325 }
1326
1327 static void ip6_append_data_mtu(unsigned int *mtu,
1328                                 int *maxfraglen,
1329                                 unsigned int fragheaderlen,
1330                                 struct sk_buff *skb,
1331                                 struct rt6_info *rt,
1332                                 unsigned int orig_mtu)
1333 {
1334         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1335                 if (!skb) {
1336                         /* first fragment, reserve header_len */
1337                         *mtu = orig_mtu - rt->dst.header_len;
1338
1339                 } else {
1340                         /*
1341                          * this fragment is not first, the headers
1342                          * space is regarded as data space.
1343                          */
1344                         *mtu = orig_mtu;
1345                 }
1346                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1347                               + fragheaderlen - sizeof(struct frag_hdr);
1348         }
1349 }
1350
1351 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1352                           struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1353                           struct rt6_info *rt, struct flowi6 *fl6)
1354 {
1355         struct ipv6_pinfo *np = inet6_sk(sk);
1356         unsigned int mtu;
1357         struct ipv6_txoptions *opt = ipc6->opt;
1358
1359         /*
1360          * setup for corking
1361          */
1362         if (opt) {
1363                 if (WARN_ON(v6_cork->opt))
1364                         return -EINVAL;
1365
1366                 v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1367                 if (unlikely(!v6_cork->opt))
1368                         return -ENOBUFS;
1369
1370                 v6_cork->opt->tot_len = sizeof(*opt);
1371                 v6_cork->opt->opt_flen = opt->opt_flen;
1372                 v6_cork->opt->opt_nflen = opt->opt_nflen;
1373
1374                 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1375                                                     sk->sk_allocation);
1376                 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1377                         return -ENOBUFS;
1378
1379                 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1380                                                     sk->sk_allocation);
1381                 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1382                         return -ENOBUFS;
1383
1384                 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1385                                                    sk->sk_allocation);
1386                 if (opt->hopopt && !v6_cork->opt->hopopt)
1387                         return -ENOBUFS;
1388
1389                 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1390                                                     sk->sk_allocation);
1391                 if (opt->srcrt && !v6_cork->opt->srcrt)
1392                         return -ENOBUFS;
1393
1394                 /* need source address above miyazawa*/
1395         }
1396         dst_hold(&rt->dst);
1397         cork->base.dst = &rt->dst;
1398         cork->fl.u.ip6 = *fl6;
1399         v6_cork->hop_limit = ipc6->hlimit;
1400         v6_cork->tclass = ipc6->tclass;
1401         if (rt->dst.flags & DST_XFRM_TUNNEL)
1402                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1403                       READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1404         else
1405                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1406                         READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1407         if (np->frag_size < mtu) {
1408                 if (np->frag_size)
1409                         mtu = np->frag_size;
1410         }
1411         cork->base.fragsize = mtu;
1412         cork->base.gso_size = ipc6->gso_size;
1413         cork->base.tx_flags = 0;
1414         cork->base.mark = ipc6->sockc.mark;
1415         sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1416
1417         if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1418                 cork->base.flags |= IPCORK_ALLFRAG;
1419         cork->base.length = 0;
1420
1421         cork->base.transmit_time = ipc6->sockc.transmit_time;
1422
1423         return 0;
1424 }
1425
1426 static int __ip6_append_data(struct sock *sk,
1427                              struct flowi6 *fl6,
1428                              struct sk_buff_head *queue,
1429                              struct inet_cork *cork,
1430                              struct inet6_cork *v6_cork,
1431                              struct page_frag *pfrag,
1432                              int getfrag(void *from, char *to, int offset,
1433                                          int len, int odd, struct sk_buff *skb),
1434                              void *from, int length, int transhdrlen,
1435                              unsigned int flags, struct ipcm6_cookie *ipc6)
1436 {
1437         struct sk_buff *skb, *skb_prev = NULL;
1438         unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1439         struct ubuf_info *uarg = NULL;
1440         int exthdrlen = 0;
1441         int dst_exthdrlen = 0;
1442         int hh_len;
1443         int copy;
1444         int err;
1445         int offset = 0;
1446         u32 tskey = 0;
1447         struct rt6_info *rt = (struct rt6_info *)cork->dst;
1448         struct ipv6_txoptions *opt = v6_cork->opt;
1449         int csummode = CHECKSUM_NONE;
1450         unsigned int maxnonfragsize, headersize;
1451         unsigned int wmem_alloc_delta = 0;
1452         bool paged, extra_uref = false;
1453
1454         skb = skb_peek_tail(queue);
1455         if (!skb) {
1456                 exthdrlen = opt ? opt->opt_flen : 0;
1457                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1458         }
1459
1460         paged = !!cork->gso_size;
1461         mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1462         orig_mtu = mtu;
1463
1464         if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
1465             sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1466                 tskey = atomic_inc_return(&sk->sk_tskey) - 1;
1467
1468         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1469
1470         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1471                         (opt ? opt->opt_nflen : 0);
1472
1473         headersize = sizeof(struct ipv6hdr) +
1474                      (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1475                      (dst_allfrag(&rt->dst) ?
1476                       sizeof(struct frag_hdr) : 0) +
1477                      rt->rt6i_nfheader_len;
1478
1479         if (mtu < fragheaderlen ||
1480             ((mtu - fragheaderlen) & ~7) + fragheaderlen < sizeof(struct frag_hdr))
1481                 goto emsgsize;
1482
1483         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1484                      sizeof(struct frag_hdr);
1485
1486         /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1487          * the first fragment
1488          */
1489         if (headersize + transhdrlen > mtu)
1490                 goto emsgsize;
1491
1492         if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1493             (sk->sk_protocol == IPPROTO_UDP ||
1494              sk->sk_protocol == IPPROTO_RAW)) {
1495                 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1496                                 sizeof(struct ipv6hdr));
1497                 goto emsgsize;
1498         }
1499
1500         if (ip6_sk_ignore_df(sk))
1501                 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1502         else
1503                 maxnonfragsize = mtu;
1504
1505         if (cork->length + length > maxnonfragsize - headersize) {
1506 emsgsize:
1507                 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1508                 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1509                 return -EMSGSIZE;
1510         }
1511
1512         /* CHECKSUM_PARTIAL only with no extension headers and when
1513          * we are not going to fragment
1514          */
1515         if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1516             headersize == sizeof(struct ipv6hdr) &&
1517             length <= mtu - headersize &&
1518             (!(flags & MSG_MORE) || cork->gso_size) &&
1519             rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1520                 csummode = CHECKSUM_PARTIAL;
1521
1522         if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
1523                 uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
1524                 if (!uarg)
1525                         return -ENOBUFS;
1526                 extra_uref = !skb_zcopy(skb);   /* only ref on new uarg */
1527                 if (rt->dst.dev->features & NETIF_F_SG &&
1528                     csummode == CHECKSUM_PARTIAL) {
1529                         paged = true;
1530                 } else {
1531                         uarg->zerocopy = 0;
1532                         skb_zcopy_set(skb, uarg, &extra_uref);
1533                 }
1534         }
1535
1536         /*
1537          * Let's try using as much space as possible.
1538          * Use MTU if total length of the message fits into the MTU.
1539          * Otherwise, we need to reserve fragment header and
1540          * fragment alignment (= 8-15 octects, in total).
1541          *
1542          * Note that we may need to "move" the data from the tail
1543          * of the buffer to the new fragment when we split
1544          * the message.
1545          *
1546          * FIXME: It may be fragmented into multiple chunks
1547          *        at once if non-fragmentable extension headers
1548          *        are too large.
1549          * --yoshfuji
1550          */
1551
1552         cork->length += length;
1553         if (!skb)
1554                 goto alloc_new_skb;
1555
1556         while (length > 0) {
1557                 /* Check if the remaining data fits into current packet. */
1558                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1559                 if (copy < length)
1560                         copy = maxfraglen - skb->len;
1561
1562                 if (copy <= 0) {
1563                         char *data;
1564                         unsigned int datalen;
1565                         unsigned int fraglen;
1566                         unsigned int fraggap;
1567                         unsigned int alloclen, alloc_extra;
1568                         unsigned int pagedlen;
1569 alloc_new_skb:
1570                         /* There's no room in the current skb */
1571                         if (skb)
1572                                 fraggap = skb->len - maxfraglen;
1573                         else
1574                                 fraggap = 0;
1575                         /* update mtu and maxfraglen if necessary */
1576                         if (!skb || !skb_prev)
1577                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1578                                                     fragheaderlen, skb, rt,
1579                                                     orig_mtu);
1580
1581                         skb_prev = skb;
1582
1583                         /*
1584                          * If remaining data exceeds the mtu,
1585                          * we know we need more fragment(s).
1586                          */
1587                         datalen = length + fraggap;
1588
1589                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1590                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1591                         fraglen = datalen + fragheaderlen;
1592                         pagedlen = 0;
1593
1594                         alloc_extra = hh_len;
1595                         alloc_extra += dst_exthdrlen;
1596                         alloc_extra += rt->dst.trailer_len;
1597
1598                         /* We just reserve space for fragment header.
1599                          * Note: this may be overallocation if the message
1600                          * (without MSG_MORE) fits into the MTU.
1601                          */
1602                         alloc_extra += sizeof(struct frag_hdr);
1603
1604                         if ((flags & MSG_MORE) &&
1605                             !(rt->dst.dev->features&NETIF_F_SG))
1606                                 alloclen = mtu;
1607                         else if (!paged &&
1608                                  (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1609                                   !(rt->dst.dev->features & NETIF_F_SG)))
1610                                 alloclen = fraglen;
1611                         else {
1612                                 alloclen = min_t(int, fraglen, MAX_HEADER);
1613                                 pagedlen = fraglen - alloclen;
1614                         }
1615                         alloclen += alloc_extra;
1616
1617                         if (datalen != length + fraggap) {
1618                                 /*
1619                                  * this is not the last fragment, the trailer
1620                                  * space is regarded as data space.
1621                                  */
1622                                 datalen += rt->dst.trailer_len;
1623                         }
1624
1625                         fraglen = datalen + fragheaderlen;
1626
1627                         copy = datalen - transhdrlen - fraggap - pagedlen;
1628                         if (copy < 0) {
1629                                 err = -EINVAL;
1630                                 goto error;
1631                         }
1632                         if (transhdrlen) {
1633                                 skb = sock_alloc_send_skb(sk, alloclen,
1634                                                 (flags & MSG_DONTWAIT), &err);
1635                         } else {
1636                                 skb = NULL;
1637                                 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1638                                     2 * sk->sk_sndbuf)
1639                                         skb = alloc_skb(alloclen,
1640                                                         sk->sk_allocation);
1641                                 if (unlikely(!skb))
1642                                         err = -ENOBUFS;
1643                         }
1644                         if (!skb)
1645                                 goto error;
1646                         /*
1647                          *      Fill in the control structures
1648                          */
1649                         skb->protocol = htons(ETH_P_IPV6);
1650                         skb->ip_summed = csummode;
1651                         skb->csum = 0;
1652                         /* reserve for fragmentation and ipsec header */
1653                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1654                                     dst_exthdrlen);
1655
1656                         /*
1657                          *      Find where to start putting bytes
1658                          */
1659                         data = skb_put(skb, fraglen - pagedlen);
1660                         skb_set_network_header(skb, exthdrlen);
1661                         data += fragheaderlen;
1662                         skb->transport_header = (skb->network_header +
1663                                                  fragheaderlen);
1664                         if (fraggap) {
1665                                 skb->csum = skb_copy_and_csum_bits(
1666                                         skb_prev, maxfraglen,
1667                                         data + transhdrlen, fraggap);
1668                                 skb_prev->csum = csum_sub(skb_prev->csum,
1669                                                           skb->csum);
1670                                 data += fraggap;
1671                                 pskb_trim_unique(skb_prev, maxfraglen);
1672                         }
1673                         if (copy > 0 &&
1674                             getfrag(from, data + transhdrlen, offset,
1675                                     copy, fraggap, skb) < 0) {
1676                                 err = -EFAULT;
1677                                 kfree_skb(skb);
1678                                 goto error;
1679                         }
1680
1681                         offset += copy;
1682                         length -= copy + transhdrlen;
1683                         transhdrlen = 0;
1684                         exthdrlen = 0;
1685                         dst_exthdrlen = 0;
1686
1687                         /* Only the initial fragment is time stamped */
1688                         skb_shinfo(skb)->tx_flags = cork->tx_flags;
1689                         cork->tx_flags = 0;
1690                         skb_shinfo(skb)->tskey = tskey;
1691                         tskey = 0;
1692                         skb_zcopy_set(skb, uarg, &extra_uref);
1693
1694                         if ((flags & MSG_CONFIRM) && !skb_prev)
1695                                 skb_set_dst_pending_confirm(skb, 1);
1696
1697                         /*
1698                          * Put the packet on the pending queue
1699                          */
1700                         if (!skb->destructor) {
1701                                 skb->destructor = sock_wfree;
1702                                 skb->sk = sk;
1703                                 wmem_alloc_delta += skb->truesize;
1704                         }
1705                         __skb_queue_tail(queue, skb);
1706                         continue;
1707                 }
1708
1709                 if (copy > length)
1710                         copy = length;
1711
1712                 if (!(rt->dst.dev->features&NETIF_F_SG) &&
1713                     skb_tailroom(skb) >= copy) {
1714                         unsigned int off;
1715
1716                         off = skb->len;
1717                         if (getfrag(from, skb_put(skb, copy),
1718                                                 offset, copy, off, skb) < 0) {
1719                                 __skb_trim(skb, off);
1720                                 err = -EFAULT;
1721                                 goto error;
1722                         }
1723                 } else if (!uarg || !uarg->zerocopy) {
1724                         int i = skb_shinfo(skb)->nr_frags;
1725
1726                         err = -ENOMEM;
1727                         if (!sk_page_frag_refill(sk, pfrag))
1728                                 goto error;
1729
1730                         if (!skb_can_coalesce(skb, i, pfrag->page,
1731                                               pfrag->offset)) {
1732                                 err = -EMSGSIZE;
1733                                 if (i == MAX_SKB_FRAGS)
1734                                         goto error;
1735
1736                                 __skb_fill_page_desc(skb, i, pfrag->page,
1737                                                      pfrag->offset, 0);
1738                                 skb_shinfo(skb)->nr_frags = ++i;
1739                                 get_page(pfrag->page);
1740                         }
1741                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1742                         if (getfrag(from,
1743                                     page_address(pfrag->page) + pfrag->offset,
1744                                     offset, copy, skb->len, skb) < 0)
1745                                 goto error_efault;
1746
1747                         pfrag->offset += copy;
1748                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1749                         skb->len += copy;
1750                         skb->data_len += copy;
1751                         skb->truesize += copy;
1752                         wmem_alloc_delta += copy;
1753                 } else {
1754                         err = skb_zerocopy_iter_dgram(skb, from, copy);
1755                         if (err < 0)
1756                                 goto error;
1757                 }
1758                 offset += copy;
1759                 length -= copy;
1760         }
1761
1762         if (wmem_alloc_delta)
1763                 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1764         return 0;
1765
1766 error_efault:
1767         err = -EFAULT;
1768 error:
1769         net_zcopy_put_abort(uarg, extra_uref);
1770         cork->length -= length;
1771         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1772         refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1773         return err;
1774 }
1775
1776 int ip6_append_data(struct sock *sk,
1777                     int getfrag(void *from, char *to, int offset, int len,
1778                                 int odd, struct sk_buff *skb),
1779                     void *from, int length, int transhdrlen,
1780                     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1781                     struct rt6_info *rt, unsigned int flags)
1782 {
1783         struct inet_sock *inet = inet_sk(sk);
1784         struct ipv6_pinfo *np = inet6_sk(sk);
1785         int exthdrlen;
1786         int err;
1787
1788         if (flags&MSG_PROBE)
1789                 return 0;
1790         if (skb_queue_empty(&sk->sk_write_queue)) {
1791                 /*
1792                  * setup for corking
1793                  */
1794                 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1795                                      ipc6, rt, fl6);
1796                 if (err)
1797                         return err;
1798
1799                 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1800                 length += exthdrlen;
1801                 transhdrlen += exthdrlen;
1802         } else {
1803                 fl6 = &inet->cork.fl.u.ip6;
1804                 transhdrlen = 0;
1805         }
1806
1807         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1808                                  &np->cork, sk_page_frag(sk), getfrag,
1809                                  from, length, transhdrlen, flags, ipc6);
1810 }
1811 EXPORT_SYMBOL_GPL(ip6_append_data);
1812
1813 static void ip6_cork_release(struct inet_cork_full *cork,
1814                              struct inet6_cork *v6_cork)
1815 {
1816         if (v6_cork->opt) {
1817                 kfree(v6_cork->opt->dst0opt);
1818                 kfree(v6_cork->opt->dst1opt);
1819                 kfree(v6_cork->opt->hopopt);
1820                 kfree(v6_cork->opt->srcrt);
1821                 kfree(v6_cork->opt);
1822                 v6_cork->opt = NULL;
1823         }
1824
1825         if (cork->base.dst) {
1826                 dst_release(cork->base.dst);
1827                 cork->base.dst = NULL;
1828                 cork->base.flags &= ~IPCORK_ALLFRAG;
1829         }
1830         memset(&cork->fl, 0, sizeof(cork->fl));
1831 }
1832
1833 struct sk_buff *__ip6_make_skb(struct sock *sk,
1834                                struct sk_buff_head *queue,
1835                                struct inet_cork_full *cork,
1836                                struct inet6_cork *v6_cork)
1837 {
1838         struct sk_buff *skb, *tmp_skb;
1839         struct sk_buff **tail_skb;
1840         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1841         struct ipv6_pinfo *np = inet6_sk(sk);
1842         struct net *net = sock_net(sk);
1843         struct ipv6hdr *hdr;
1844         struct ipv6_txoptions *opt = v6_cork->opt;
1845         struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1846         struct flowi6 *fl6 = &cork->fl.u.ip6;
1847         unsigned char proto = fl6->flowi6_proto;
1848
1849         skb = __skb_dequeue(queue);
1850         if (!skb)
1851                 goto out;
1852         tail_skb = &(skb_shinfo(skb)->frag_list);
1853
1854         /* move skb->data to ip header from ext header */
1855         if (skb->data < skb_network_header(skb))
1856                 __skb_pull(skb, skb_network_offset(skb));
1857         while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1858                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1859                 *tail_skb = tmp_skb;
1860                 tail_skb = &(tmp_skb->next);
1861                 skb->len += tmp_skb->len;
1862                 skb->data_len += tmp_skb->len;
1863                 skb->truesize += tmp_skb->truesize;
1864                 tmp_skb->destructor = NULL;
1865                 tmp_skb->sk = NULL;
1866         }
1867
1868         /* Allow local fragmentation. */
1869         skb->ignore_df = ip6_sk_ignore_df(sk);
1870
1871         *final_dst = fl6->daddr;
1872         __skb_pull(skb, skb_network_header_len(skb));
1873         if (opt && opt->opt_flen)
1874                 ipv6_push_frag_opts(skb, opt, &proto);
1875         if (opt && opt->opt_nflen)
1876                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1877
1878         skb_push(skb, sizeof(struct ipv6hdr));
1879         skb_reset_network_header(skb);
1880         hdr = ipv6_hdr(skb);
1881
1882         ip6_flow_hdr(hdr, v6_cork->tclass,
1883                      ip6_make_flowlabel(net, skb, fl6->flowlabel,
1884                                         ip6_autoflowlabel(net, np), fl6));
1885         hdr->hop_limit = v6_cork->hop_limit;
1886         hdr->nexthdr = proto;
1887         hdr->saddr = fl6->saddr;
1888         hdr->daddr = *final_dst;
1889
1890         skb->priority = sk->sk_priority;
1891         skb->mark = cork->base.mark;
1892
1893         skb->tstamp = cork->base.transmit_time;
1894
1895         skb_dst_set(skb, dst_clone(&rt->dst));
1896         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1897         if (proto == IPPROTO_ICMPV6) {
1898                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1899
1900                 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1901                 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1902         }
1903
1904         ip6_cork_release(cork, v6_cork);
1905 out:
1906         return skb;
1907 }
1908
1909 int ip6_send_skb(struct sk_buff *skb)
1910 {
1911         struct net *net = sock_net(skb->sk);
1912         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1913         int err;
1914
1915         err = ip6_local_out(net, skb->sk, skb);
1916         if (err) {
1917                 if (err > 0)
1918                         err = net_xmit_errno(err);
1919                 if (err)
1920                         IP6_INC_STATS(net, rt->rt6i_idev,
1921                                       IPSTATS_MIB_OUTDISCARDS);
1922         }
1923
1924         return err;
1925 }
1926
/*
 * Build the packet pending on @sk with ip6_finish_skb() and transmit it.
 *
 * Returns 0 when ip6_finish_skb() yields no skb, otherwise the result of
 * ip6_send_skb().
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *pending = ip6_finish_skb(sk);

	return pending ? ip6_send_skb(pending) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1938
1939 static void __ip6_flush_pending_frames(struct sock *sk,
1940                                        struct sk_buff_head *queue,
1941                                        struct inet_cork_full *cork,
1942                                        struct inet6_cork *v6_cork)
1943 {
1944         struct sk_buff *skb;
1945
1946         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1947                 if (skb_dst(skb))
1948                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1949                                       IPSTATS_MIB_OUTDISCARDS);
1950                 kfree_skb(skb);
1951         }
1952
1953         ip6_cork_release(cork, v6_cork);
1954 }
1955
1956 void ip6_flush_pending_frames(struct sock *sk)
1957 {
1958         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1959                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1960 }
1961 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1962
1963 struct sk_buff *ip6_make_skb(struct sock *sk,
1964                              int getfrag(void *from, char *to, int offset,
1965                                          int len, int odd, struct sk_buff *skb),
1966                              void *from, int length, int transhdrlen,
1967                              struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1968                              struct rt6_info *rt, unsigned int flags,
1969                              struct inet_cork_full *cork)
1970 {
1971         struct inet6_cork v6_cork;
1972         struct sk_buff_head queue;
1973         int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1974         int err;
1975
1976         if (flags & MSG_PROBE)
1977                 return NULL;
1978
1979         __skb_queue_head_init(&queue);
1980
1981         cork->base.flags = 0;
1982         cork->base.addr = 0;
1983         cork->base.opt = NULL;
1984         cork->base.dst = NULL;
1985         v6_cork.opt = NULL;
1986         err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
1987         if (err) {
1988                 ip6_cork_release(cork, &v6_cork);
1989                 return ERR_PTR(err);
1990         }
1991         if (ipc6->dontfrag < 0)
1992                 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1993
1994         err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
1995                                 &current->task_frag, getfrag, from,
1996                                 length + exthdrlen, transhdrlen + exthdrlen,
1997                                 flags, ipc6);
1998         if (err) {
1999                 __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
2000                 return ERR_PTR(err);
2001         }
2002
2003         return __ip6_make_skb(sk, &queue, cork, &v6_cork);
2004 }