net/ipv6/ip6_output.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  *      IPv6 output functions
   4  *      Linux INET6 implementation
   5  *
   6  *      Authors:
   7  *      Pedro Roque             <roque@di.fc.ul.pt>
   8  *
   9  *      Based on linux/net/ipv4/ip_output.c
  10  *
  11  *      Changes:
  12  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
  13  *                              extension headers are implemented.
  14  *                              route changes now work.
  15  *                              ip6_forward does not confuse sniffers.
  16  *                              etc.
  17  *
  18  *      H. von Brand    :       Added missing #include <linux/string.h>
  19  *      Imran Patel     :       frag id should be in NBO
  20  *      Kazunori MIYAZAWA @USAGI
  21  *                      :       add ip6_append_data and related functions
  22  *                              for datagram xmit
  23  */
  24
  25 #include <linux/errno.h>
  26 #include <linux/kernel.h>
  27 #include <linux/string.h>
  28 #include <linux/socket.h>
  29 #include <linux/net.h>
  30 #include <linux/netdevice.h>
  31 #include <linux/if_arp.h>
  32 #include <linux/in6.h>
  33 #include <linux/tcp.h>
  34 #include <linux/route.h>
  35 #include <linux/module.h>
  36 #include <linux/slab.h>
  37
  38 #include <linux/bpf-cgroup.h>
  39 #include <linux/netfilter.h>
  40 #include <linux/netfilter_ipv6.h>
  41
  42 #include <net/sock.h>
  43 #include <net/snmp.h>
  44
  45 #include <net/gso.h>
  46 #include <net/ipv6.h>
  47 #include <net/ndisc.h>
  48 #include <net/protocol.h>
  49 #include <net/ip6_route.h>
  50 #include <net/addrconf.h>
  51 #include <net/rawv6.h>
  52 #include <net/icmp.h>
  53 #include <net/xfrm.h>
  54 #include <net/checksum.h>
  55 #include <linux/mroute6.h>
  56 #include <net/l3mdev.h>
  57 #include <net/lwtunnel.h>
  58 #include <net/ip_tunnels.h>
  59
  /*
   * ip6_finish_output2 - resolve the next hop and hand the packet to the
   * neighbour layer.
   *
   * Expands the skb head when the device's reserved link-layer space does
   * not fit, handles the multicast special cases (local loopback copy,
   * zero hop limit, node-local scope), lets a lightweight tunnel take the
   * packet when dst->lwtstate asks for an xmit redirect, and finally looks
   * up (or creates) the neighbour entry for the next hop and calls
   * neigh_output().
   *
   * Returns the neigh_output() or lwtunnel_xmit() result, 0 when a
   * multicast packet is deliberately dropped, -ENOMEM when the head cannot
   * be expanded and -EINVAL when no neighbour entry could be obtained.
   */
  60 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
  61 {
  62         struct dst_entry *dst = skb_dst(skb);
  63         struct net_device *dev = dst->dev;
  64         struct inet6_dev *idev = ip6_dst_idev(dst);
  65         unsigned int hh_len = LL_RESERVED_SPACE(dev);
  66         const struct in6_addr *daddr, *nexthop;
  67         struct ipv6hdr *hdr;
  68         struct neighbour *neigh;
  69         int ret;
  70
  71         /* Be paranoid, rather than too clever. */
  72         if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
  73                 skb = skb_expand_head(skb, hh_len);
  74                 if (!skb) {
  75                         IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
  76                         return -ENOMEM;
  77                 }
  78         }
  79
             /* Read the header only now: skb may have been replaced above. */
  80         hdr = ipv6_hdr(skb);
  81         daddr = &hdr->daddr;
  82         if (ipv6_addr_is_multicast(daddr)) {
                     /*
                      * Loop a clone back through POST_ROUTING into
                      * dev_loopback_xmit() when the device is not a loopback
                      * device, sk_mc_loop() allows it, and either a multicast
                      * routing socket exists for a non-forwarded packet or
                      * ipv6_chk_mcast_addr() matches on this device.
                      */
  83                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
  84                     ((mroute6_is_socket(net, skb) &&
  85                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
  86                      ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
  87                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
  88
  89                         /* Do not check for IFF_ALLMULTI; multicast routing
  90                            is not supported in any case.
  91                          */
  92                         if (newskb)
  93                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
  94                                         net, sk, newskb, NULL, newskb->dev,
  95                                         dev_loopback_xmit);
  96
                             /*
                              * A zero hop limit is not sent out on the
                              * device: count a discard and drop the original
                              * after the loopback copy has been queued.
                              */
  97                         if (hdr->hop_limit == 0) {
  98                                 IP6_INC_STATS(net, idev,
  99                                               IPSTATS_MIB_OUTDISCARDS);
 100                                 kfree_skb(skb);
 101                                 return 0;
 102                         }
 103                 }
 104
 105                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
                     /*
                      * Scope at or below node-local on a non-loopback device:
                      * free the packet and report success.
                      */
 106                 if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
 107                     !(dev->flags & IFF_LOOPBACK)) {
 108                         kfree_skb(skb);
 109                         return 0;
 110                 }
 111         }
 112
             /*
              * Lightweight tunnel redirect: stop here on error or when the
              * tunnel reports LWTUNNEL_XMIT_DONE, otherwise fall through to
              * the neighbour output below.
              */
 113         if (lwtunnel_xmit_redirect(dst->lwtstate)) {
 114                 int res = lwtunnel_xmit(skb);
 115
 116                 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
 117                         return res;
 118         }
 119
             /* Neighbour lookup and output run under rcu_read_lock(). */
 120         rcu_read_lock();
 121         nexthop = rt6_nexthop((struct rt6_info *)dst, daddr);
 122         neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
 123
 124         if (unlikely(IS_ERR_OR_NULL(neigh))) {
                     /* No entry yet: try to create one in nd_tbl. */
 125                 if (unlikely(!neigh))
 126                         neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
 127                 if (IS_ERR(neigh)) {
 128                         rcu_read_unlock();
 129                         IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
 130                         kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
 131                         return -EINVAL;
 132                 }
 133         }
 134         sock_confirm_neigh(skb, neigh);
 135         ret = neigh_output(neigh, skb, false);
 136         rcu_read_unlock();
 137         return ret;
 138 }
 139
 /*
  * ip6_finish_output_gso_slowpath_drop - handle a GSO skb whose segments
  * failed skb_gso_validate_network_len() in __ip6_finish_output().
  *
  * The skb is segmented with every NETIF_F_GSO_MASK feature bit cleared
  * and each resulting segment is passed to ip6_fragment() with
  * ip6_finish_output2() as the output callback.  @mtu is not used by the
  * body.
  *
  * Returns 0, the first non-zero ip6_fragment() result, or -ENOMEM when
  * skb_gso_segment() yields NULL or an error pointer (the skb is freed in
  * that case).
  */
 140 static int
 141 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
 142                                     struct sk_buff *skb, unsigned int mtu)
 143 {
 144         struct sk_buff *segs, *nskb;
 145         netdev_features_t features;
 146         int ret = 0;
 147
 148         /* Please see corresponding comment in ip_finish_output_gso
 149          * describing the cases where GSO segment length exceeds the
 150          * egress MTU.
 151          */
 152         features = netif_skb_features(skb);
 153         segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
 154         if (IS_ERR_OR_NULL(segs)) {
 155                 kfree_skb(skb);
 156                 return -ENOMEM;
 157         }
 158
             /* Only the segment list is sent from here on. */
 159         consume_skb(skb);
 160
 161         skb_list_walk_safe(segs, segs, nskb) {
 162                 int err;
 163
                     /* Unlink the segment before sending it on its own. */
 164                 skb_mark_not_on_list(segs);
 165                 err = ip6_fragment(net, sk, segs, ip6_finish_output2);
                     /* Remember the first failure, keep sending the rest. */
 166                 if (err && ret == 0)
 167                         ret = err;
 168         }
 169
 170         return ret;
 171 }
 172
 /*
  * __ip6_finish_output - decide how the packet leaves this layer.
  *
  * In order: re-run dst_output() when the dst carries an xfrm state
  * (only with CONFIG_NETFILTER and CONFIG_XFRM), take the GSO slow path,
  * fragment via ip6_fragment(), or transmit directly through
  * ip6_finish_output2().  Returns the result of whichever path is taken.
  */
 173 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 174 {
 175         unsigned int mtu;
 176
 177 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
 178         /* Policy lookup after SNAT yielded a new policy */
 179         if (skb_dst(skb)->xfrm) {
                     /*
                      * ip6_output() passes !(flags & IP6SKB_REROUTED) as its
                      * NF_HOOK_COND condition, so the flag is set before the
                      * packet goes through dst_output() again.
                      */
 180                 IP6CB(skb)->flags |= IP6SKB_REROUTED;
 181                 return dst_output(net, sk, skb);
 182         }
 183 #endif
 184
 185         mtu = ip6_skb_dst_mtu(skb);
             /*
              * GSO skb that is not flagged IP6SKB_FAKEJUMBO and fails the
              * network-length validation against mtu: segment and fragment
              * in the slow path.
              */
 186         if (skb_is_gso(skb) &&
 187             !(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
 188             !skb_gso_validate_network_len(skb, mtu))
 189                 return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
 190
             /*
              * Fragment when a non-GSO skb exceeds mtu, when the dst asks
              * for fragmentation of everything (dst_allfrag), or when the
              * skb is longer than a recorded frag_max_size.
              */
 191         if ((skb->len > mtu && !skb_is_gso(skb)) ||
 192             dst_allfrag(skb_dst(skb)) ||
 193             (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
 194                 return ip6_fragment(net, sk, skb, ip6_finish_output2);
 195         else
 196                 return ip6_finish_output2(net, sk, skb);
 197 }
 198
 /*
  * ip6_finish_output - run the cgroup BPF egress program, then continue
  * with __ip6_finish_output().
  *
  * For NET_XMIT_SUCCESS and NET_XMIT_CN the packet is sent on; the return
  * value is __ip6_finish_output()'s result when that is non-zero, else
  * the BPF verdict.  For any other verdict the skb is freed with
  * SKB_DROP_REASON_BPF_CGROUP_EGRESS and the verdict is returned.
  */
 199 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 200 {
 201         int ret;
 202
 203         ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
 204         switch (ret) {
 205         case NET_XMIT_SUCCESS:
 206         case NET_XMIT_CN:
                     /* "a ?: b" yields a when a is non-zero, otherwise b. */
 207                 return __ip6_finish_output(net, sk, skb) ? : ret;
 208         default:
 209                 kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
 210                 return ret;
 211         }
 212 }
 213
 /*
  * ip6_output - IPv6 output entry point for an skb that already has a dst.
  *
  * Sets skb->protocol to ETH_P_IPV6 and skb->dev to the dst's device,
  * drops the packet (returning 0) when disable_ipv6 is set on the dst's
  * inet6_dev, and otherwise passes it through the NF_INET_POST_ROUTING
  * hook to ip6_finish_output().  The NF_HOOK_COND condition is false for
  * skbs already flagged IP6SKB_REROUTED.
  */
 214 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 215 {
             /* indev keeps the previous skb->dev for the hook call below. */
 216         struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
 217         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 218
 219         skb->protocol = htons(ETH_P_IPV6);
 220         skb->dev = dev;
 221
 222         if (unlikely(idev->cnf.disable_ipv6)) {
 223                 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
 224                 kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
 225                 return 0;
 226         }
 227
 228         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
 229                             net, sk, skb, indev, dev,
 230                             ip6_finish_output,
 231                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 232 }
 233 EXPORT_SYMBOL(ip6_output);
 234
 /*
  * ip6_autoflowlabel - tell whether automatic flow labels apply.
  *
  * Returns np->autoflowlabel when np->autoflowlabel_set is set, otherwise
  * the namespace default from ip6_default_np_autolabel().
  *
  * NOTE(review): np is dereferenced unconditionally; callers must pass a
  * non-NULL pointer.
  */
 235 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
 236 {
 237         if (!np->autoflowlabel_set)
 238                 return ip6_default_np_autolabel(net);
 239         else
 240                 return np->autoflowlabel;
 241 }
 242
 243 /*
 244  * xmit an sk_buff (used by TCP, SCTP and DCCP)
 245  * Note : socket lock is not held for SYNACK packets, but might be modified
 246  * by calls to skb_set_owner_w() and ipv6_local_error(),
 247  * which are using proper atomic operations or spinlocks.
      *
      * Pushes the extension headers from @opt (if any), an optional
      * hop-by-hop jumbo header and the IPv6 header in front of the skb
      * data, fills the header from @fl6, @tclass and the socket/dst hop
      * limit, and sets skb->priority and skb->mark from @priority and @mark.
      *
      * Returns the NF_INET_LOCAL_OUT hook result when the packet is sent,
      * 0 when l3mdev_ip6_out() returns NULL, -ENOBUFS when the head cannot
      * be expanded, and -EMSGSIZE (after ipv6_local_error()) when the
      * packet exceeds the dst MTU and is neither GSO nor ignore_df.
 248  */
 249 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 250              __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
 251 {
 252         struct net *net = sock_net(sk);
 253         const struct ipv6_pinfo *np = inet6_sk(sk);
 254         struct in6_addr *first_hop = &fl6->daddr;
 255         struct dst_entry *dst = skb_dst(skb);
 256         struct net_device *dev = dst->dev;
 257         struct inet6_dev *idev = ip6_dst_idev(dst);
 258         struct hop_jumbo_hdr *hop_jumbo;
 259         int hoplen = sizeof(*hop_jumbo);
 260         unsigned int head_room;
 261         struct ipv6hdr *hdr;
 262         u8  proto = fl6->flowi6_proto;
 263         int seg_len = skb->len;
 264         int hlimit = -1;
 265         u32 mtu;
 266
             /*
              * Headroom for the IPv6 header, link-layer space, the options
              * and the jumbo header; hoplen is reserved whether or not the
              * jumbo header ends up being pushed.
              */
 267         head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
 268         if (opt)
 269                 head_room += opt->opt_nflen + opt->opt_flen;
 270
 271         if (unlikely(head_room > skb_headroom(skb))) {
 272                 skb = skb_expand_head(skb, head_room);
 273                 if (!skb) {
 274                         IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
 275                         return -ENOBUFS;
 276                 }
 277         }
 278
 279         if (opt) {
 280                 seg_len += opt->opt_nflen + opt->opt_flen;
 281
                     /*
                      * The push helpers receive &proto (and &first_hop for
                      * the opt_nflen part) so they can update the next
                      * header value and the first-hop address used below.
                      */
 282                 if (opt->opt_flen)
 283                         ipv6_push_frag_opts(skb, opt, &proto);
 284
 285                 if (opt->opt_nflen)
 286                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
 287                                              &fl6->saddr);
 288         }
 289
             /*
              * seg_len above IPV6_MAXPLEN: push a hop-by-hop header carrying
              * the jumbo payload length, write 0 into payload_len below and
              * flag the skb IP6SKB_FAKEJUMBO.
              */
 290         if (unlikely(seg_len > IPV6_MAXPLEN)) {
 291                 hop_jumbo = skb_push(skb, hoplen);
 292
 293                 hop_jumbo->nexthdr = proto;
 294                 hop_jumbo->hdrlen = 0;
 295                 hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
 296                 hop_jumbo->tlv_len = 4;
 297                 hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);
 298
 299                 proto = IPPROTO_HOPOPTS;
 300                 seg_len = 0;
 301                 IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
 302         }
 303
 304         skb_push(skb, sizeof(struct ipv6hdr));
 305         skb_reset_network_header(skb);
 306         hdr = ipv6_hdr(skb);
 307
 308         /*
 309          *      Fill in the IPv6 header
 310          */
             /* A negative socket hop limit falls back to the dst's value. */
 311         if (np)
 312                 hlimit = np->hop_limit;
 313         if (hlimit < 0)
 314                 hlimit = ip6_dst_hoplimit(dst);
 315
             /*
              * NOTE(review): np is NULL-checked above but passed unchecked
              * to ip6_autoflowlabel(), which dereferences it - confirm np
              * cannot be NULL on this path.
              */
 316         ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
 317                                 ip6_autoflowlabel(net, np), fl6));
 318
 319         hdr->payload_len = htons(seg_len);
 320         hdr->nexthdr = proto;
 321         hdr->hop_limit = hlimit;
 322
 323         hdr->saddr = fl6->saddr;
 324         hdr->daddr = *first_hop;
 325
 326         skb->protocol = htons(ETH_P_IPV6);
 327         skb->priority = priority;
 328         skb->mark = mark;
 329
 330         mtu = dst_mtu(dst);
 331         if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
 332                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
 333
 334                 /* if egress device is enslaved to an L3 master device pass the
 335                  * skb to its handler for processing
 336                  */
 337                 skb = l3mdev_ip6_out((struct sock *)sk, skb);
 338                 if (unlikely(!skb))
 339                         return 0;
 340
 341                 /* hooks should never assume socket lock is held.
 342                  * we promote our socket to non const
 343                  */
 344                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
 345                                net, (struct sock *)sk, skb, NULL, dev,
 346                                dst_output);
 347         }
 348
             /* Too big to send: report EMSGSIZE to the socket and drop. */
 349         skb->dev = dev;
 350         /* ipv6_local_error() does not require socket lock,
 351          * we promote our socket to non const
 352          */
 353         ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
 354
 355         IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
 356         kfree_skb(skb);
 357         return -EMSGSIZE;
 358 }
 359 EXPORT_SYMBOL(ip6_xmit);
 360
 /*
  * ip6_call_ra_chain - deliver a packet to the sockets registered on
  * ip6_ra_chain for router-alert value @sel.
  *
  * Walks the chain under read_lock(&ip6_ra_lock).  A socket matches when
  * its selector equals @sel and it is either unbound or bound to
  * skb->dev.  Sockets with rtalert_isolate set are skipped when their
  * netns differs from that of skb->dev.  Every match but the last gets a
  * clone; the last one gets @skb itself.
  *
  * Returns 1 when @skb was handed to rawv6_rcv(), 0 when no socket
  * matched (the skb is left untouched in that case).
  */
 361 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 362 {
 363         struct ip6_ra_chain *ra;
 364         struct sock *last = NULL;
 365
 366         read_lock(&ip6_ra_lock);
 367         for (ra = ip6_ra_chain; ra; ra = ra->next) {
 368                 struct sock *sk = ra->sk;
 369                 if (sk && ra->sel == sel &&
 370                     (!sk->sk_bound_dev_if ||
 371                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
 372                         struct ipv6_pinfo *np = inet6_sk(sk);
 373
 374                         if (np && np->rtalert_isolate &&
 375                             !net_eq(sock_net(sk), dev_net(skb->dev))) {
 376                                 continue;
 377                         }
                             /*
                              * Delivery is deferred by one match so that the
                              * final matching socket can take the original
                              * skb instead of a clone.
                              */
 378                         if (last) {
 379                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 380                                 if (skb2)
 381                                         rawv6_rcv(last, skb2);
 382                         }
 383                         last = sk;
 384                 }
 385         }
 386
 387         if (last) {
 388                 rawv6_rcv(last, skb);
 389                 read_unlock(&ip6_ra_lock);
 390                 return 1;
 391         }
 392         read_unlock(&ip6_ra_lock);
 393         return 0;
 394 }
 395
 /*
  * ip6_forward_proxy_check - classify a packet addressed to a proxied
  * neighbour address.
  *
  * Returns 1 when the packet is an ICMPv6 neighbour discovery message
  * (RS, RA, NS, NA or redirect) that should go to the input path, -1
  * when the destination is link-local (after dst_link_failure()), and 0
  * otherwise, including when the extension headers cannot be skipped or
  * the first ICMPv6 byte cannot be pulled.
  */
 396 static int ip6_forward_proxy_check(struct sk_buff *skb)
 397 {
 398         struct ipv6hdr *hdr = ipv6_hdr(skb);
 399         u8 nexthdr = hdr->nexthdr;
 400         __be16 frag_off;
 401         int offset;
 402
             /* Find the offset of the upper-layer header. */
 403         if (ipv6_ext_hdr(nexthdr)) {
 404                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
 405                 if (offset < 0)
 406                         return 0;
 407         } else
 408                 offset = sizeof(struct ipv6hdr);
 409
 410         if (nexthdr == IPPROTO_ICMPV6) {
 411                 struct icmp6hdr *icmp6;
 412
                     /* Only one byte past offset is needed: the type field. */
 413                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
 414                                          offset + 1 - skb->data)))
 415                         return 0;
 416
 417                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 418
 419                 switch (icmp6->icmp6_type) {
 420                 case NDISC_ROUTER_SOLICITATION:
 421                 case NDISC_ROUTER_ADVERTISEMENT:
 422                 case NDISC_NEIGHBOUR_SOLICITATION:
 423                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
 424                 case NDISC_REDIRECT:
 425                         /* For reaction involving unicast neighbor discovery
 426                          * message destined to the proxied address, pass it to
 427                          * input function.
 428                          */
 429                         return 1;
 430                 default:
 431                         break;
 432                 }
 433         }
 434
 435         /*
 436          * The proxying router can't forward traffic sent to a link-local
 437          * address, so signal the sender and discard the packet. This
 438          * behavior is clarified by the MIPv6 specification.
 439          */
 440         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 441                 dst_link_failure(skb);
 442                 return -1;
 443         }
 444
 445         return 0;
 446 }
 447
 /*
  * ip6_forward_finish - last step of forwarding, used as the okfn of the
  * NF_INET_FORWARD hook in ip6_forward().
  *
  * Accounts the packet as forwarded on the dst's inet6_dev, then either
  * consumes it (CONFIG_NET_SWITCHDEV with offload_l3_fwd_mark set,
  * returning 0) or clears its timestamp and returns dst_output().
  */
 448 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
 449                                      struct sk_buff *skb)
 450 {
 451         struct dst_entry *dst = skb_dst(skb);
 452
 453         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 454         __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
 455
 456 #ifdef CONFIG_NET_SWITCHDEV
             /* NOTE(review): presumably already forwarded by hardware - confirm. */
 457         if (skb->offload_l3_fwd_mark) {
 458                 consume_skb(skb);
 459                 return 0;
 460         }
 461 #endif
 462
 463         skb_clear_tstamp(skb);
 464         return dst_output(net, sk, skb);
 465 }
 466
 /*
  * ip6_pkt_too_big - decide whether a packet must be rejected as larger
  * than @mtu.
  *
  * Returns false when skb->len fits.  Otherwise returns true when a
  * recorded frag_max_size exceeds @mtu; false when ignore_df is set;
  * false for a GSO skb whose segments validate against @mtu; and true in
  * every remaining case.  The frag_max_size test comes before the
  * ignore_df test, so it wins when both are set.
  */
 467 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
 468 {
 469         if (skb->len <= mtu)
 470                 return false;
 471
 472         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
 473         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
 474                 return true;
 475
 476         if (skb->ignore_df)
 477                 return false;
 478
 479         if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
 480                 return false;
 481
 482         return true;
 483 }
 484
 /*
  * ip6_forward - forward a received IPv6 packet along its dst.
  *
  * The packet is dropped unless forwarding is enabled in devconf_all, it
  * is PACKET_HOST, has no socket attached, passes skb_warn_if_lro() and
  * (unless disable_policy is set) the xfrm forward policy check.  Router
  * alert packets may be taken by ip6_call_ra_chain(); packets for proxied
  * neighbour addresses may be passed to ip6_input().  Otherwise the hop
  * limit and MTU are checked, a redirect may be sent, the hop limit is
  * decremented and the packet goes through the NF_INET_FORWARD hook to
  * ip6_forward_finish().
  *
  * Returns 0 when the router-alert chain took the packet, the
  * ip6_input() or NF_HOOK() result when those paths are taken,
  * -ETIMEDOUT for an exhausted hop limit, -EMSGSIZE for a too-big packet
  * and -EINVAL for every other drop.
  */
 485 int ip6_forward(struct sk_buff *skb)
 486 {
 487         struct dst_entry *dst = skb_dst(skb);
 488         struct ipv6hdr *hdr = ipv6_hdr(skb);
 489         struct inet6_skb_parm *opt = IP6CB(skb);
 490         struct net *net = dev_net(dst->dev);
 491         struct inet6_dev *idev;
 492         SKB_DR(reason);
 493         u32 mtu;
 494
             /* idev is the input interface (IP6CB iif); it is tested for
              * NULL before being dereferenced below.
              */
 495         idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
 496         if (net->ipv6.devconf_all->forwarding == 0)
 497                 goto error;
 498
 499         if (skb->pkt_type != PACKET_HOST)
 500                 goto drop;
 501
 502         if (unlikely(skb->sk))
 503                 goto drop;
 504
 505         if (skb_warn_if_lro(skb))
 506                 goto drop;
 507
 508         if (!net->ipv6.devconf_all->disable_policy &&
 509             (!idev || !idev->cnf.disable_policy) &&
 510             !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 511                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 512                 goto drop;
 513         }
 514
 515         skb_forward_csum(skb);
 516
 517         /*
 518          *      We DO NOT make any processing on
 519          *      RA packets, pushing them to user level AS IS
 520          *      without any WARRANTY that application will be able
 521          *      to interpret them. The reason is that we
 522          *      cannot make anything clever here.
 523          *
 524          *      We are not end-node, so that if packet contains
 525          *      AH/ESP, we cannot make anything.
 526          *      Defragmentation also would be mistake, RA packets
 527          *      cannot be fragmented, because there is no warranty
 528          *      that different fragments will go along one path. --ANK
 529          */
 530         if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
 531                 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
 532                         return 0;
 533         }
 534
 535         /*
 536          *      check and decrement ttl
 537          */
 538         if (hdr->hop_limit <= 1) {
 539                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
 540                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
 541
 542                 kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
 543                 return -ETIMEDOUT;
 544         }
 545
 546         /* XXX: idev->cnf.proxy_ndp? */
 547         if (net->ipv6.devconf_all->proxy_ndp &&
 548             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
 549                 int proxied = ip6_forward_proxy_check(skb);
 550                 if (proxied > 0) {
 551                         /* It's tempting to decrease the hop limit
 552                          * here by 1, as we do at the end of the
 553                          * function too.
 554                          *
 555                          * But that would be incorrect, as proxying is
 556                          * not forwarding.  The ip6_input function
 557                          * will handle this packet locally, and it
 558                          * depends on the hop limit being unchanged.
 559                          *
 560                          * One example is the NDP hop limit, that
 561                          * always has to stay 255, but other would be
 562                          * similar checks around RA packets, where the
 563                          * user can even change the desired limit.
 564                          */
 565                         return ip6_input(skb);
 566                 } else if (proxied < 0) {
 567                         __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 568                         goto drop;
 569                 }
 570         }
 571
 572         if (!xfrm6_route_forward(skb)) {
 573                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 574                 SKB_DR_SET(reason, XFRM_POLICY);
 575                 goto drop;
 576         }
             /* Re-read the dst after xfrm6_route_forward(). */
 577         dst = skb_dst(skb);
 578
 579         /* IPv6 specs say nothing about it, but it is clear that we cannot
 580            send redirects to source routed frames.
 581            We don't send redirects to frames decapsulated from IPsec.
 582          */
 583         if (IP6CB(skb)->iif == dst->dev->ifindex &&
 584             opt->srcrt == 0 && !skb_sec_path(skb)) {
 585                 struct in6_addr *target = NULL;
 586                 struct inet_peer *peer;
 587                 struct rt6_info *rt;
 588
 589                 /*
 590                  *      incoming and outgoing devices are the same
 591                  *      send a redirect.
 592                  */
 593
                     /* Redirect target: the gateway for a gateway route,
                      * otherwise the destination itself.
                      */
 594                 rt = (struct rt6_info *) dst;
 595                 if (rt->rt6i_flags & RTF_GATEWAY)
 596                         target = &rt->rt6i_gateway;
 597                 else
 598                         target = &hdr->daddr;
 599
 600                 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
 601
 602                 /* Limit redirects both by destination (here)
 603                    and by source (inside ndisc_send_redirect)
 604                  */
 605                 if (inet_peer_xrlim_allow(peer, 1*HZ))
 606                         ndisc_send_redirect(skb, target);
 607                 if (peer)
 608                         inet_putpeer(peer);
 609         } else {
 610                 int addrtype = ipv6_addr_type(&hdr->saddr);
 611
 612                 /* This check is security critical. */
 613                 if (addrtype == IPV6_ADDR_ANY ||
 614                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
 615                         goto error;
 616                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
 617                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
 618                                     ICMPV6_NOT_NEIGHBOUR, 0);
 619                         goto error;
 620                 }
 621         }
 622
             /* The forwarding MTU is clamped to at least IPV6_MIN_MTU. */
 623         mtu = ip6_dst_mtu_maybe_forward(dst, true);
 624         if (mtu < IPV6_MIN_MTU)
 625                 mtu = IPV6_MIN_MTU;
 626
 627         if (ip6_pkt_too_big(skb, mtu)) {
 628                 /* Again, force OUTPUT device used as source address */
 629                 skb->dev = dst->dev;
 630                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 631                 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
 632                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 633                                 IPSTATS_MIB_FRAGFAILS);
 634                 kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
 635                 return -EMSGSIZE;
 636         }
 637
 638         if (skb_cow(skb, dst->dev->hard_header_len)) {
 639                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 640                                 IPSTATS_MIB_OUTDISCARDS);
 641                 goto drop;
 642         }
 643
             /* Reload the header pointer after skb_cow(). */
 644         hdr = ipv6_hdr(skb);
 645
 646         /* Mangling hops number delayed to point after skb COW */
 647
 648         hdr->hop_limit--;
 649
 650         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
 651                        net, NULL, skb, skb->dev, dst->dev,
 652                        ip6_forward_finish);
 653
             /* "error" counts an address error and sets the drop reason,
              * then falls through to "drop", which frees the skb.
              */
 654 error:
 655         __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
 656         SKB_DR_SET(reason, IP_INADDRERRORS);
 657 drop:
 658         kfree_skb_reason(skb, reason);
 659         return -EINVAL;
 660 }
 661
 /*
  * ip6_copy_metadata - copy per-packet metadata from @from to @to.
  *
  * Copies pkt_type, priority, protocol, dev, mark, the hash, tc_index
  * (CONFIG_NET_SCHED only), netfilter state, skb extensions and the
  * secmark.  Any dst already attached to @to is dropped and replaced by
  * a clone of @from's dst.
  */
 662 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 663 {
 664         to->pkt_type = from->pkt_type;
 665         to->priority = from->priority;
 666         to->protocol = from->protocol;
 667         skb_dst_drop(to);
 668         skb_dst_set(to, dst_clone(skb_dst(from)));
 669         to->dev = from->dev;
 670         to->mark = from->mark;
 671
 672         skb_copy_hash(to, from);
 673
 674 #ifdef CONFIG_NET_SCHED
 675         to->tc_index = from->tc_index;
 676 #endif
 677         nf_copy(to, from);
 678         skb_ext_copy(to, from);
 679         skb_copy_secmark(to, from);
 680 }
 681
 /*
  * ip6_fraglist_init - turn the head skb of a frag_list into the first
  * fragment and set up @iter for the remaining ones.
  *
  * @hlen:    number of header bytes repeated in front of every fragment
  * @prevhdr: next-header byte to overwrite with NEXTHDR_FRAGMENT
  * @nexthdr: value stored in the fragment header's nexthdr field
  * @frag_id: identification stored in every fragment header
  *
  * A copy of the first @hlen bytes of the network header is kept in
  * iter->tmp_hdr (kmemdup with GFP_ATOMIC; it is not freed here).  The
  * frag_list is detached into iter->frag, a fragment header with offset
  * 0 and IP6_MF set is inserted after the @hlen header bytes, and
  * skb->len, skb->data_len and payload_len are recomputed from
  * skb_pagelen().
  *
  * Returns 0 on success or -ENOMEM when the header copy cannot be
  * allocated.  *prevhdr has already been overwritten in that case.
  */
 682 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
 683                       u8 nexthdr, __be32 frag_id,
 684                       struct ip6_fraglist_iter *iter)
 685 {
 686         unsigned int first_len;
 687         struct frag_hdr *fh;
 688
 689         /* BUILD HEADER */
 690         *prevhdr = NEXTHDR_FRAGMENT;
 691         iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 692         if (!iter->tmp_hdr)
 693                 return -ENOMEM;
 694
 695         iter->frag = skb_shinfo(skb)->frag_list;
 696         skb_frag_list_init(skb);
 697
 698         iter->offset = 0;
 699         iter->hlen = hlen;
 700         iter->frag_id = frag_id;
 701         iter->nexthdr = nexthdr;
 702
             /*
              * Pull hlen, push the fragment header, push hlen again: the
              * data start ends up sizeof(struct frag_hdr) earlier, with fh
              * located right after the hlen header bytes, which are then
              * rewritten from the saved copy.
              */
 703         __skb_pull(skb, hlen);
 704         fh = __skb_push(skb, sizeof(struct frag_hdr));
 705         __skb_push(skb, hlen);
 706         skb_reset_network_header(skb);
 707         memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
 708
 709         fh->nexthdr = nexthdr;
 710         fh->reserved = 0;
 711         fh->frag_off = htons(IP6_MF);
 712         fh->identification = frag_id;
 713
 714         first_len = skb_pagelen(skb);
 715         skb->data_len = first_len - skb_headlen(skb);
 716         skb->len = first_len;
 717         ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
 718
 719         return 0;
 720 }
 721 EXPORT_SYMBOL(ip6_fraglist_init);
 722
 /*
  * ip6_fraglist_prepare - build the headers of the next frag_list member.
  *
  * Operates on iter->frag.  @skb is the fragment that precedes it: its
  * payload length (skb->len minus iter->hlen and the fragment header)
  * advances iter->offset, and its metadata is copied to the new fragment.
  * The saved header from iter->tmp_hdr and a fragment header are pushed
  * in front of the fragment's data; IP6_MF is set when frag->next is
  * non-NULL.  The fragment's ip_summed is reset to CHECKSUM_NONE.
  */
 723 void ip6_fraglist_prepare(struct sk_buff *skb,
 724                           struct ip6_fraglist_iter *iter)
 725 {
 726         struct sk_buff *frag = iter->frag;
 727         unsigned int hlen = iter->hlen;
 728         struct frag_hdr *fh;
 729
 730         frag->ip_summed = CHECKSUM_NONE;
             /* The transport header is set before any header is pushed. */
 731         skb_reset_transport_header(frag);
 732         fh = __skb_push(frag, sizeof(struct frag_hdr));
 733         __skb_push(frag, hlen);
 734         skb_reset_network_header(frag);
 735         memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
 736         iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
 737         fh->nexthdr = iter->nexthdr;
 738         fh->reserved = 0;
 739         fh->frag_off = htons(iter->offset);
 740         if (frag->next)
 741                 fh->frag_off |= htons(IP6_MF);
 742         fh->identification = iter->frag_id;
 743         ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
 744         ip6_copy_metadata(frag, skb);
 745 }
 746 EXPORT_SYMBOL(ip6_fraglist_prepare);
 747
 /*
  * ip6_frag_init - initialise @state for fragmenting @skb with
  * ip6_frag_next().
  *
  * Records the header length, MTU, head/tail room, fragment id and
  * next-header information, and sets the cursor: state->left is the
  * number of bytes after the first @hlen bytes, state->ptr starts at
  * @hlen and state->offset at 0.  @skb itself is only read.
  */
 748 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
 749                    unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
 750                    u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
 751 {
 752         state->prevhdr = prevhdr;
 753         state->nexthdr = nexthdr;
 754         state->frag_id = frag_id;
 755
 756         state->hlen = hlen;
 757         state->mtu = mtu;
 758
 759         state->left = skb->len - hlen;  /* Space per frame */
 760         state->ptr = hlen;              /* Where to start from */
 761
 762         state->hroom = hdr_room;
 763         state->troom = needed_tailroom;
 764
 765         state->offset = 0;
 766 }
 767 EXPORT_SYMBOL(ip6_frag_init);
 768
 /*
  * ip6_frag_next - allocate and fill the next fragment of @skb.
  *
  * Takes up to state->mtu bytes of the remaining data (rounded down to a
  * multiple of 8 unless this is the final piece), allocates a new skb
  * with GFP_ATOMIC, copies the first state->hlen header bytes and the
  * data block from @skb, writes the fragment header and advances
  * state->left, state->ptr and state->offset.
  *
  * Returns the new fragment, or ERR_PTR(-ENOMEM) when allocation fails
  * (state is left unchanged in that case).
  */
 769 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
 770 {
 771         u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
 772         struct sk_buff *frag;
 773         struct frag_hdr *fh;
 774         unsigned int len;
 775
 776         len = state->left;
 777         /* IF: it doesn't fit, use 'mtu' - the data space left */
 778         if (len > state->mtu)
 779                 len = state->mtu;
 780         /* IF: we are not sending up to and including the packet end
 781            then align the next start on an eight byte boundary */
 782         if (len < state->left)
 783                 len &= ~7;
 784
 785         /* Allocate buffer */
 786         frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
 787                          state->hroom + state->troom, GFP_ATOMIC);
 788         if (!frag)
 789                 return ERR_PTR(-ENOMEM);
 790
 791         /*
 792          *      Set up data on packet
 793          */
 794
 795         ip6_copy_metadata(frag, skb);
 796         skb_reserve(frag, state->hroom);
 797         skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
 798         skb_reset_network_header(frag);
             /* Layout: hlen header bytes, fragment header, then the data. */
 799         fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
 800         frag->transport_header = (frag->network_header + state->hlen +
 801                                   sizeof(struct frag_hdr));
 802
 803         /*
 804          *      Charge the memory for the fragment to any owner
 805          *      it might possess
 806          */
 807         if (skb->sk)
 808                 skb_set_owner_w(frag, skb->sk);
 809
 810         /*
 811          *      Copy the packet header into the new buffer.
 812          */
 813         skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
 814
             /*
              * prevhdr points into the original skb's header; apply the same
              * offset inside the copy and mark the fragment header as next.
              */
 815         fragnexthdr_offset = skb_network_header(frag);
 816         fragnexthdr_offset += prevhdr - skb_network_header(skb);
 817         *fragnexthdr_offset = NEXTHDR_FRAGMENT;
 818
 819         /*
 820          *      Build fragment header.
 821          */
 822         fh->nexthdr = state->nexthdr;
 823         fh->reserved = 0;
 824         fh->identification = state->frag_id;
 825
 826         /*
 827          *      Copy a block of the IP datagram.
 828          */
 829         BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
 830                              len));
 831         state->left -= len;
 832
             /* More-fragments is set while data remains after this piece. */
 833         fh->frag_off = htons(state->offset);
 834         if (state->left > 0)
 835                 fh->frag_off |= htons(IP6_MF);
 836         ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
 837
 838         state->ptr += len;
 839         state->offset += len;
 840
 841         return frag;
 842 }
 843 EXPORT_SYMBOL(ip6_frag_next);
 844
 845 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 846                  int (*output)(struct net *, struct sock *, struct sk_buff *))
 847 {
 848         struct sk_buff *frag;
 849         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
 850         struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
 851                                 inet6_sk(skb->sk) : NULL;
 852         bool mono_delivery_time = skb->mono_delivery_time;
 853         struct ip6_frag_state state;
 854         unsigned int mtu, hlen, nexthdr_offset;
 855         ktime_t tstamp = skb->tstamp;
 856         int hroom, err = 0;
 857         __be32 frag_id;
 858         u8 *prevhdr, nexthdr = 0;
 859
 860         err = ip6_find_1stfragopt(skb, &prevhdr);
 861         if (err < 0)
 862                 goto fail;
 863         hlen = err;
 864         nexthdr = *prevhdr;
 865         nexthdr_offset = prevhdr - skb_network_header(skb);
 866
 867         mtu = ip6_skb_dst_mtu(skb);
 868
 869         /* We must not fragment if the socket is set to force MTU discovery
 870          * or if the skb it not generated by a local socket.
 871          */
 872         if (unlikely(!skb->ignore_df && skb->len > mtu))
 873                 goto fail_toobig;
 874
 875         if (IP6CB(skb)->frag_max_size) {
 876                 if (IP6CB(skb)->frag_max_size > mtu)
 877                         goto fail_toobig;
 878
 879                 /* don't send fragments larger than what we received */
 880                 mtu = IP6CB(skb)->frag_max_size;
 881                 if (mtu < IPV6_MIN_MTU)
 882                         mtu = IPV6_MIN_MTU;
 883         }
 884
 885         if (np && np->frag_size < mtu) {
 886                 if (np->frag_size)
 887                         mtu = np->frag_size;
 888         }
 889         if (mtu < hlen + sizeof(struct frag_hdr) + 8)
 890                 goto fail_toobig;
 891         mtu -= hlen + sizeof(struct frag_hdr);
 892
 893         frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
 894                                     &ipv6_hdr(skb)->saddr);
 895
 896         if (skb->ip_summed == CHECKSUM_PARTIAL &&
 897             (err = skb_checksum_help(skb)))
 898                 goto fail;
 899
 900         prevhdr = skb_network_header(skb) + nexthdr_offset;
 901         hroom = LL_RESERVED_SPACE(rt->dst.dev);
 902         if (skb_has_frag_list(skb)) {
 903                 unsigned int first_len = skb_pagelen(skb);
 904                 struct ip6_fraglist_iter iter;
 905                 struct sk_buff *frag2;
 906
 907                 if (first_len - hlen > mtu ||
 908                     ((first_len - hlen) & 7) ||
 909                     skb_cloned(skb) ||
 910                     skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
 911                         goto slow_path;
 912
 913                 skb_walk_frags(skb, frag) {
 914                         /* Correct geometry. */
 915                         if (frag->len > mtu ||
 916                             ((frag->len & 7) && frag->next) ||
 917                             skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
 918                                 goto slow_path_clean;
 919
 920                         /* Partially cloned skb? */
 921                         if (skb_shared(frag))
 922                                 goto slow_path_clean;
 923
 924                         BUG_ON(frag->sk);
 925                         if (skb->sk) {
 926                                 frag->sk = skb->sk;
 927                                 frag->destructor = sock_wfree;
 928                         }
 929                         skb->truesize -= frag->truesize;
 930                 }
 931
 932                 err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
 933                                         &iter);
 934                 if (err < 0)
 935                         goto fail;
 936
 937                 /* We prevent @rt from being freed. */
 938                 rcu_read_lock();
 939
 940                 for (;;) {
 941                         /* Prepare header of the next frame,
 942                          * before previous one went down. */
 943                         if (iter.frag)
 944                                 ip6_fraglist_prepare(skb, &iter);
 945
 946                         skb_set_delivery_time(skb, tstamp, mono_delivery_time);
 947                         err = output(net, sk, skb);
 948                         if (!err)
 949                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 950                                               IPSTATS_MIB_FRAGCREATES);
 951
 952                         if (err || !iter.frag)
 953                                 break;
 954
 955                         skb = ip6_fraglist_next(&iter);
 956                 }
 957
 958                 kfree(iter.tmp_hdr);
 959
 960                 if (err == 0) {
 961                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 962                                       IPSTATS_MIB_FRAGOKS);
 963                         rcu_read_unlock();
 964                         return 0;
 965                 }
 966
 967                 kfree_skb_list(iter.frag);
 968
 969                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 970                               IPSTATS_MIB_FRAGFAILS);
 971                 rcu_read_unlock();
 972                 return err;
 973
 974 slow_path_clean:
 975                 skb_walk_frags(skb, frag2) {
 976                         if (frag2 == frag)
 977                                 break;
 978                         frag2->sk = NULL;
 979                         frag2->destructor = NULL;
 980                         skb->truesize += frag2->truesize;
 981                 }
 982         }
 983
 984 slow_path:
 985         /*
 986          *      Fragment the datagram.
 987          */
 988
 989         ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
 990                       LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
 991                       &state);
 992
 993         /*
 994          *      Keep copying data until we run out.
 995          */
 996
 997         while (state.left > 0) {
 998                 frag = ip6_frag_next(skb, &state);
 999                 if (IS_ERR(frag)) {
1000                         err = PTR_ERR(frag);
1001                         goto fail;
1002                 }
1003
1004                 /*
1005                  *      Put this fragment into the sending queue.
1006                  */
1007                 skb_set_delivery_time(frag, tstamp, mono_delivery_time);
1008                 err = output(net, sk, frag);
1009                 if (err)
1010                         goto fail;
1011
1012                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1013                               IPSTATS_MIB_FRAGCREATES);
1014         }
1015         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1016                       IPSTATS_MIB_FRAGOKS);
1017         consume_skb(skb);
1018         return err;
1019
1020 fail_toobig:
1021         if (skb->sk && dst_allfrag(skb_dst(skb)))
1022                 sk_gso_disable(skb->sk);
1023
1024         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1025         err = -EMSGSIZE;
1026
1027 fail:
1028         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1029                       IPSTATS_MIB_FRAGFAILS);
1030         kfree_skb(skb);
1031         return err;
1032 }
1033
1034 static inline int ip6_rt_check(const struct rt6key *rt_key,
1035                                const struct in6_addr *fl_addr,
1036                                const struct in6_addr *addr_cache)
1037 {
1038         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1039                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1040 }
1041
1042 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1043                                           struct dst_entry *dst,
1044                                           const struct flowi6 *fl6)
1045 {
1046         struct ipv6_pinfo *np = inet6_sk(sk);
1047         struct rt6_info *rt;
1048
1049         if (!dst)
1050                 goto out;
1051
1052         if (dst->ops->family != AF_INET6) {
1053                 dst_release(dst);
1054                 return NULL;
1055         }
1056
1057         rt = (struct rt6_info *)dst;
1058         /* Yes, checking route validity in not connected
1059          * case is not very simple. Take into account,
1060          * that we do not support routing by source, TOS,
1061          * and MSG_DONTROUTE            --ANK (980726)
1062          *
1063          * 1. ip6_rt_check(): If route was host route,
1064          *    check that cached destination is current.
1065          *    If it is network route, we still may
1066          *    check its validity using saved pointer
1067          *    to the last used address: daddr_cache.
1068          *    We do not want to save whole address now,
1069          *    (because main consumer of this service
1070          *    is tcp, which has not this problem),
1071          *    so that the last trick works only on connected
1072          *    sockets.
1073          * 2. oif also should be the same.
1074          */
1075         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1076 #ifdef CONFIG_IPV6_SUBTREES
1077             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1078 #endif
1079            (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
1080                 dst_release(dst);
1081                 dst = NULL;
1082         }
1083
1084 out:
1085         return dst;
1086 }
1087
1088 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1089                                struct dst_entry **dst, struct flowi6 *fl6)
1090 {
1091 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1092         struct neighbour *n;
1093         struct rt6_info *rt;
1094 #endif
1095         int err;
1096         int flags = 0;
1097
1098         /* The correct way to handle this would be to do
1099          * ip6_route_get_saddr, and then ip6_route_output; however,
1100          * the route-specific preferred source forces the
1101          * ip6_route_output call _before_ ip6_route_get_saddr.
1102          *
1103          * In source specific routing (no src=any default route),
1104          * ip6_route_output will fail given src=any saddr, though, so
1105          * that's why we try it again later.
1106          */
1107         if (ipv6_addr_any(&fl6->saddr)) {
1108                 struct fib6_info *from;
1109                 struct rt6_info *rt;
1110
1111                 *dst = ip6_route_output(net, sk, fl6);
1112                 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
1113
1114                 rcu_read_lock();
1115                 from = rt ? rcu_dereference(rt->from) : NULL;
1116                 err = ip6_route_get_saddr(net, from, &fl6->daddr,
1117                                           sk ? inet6_sk(sk)->srcprefs : 0,
1118                                           &fl6->saddr);
1119                 rcu_read_unlock();
1120
1121                 if (err)
1122                         goto out_err_release;
1123
1124                 /* If we had an erroneous initial result, pretend it
1125                  * never existed and let the SA-enabled version take
1126                  * over.
1127                  */
1128                 if ((*dst)->error) {
1129                         dst_release(*dst);
1130                         *dst = NULL;
1131                 }
1132
1133                 if (fl6->flowi6_oif)
1134                         flags |= RT6_LOOKUP_F_IFACE;
1135         }
1136
1137         if (!*dst)
1138                 *dst = ip6_route_output_flags(net, sk, fl6, flags);
1139
1140         err = (*dst)->error;
1141         if (err)
1142                 goto out_err_release;
1143
1144 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1145         /*
1146          * Here if the dst entry we've looked up
1147          * has a neighbour entry that is in the INCOMPLETE
1148          * state and the src address from the flow is
1149          * marked as OPTIMISTIC, we release the found
1150          * dst entry and replace it instead with the
1151          * dst entry of the nexthop router
1152          */
1153         rt = (struct rt6_info *) *dst;
1154         rcu_read_lock();
1155         n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1156                                       rt6_nexthop(rt, &fl6->daddr));
1157         err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
1158         rcu_read_unlock();
1159
1160         if (err) {
1161                 struct inet6_ifaddr *ifp;
1162                 struct flowi6 fl_gw6;
1163                 int redirect;
1164
1165                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1166                                       (*dst)->dev, 1);
1167
1168                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1169                 if (ifp)
1170                         in6_ifa_put(ifp);
1171
1172                 if (redirect) {
1173                         /*
1174                          * We need to get the dst entry for the
1175                          * default router instead
1176                          */
1177                         dst_release(*dst);
1178                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1179                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1180                         *dst = ip6_route_output(net, sk, &fl_gw6);
1181                         err = (*dst)->error;
1182                         if (err)
1183                                 goto out_err_release;
1184                 }
1185         }
1186 #endif
1187         if (ipv6_addr_v4mapped(&fl6->saddr) &&
1188             !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1189                 err = -EAFNOSUPPORT;
1190                 goto out_err_release;
1191         }
1192
1193         return 0;
1194
1195 out_err_release:
1196         dst_release(*dst);
1197         *dst = NULL;
1198
1199         if (err == -ENETUNREACH)
1200                 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1201         return err;
1202 }
1203
1204 /**
1205  *      ip6_dst_lookup - perform route lookup on flow
1206  *      @net: Network namespace to perform lookup in
1207  *      @sk: socket which provides route info
1208  *      @dst: pointer to dst_entry * for result
1209  *      @fl6: flow to lookup
1210  *
1211  *      This function performs a route lookup on the given flow.
1212  *
1213  *      It returns zero on success, or a standard errno code on error.
1214  */
1215 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1216                    struct flowi6 *fl6)
1217 {
1218         *dst = NULL;
1219         return ip6_dst_lookup_tail(net, sk, dst, fl6);
1220 }
1221 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1222
1223 /**
1224  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1225  *      @net: Network namespace to perform lookup in
1226  *      @sk: socket which provides route info
1227  *      @fl6: flow to lookup
1228  *      @final_dst: final destination address for ipsec lookup
1229  *
1230  *      This function performs a route lookup on the given flow.
1231  *
1232  *      It returns a valid dst pointer on success, or a pointer encoded
1233  *      error code.
1234  */
1235 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1236                                       const struct in6_addr *final_dst)
1237 {
1238         struct dst_entry *dst = NULL;
1239         int err;
1240
1241         err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1242         if (err)
1243                 return ERR_PTR(err);
1244         if (final_dst)
1245                 fl6->daddr = *final_dst;
1246
1247         return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1248 }
1249 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1250
1251 /**
1252  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1253  *      @sk: socket which provides the dst cache and route info
1254  *      @fl6: flow to lookup
1255  *      @final_dst: final destination address for ipsec lookup
1256  *      @connected: whether @sk is connected or not
1257  *
1258  *      This function performs a route lookup on the given flow with the
1259  *      possibility of using the cached route in the socket if it is valid.
1260  *      It will take the socket dst lock when operating on the dst cache.
1261  *      As a result, this function can only be used in process context.
1262  *
1263  *      In addition, for a connected socket, cache the dst in the socket
1264  *      if the current cache is not valid.
1265  *
1266  *      It returns a valid dst pointer on success, or a pointer encoded
1267  *      error code.
1268  */
1269 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1270                                          const struct in6_addr *final_dst,
1271                                          bool connected)
1272 {
1273         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1274
1275         dst = ip6_sk_dst_check(sk, dst, fl6);
1276         if (dst)
1277                 return dst;
1278
1279         dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1280         if (connected && !IS_ERR(dst))
1281                 ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1282
1283         return dst;
1284 }
1285 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1286
1287 /**
1288  *      ip6_dst_lookup_tunnel - perform route lookup on tunnel
1289  *      @skb: Packet for which lookup is done
1290  *      @dev: Tunnel device
1291  *      @net: Network namespace of tunnel device
1292  *      @sock: Socket which provides route info
1293  *      @saddr: Memory to store the src ip address
1294  *      @info: Tunnel information
1295  *      @protocol: IP protocol
1296  *      @use_cache: Flag to enable cache usage
1297  *      This function performs a route lookup on a tunnel
1298  *
1299  *      It returns a valid dst pointer and stores src address to be used in
1300  *      tunnel in param saddr on success, else a pointer encoded error code.
1301  */
1302
1303 struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
1304                                         struct net_device *dev,
1305                                         struct net *net,
1306                                         struct socket *sock,
1307                                         struct in6_addr *saddr,
1308                                         const struct ip_tunnel_info *info,
1309                                         u8 protocol,
1310                                         bool use_cache)
1311 {
1312         struct dst_entry *dst = NULL;
1313 #ifdef CONFIG_DST_CACHE
1314         struct dst_cache *dst_cache;
1315 #endif
1316         struct flowi6 fl6;
1317         __u8 prio;
1318
1319 #ifdef CONFIG_DST_CACHE
1320         dst_cache = (struct dst_cache *)&info->dst_cache;
1321         if (use_cache) {
1322                 dst = dst_cache_get_ip6(dst_cache, saddr);
1323                 if (dst)
1324                         return dst;
1325         }
1326 #endif
1327         memset(&fl6, 0, sizeof(fl6));
1328         fl6.flowi6_mark = skb->mark;
1329         fl6.flowi6_proto = protocol;
1330         fl6.daddr = info->key.u.ipv6.dst;
1331         fl6.saddr = info->key.u.ipv6.src;
1332         prio = info->key.tos;
1333         fl6.flowlabel = ip6_make_flowinfo(prio, info->key.label);
1334
1335         dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
1336                                               NULL);
1337         if (IS_ERR(dst)) {
1338                 netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
1339                 return ERR_PTR(-ENETUNREACH);
1340         }
1341         if (dst->dev == dev) { /* is this necessary? */
1342                 netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
1343                 dst_release(dst);
1344                 return ERR_PTR(-ELOOP);
1345         }
1346 #ifdef CONFIG_DST_CACHE
1347         if (use_cache)
1348                 dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
1349 #endif
1350         *saddr = fl6.saddr;
1351         return dst;
1352 }
1353 EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
1354
1355 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1356                                                gfp_t gfp)
1357 {
1358         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1359 }
1360
1361 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1362                                                 gfp_t gfp)
1363 {
1364         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1365 }
1366
1367 static void ip6_append_data_mtu(unsigned int *mtu,
1368                                 int *maxfraglen,
1369                                 unsigned int fragheaderlen,
1370                                 struct sk_buff *skb,
1371                                 struct rt6_info *rt,
1372                                 unsigned int orig_mtu)
1373 {
1374         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1375                 if (!skb) {
1376                         /* first fragment, reserve header_len */
1377                         *mtu = orig_mtu - rt->dst.header_len;
1378
1379                 } else {
1380                         /*
1381                          * this fragment is not first, the headers
1382                          * space is regarded as data space.
1383                          */
1384                         *mtu = orig_mtu;
1385                 }
1386                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1387                               + fragheaderlen - sizeof(struct frag_hdr);
1388         }
1389 }
1390
1391 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1392                           struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1393                           struct rt6_info *rt)
1394 {
1395         struct ipv6_pinfo *np = inet6_sk(sk);
1396         unsigned int mtu;
1397         struct ipv6_txoptions *nopt, *opt = ipc6->opt;
1398
1399         /* callers pass dst together with a reference, set it first so
1400          * ip6_cork_release() can put it down even in case of an error.
1401          */
1402         cork->base.dst = &rt->dst;
1403
1404         /*
1405          * setup for corking
1406          */
1407         if (opt) {
1408                 if (WARN_ON(v6_cork->opt))
1409                         return -EINVAL;
1410
1411                 nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1412                 if (unlikely(!nopt))
1413                         return -ENOBUFS;
1414
1415                 nopt->tot_len = sizeof(*opt);
1416                 nopt->opt_flen = opt->opt_flen;
1417                 nopt->opt_nflen = opt->opt_nflen;
1418
1419                 nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
1420                 if (opt->dst0opt && !nopt->dst0opt)
1421                         return -ENOBUFS;
1422
1423                 nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
1424                 if (opt->dst1opt && !nopt->dst1opt)
1425                         return -ENOBUFS;
1426
1427                 nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
1428                 if (opt->hopopt && !nopt->hopopt)
1429                         return -ENOBUFS;
1430
1431                 nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
1432                 if (opt->srcrt && !nopt->srcrt)
1433                         return -ENOBUFS;
1434
1435                 /* need source address above miyazawa*/
1436         }
1437         v6_cork->hop_limit = ipc6->hlimit;
1438         v6_cork->tclass = ipc6->tclass;
1439         if (rt->dst.flags & DST_XFRM_TUNNEL)
1440                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1441                       READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1442         else
1443                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1444                         READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1445         if (np->frag_size < mtu) {
1446                 if (np->frag_size)
1447                         mtu = np->frag_size;
1448         }
1449         cork->base.fragsize = mtu;
1450         cork->base.gso_size = ipc6->gso_size;
1451         cork->base.tx_flags = 0;
1452         cork->base.mark = ipc6->sockc.mark;
1453         sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1454
1455         if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1456                 cork->base.flags |= IPCORK_ALLFRAG;
1457         cork->base.length = 0;
1458
1459         cork->base.transmit_time = ipc6->sockc.transmit_time;
1460
1461         return 0;
1462 }
1463
1464 static int __ip6_append_data(struct sock *sk,
1465                              struct sk_buff_head *queue,
1466                              struct inet_cork_full *cork_full,
1467                              struct inet6_cork *v6_cork,
1468                              struct page_frag *pfrag,
1469                              int getfrag(void *from, char *to, int offset,
1470                                          int len, int odd, struct sk_buff *skb),
1471                              void *from, size_t length, int transhdrlen,
1472                              unsigned int flags, struct ipcm6_cookie *ipc6)
1473 {
1474         struct sk_buff *skb, *skb_prev = NULL;
1475         struct inet_cork *cork = &cork_full->base;
1476         struct flowi6 *fl6 = &cork_full->fl.u.ip6;
1477         unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1478         struct ubuf_info *uarg = NULL;
1479         int exthdrlen = 0;
1480         int dst_exthdrlen = 0;
1481         int hh_len;
1482         int copy;
1483         int err;
1484         int offset = 0;
1485         bool zc = false;
1486         u32 tskey = 0;
1487         struct rt6_info *rt = (struct rt6_info *)cork->dst;
1488         struct ipv6_txoptions *opt = v6_cork->opt;
1489         int csummode = CHECKSUM_NONE;
1490         unsigned int maxnonfragsize, headersize;
1491         unsigned int wmem_alloc_delta = 0;
1492         bool paged, extra_uref = false;
1493
1494         skb = skb_peek_tail(queue);
1495         if (!skb) {
1496                 exthdrlen = opt ? opt->opt_flen : 0;
1497                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1498         }
1499
1500         paged = !!cork->gso_size;
1501         mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1502         orig_mtu = mtu;
1503
1504         if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
1505             sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1506                 tskey = atomic_inc_return(&sk->sk_tskey) - 1;
1507
1508         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1509
1510         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1511                         (opt ? opt->opt_nflen : 0);
1512
1513         headersize = sizeof(struct ipv6hdr) +
1514                      (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1515                      (dst_allfrag(&rt->dst) ?
1516                       sizeof(struct frag_hdr) : 0) +
1517                      rt->rt6i_nfheader_len;
1518
1519         if (mtu <= fragheaderlen ||
1520             ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
1521                 goto emsgsize;
1522
1523         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1524                      sizeof(struct frag_hdr);
1525
1526         /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1527          * the first fragment
1528          */
1529         if (headersize + transhdrlen > mtu)
1530                 goto emsgsize;
1531
1532         if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1533             (sk->sk_protocol == IPPROTO_UDP ||
1534              sk->sk_protocol == IPPROTO_ICMPV6 ||
1535              sk->sk_protocol == IPPROTO_RAW)) {
1536                 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1537                                 sizeof(struct ipv6hdr));
1538                 goto emsgsize;
1539         }
1540
1541         if (ip6_sk_ignore_df(sk))
1542                 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1543         else
1544                 maxnonfragsize = mtu;
1545
1546         if (cork->length + length > maxnonfragsize - headersize) {
1547 emsgsize:
1548                 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1549                 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1550                 return -EMSGSIZE;
1551         }
1552
1553         /* CHECKSUM_PARTIAL only with no extension headers and when
1554          * we are not going to fragment
1555          */
1556         if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1557             headersize == sizeof(struct ipv6hdr) &&
1558             length <= mtu - headersize &&
1559             (!(flags & MSG_MORE) || cork->gso_size) &&
1560             rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1561                 csummode = CHECKSUM_PARTIAL;
1562
1563         if ((flags & MSG_ZEROCOPY) && length) {
1564                 struct msghdr *msg = from;
1565
1566                 if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
1567                         if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
1568                                 return -EINVAL;
1569
1570                         /* Leave uarg NULL if can't zerocopy, callers should
1571                          * be able to handle it.
1572                          */
1573                         if ((rt->dst.dev->features & NETIF_F_SG) &&
1574                             csummode == CHECKSUM_PARTIAL) {
1575                                 paged = true;
1576                                 zc = true;
1577                                 uarg = msg->msg_ubuf;
1578                         }
1579                 } else if (sock_flag(sk, SOCK_ZEROCOPY)) {
1580                         uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
1581                         if (!uarg)
1582                                 return -ENOBUFS;
1583                         extra_uref = !skb_zcopy(skb);   /* only ref on new uarg */
1584                         if (rt->dst.dev->features & NETIF_F_SG &&
1585                             csummode == CHECKSUM_PARTIAL) {
1586                                 paged = true;
1587                                 zc = true;
1588                         } else {
1589                                 uarg_to_msgzc(uarg)->zerocopy = 0;
1590                                 skb_zcopy_set(skb, uarg, &extra_uref);
1591                         }
1592                 }
1593         } else if ((flags & MSG_SPLICE_PAGES) && length) {
1594                 if (inet_sk(sk)->hdrincl)
1595                         return -EPERM;
1596                 if (rt->dst.dev->features & NETIF_F_SG)
1597                         /* We need an empty buffer to attach stuff to */
1598                         paged = true;
1599                 else
1600                         flags &= ~MSG_SPLICE_PAGES;
1601         }
1602
1603         /*
1604          * Let's try using as much space as possible.
1605          * Use MTU if total length of the message fits into the MTU.
1606          * Otherwise, we need to reserve fragment header and
1607          * fragment alignment (= 8-15 octets, in total).
1608          *
1609          * Note that we may need to "move" the data from the tail
1610          * of the buffer to the new fragment when we split
1611          * the message.
1612          *
1613          * FIXME: It may be fragmented into multiple chunks
1614          *        at once if non-fragmentable extension headers
1615          *        are too large.
1616          * --yoshfuji
1617          */
1618
1619         cork->length += length;
1620         if (!skb)
1621                 goto alloc_new_skb;
1622
1623         while (length > 0) {
1624                 /* Check if the remaining data fits into current packet. */
1625                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1626                 if (copy < length)
1627                         copy = maxfraglen - skb->len;
1628
1629                 if (copy <= 0) {
1630                         char *data;
1631                         unsigned int datalen;
1632                         unsigned int fraglen;
1633                         unsigned int fraggap;
1634                         unsigned int alloclen, alloc_extra;
1635                         unsigned int pagedlen;
1636 alloc_new_skb:
1637                         /* There's no room in the current skb */
1638                         if (skb)
1639                                 fraggap = skb->len - maxfraglen;
1640                         else
1641                                 fraggap = 0;
1642                         /* update mtu and maxfraglen if necessary */
1643                         if (!skb || !skb_prev)
1644                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1645                                                     fragheaderlen, skb, rt,
1646                                                     orig_mtu);
1647
1648                         skb_prev = skb;
1649
1650                         /*
1651                          * If remaining data exceeds the mtu,
1652                          * we know we need more fragment(s).
1653                          */
1654                         datalen = length + fraggap;
1655
1656                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1657                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1658                         fraglen = datalen + fragheaderlen;
1659                         pagedlen = 0;
1660
1661                         alloc_extra = hh_len;
1662                         alloc_extra += dst_exthdrlen;
1663                         alloc_extra += rt->dst.trailer_len;
1664
1665                         /* We just reserve space for fragment header.
1666                          * Note: this may be overallocation if the message
1667                          * (without MSG_MORE) fits into the MTU.
1668                          */
1669                         alloc_extra += sizeof(struct frag_hdr);
1670
1671                         if ((flags & MSG_MORE) &&
1672                             !(rt->dst.dev->features&NETIF_F_SG))
1673                                 alloclen = mtu;
1674                         else if (!paged &&
1675                                  (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1676                                   !(rt->dst.dev->features & NETIF_F_SG)))
1677                                 alloclen = fraglen;
1678                         else {
1679                                 alloclen = fragheaderlen + transhdrlen;
1680                                 pagedlen = datalen - transhdrlen;
1681                         }
1682                         alloclen += alloc_extra;
1683
1684                         if (datalen != length + fraggap) {
1685                                 /*
1686                                  * this is not the last fragment, the trailer
1687                                  * space is regarded as data space.
1688                                  */
1689                                 datalen += rt->dst.trailer_len;
1690                         }
1691
1692                         fraglen = datalen + fragheaderlen;
1693
1694                         copy = datalen - transhdrlen - fraggap - pagedlen;
1695                         if (copy < 0) {
1696                                 err = -EINVAL;
1697                                 goto error;
1698                         }
1699                         if (transhdrlen) {
1700                                 skb = sock_alloc_send_skb(sk, alloclen,
1701                                                 (flags & MSG_DONTWAIT), &err);
1702                         } else {
1703                                 skb = NULL;
1704                                 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1705                                     2 * sk->sk_sndbuf)
1706                                         skb = alloc_skb(alloclen,
1707                                                         sk->sk_allocation);
1708                                 if (unlikely(!skb))
1709                                         err = -ENOBUFS;
1710                         }
1711                         if (!skb)
1712                                 goto error;
1713                         /*
1714                          *      Fill in the control structures
1715                          */
1716                         skb->protocol = htons(ETH_P_IPV6);
1717                         skb->ip_summed = csummode;
1718                         skb->csum = 0;
1719                         /* reserve for fragmentation and ipsec header */
1720                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1721                                     dst_exthdrlen);
1722
1723                         /*
1724                          *      Find where to start putting bytes
1725                          */
1726                         data = skb_put(skb, fraglen - pagedlen);
1727                         skb_set_network_header(skb, exthdrlen);
1728                         data += fragheaderlen;
1729                         skb->transport_header = (skb->network_header +
1730                                                  fragheaderlen);
1731                         if (fraggap) {
1732                                 skb->csum = skb_copy_and_csum_bits(
1733                                         skb_prev, maxfraglen,
1734                                         data + transhdrlen, fraggap);
1735                                 skb_prev->csum = csum_sub(skb_prev->csum,
1736                                                           skb->csum);
1737                                 data += fraggap;
1738                                 pskb_trim_unique(skb_prev, maxfraglen);
1739                         }
1740                         if (copy > 0 &&
1741                             getfrag(from, data + transhdrlen, offset,
1742                                     copy, fraggap, skb) < 0) {
1743                                 err = -EFAULT;
1744                                 kfree_skb(skb);
1745                                 goto error;
1746                         }
1747
1748                         offset += copy;
1749                         length -= copy + transhdrlen;
1750                         transhdrlen = 0;
1751                         exthdrlen = 0;
1752                         dst_exthdrlen = 0;
1753
1754                         /* Only the initial fragment is time stamped */
1755                         skb_shinfo(skb)->tx_flags = cork->tx_flags;
1756                         cork->tx_flags = 0;
1757                         skb_shinfo(skb)->tskey = tskey;
1758                         tskey = 0;
1759                         skb_zcopy_set(skb, uarg, &extra_uref);
1760
1761                         if ((flags & MSG_CONFIRM) && !skb_prev)
1762                                 skb_set_dst_pending_confirm(skb, 1);
1763
1764                         /*
1765                          * Put the packet on the pending queue
1766                          */
1767                         if (!skb->destructor) {
1768                                 skb->destructor = sock_wfree;
1769                                 skb->sk = sk;
1770                                 wmem_alloc_delta += skb->truesize;
1771                         }
1772                         __skb_queue_tail(queue, skb);
1773                         continue;
1774                 }
1775
1776                 if (copy > length)
1777                         copy = length;
1778
1779                 if (!(rt->dst.dev->features&NETIF_F_SG) &&
1780                     skb_tailroom(skb) >= copy) {
1781                         unsigned int off;
1782
1783                         off = skb->len;
1784                         if (getfrag(from, skb_put(skb, copy),
1785                                                 offset, copy, off, skb) < 0) {
1786                                 __skb_trim(skb, off);
1787                                 err = -EFAULT;
1788                                 goto error;
1789                         }
1790                 } else if (flags & MSG_SPLICE_PAGES) {
1791                         struct msghdr *msg = from;
1792
1793                         err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
1794                                                    sk->sk_allocation);
1795                         if (err < 0)
1796                                 goto error;
1797                         copy = err;
1798                         wmem_alloc_delta += copy;
1799                 } else if (!zc) {
1800                         int i = skb_shinfo(skb)->nr_frags;
1801
1802                         err = -ENOMEM;
1803                         if (!sk_page_frag_refill(sk, pfrag))
1804                                 goto error;
1805
1806                         skb_zcopy_downgrade_managed(skb);
1807                         if (!skb_can_coalesce(skb, i, pfrag->page,
1808                                               pfrag->offset)) {
1809                                 err = -EMSGSIZE;
1810                                 if (i == MAX_SKB_FRAGS)
1811                                         goto error;
1812
1813                                 __skb_fill_page_desc(skb, i, pfrag->page,
1814                                                      pfrag->offset, 0);
1815                                 skb_shinfo(skb)->nr_frags = ++i;
1816                                 get_page(pfrag->page);
1817                         }
1818                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1819                         if (getfrag(from,
1820                                     page_address(pfrag->page) + pfrag->offset,
1821                                     offset, copy, skb->len, skb) < 0)
1822                                 goto error_efault;
1823
1824                         pfrag->offset += copy;
1825                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1826                         skb->len += copy;
1827                         skb->data_len += copy;
1828                         skb->truesize += copy;
1829                         wmem_alloc_delta += copy;
1830                 } else {
1831                         err = skb_zerocopy_iter_dgram(skb, from, copy);
1832                         if (err < 0)
1833                                 goto error;
1834                 }
1835                 offset += copy;
1836                 length -= copy;
1837         }
1838
1839         if (wmem_alloc_delta)
1840                 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1841         return 0;
1842
1843 error_efault:
1844         err = -EFAULT;
1845 error:
1846         net_zcopy_put_abort(uarg, extra_uref);
1847         cork->length -= length;
1848         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1849         refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1850         return err;
1851 }
1852
1853 int ip6_append_data(struct sock *sk,
1854                     int getfrag(void *from, char *to, int offset, int len,
1855                                 int odd, struct sk_buff *skb),
1856                     void *from, size_t length, int transhdrlen,
1857                     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1858                     struct rt6_info *rt, unsigned int flags)
1859 {
1860         struct inet_sock *inet = inet_sk(sk);
1861         struct ipv6_pinfo *np = inet6_sk(sk);
1862         int exthdrlen;
1863         int err;
1864
1865         if (flags&MSG_PROBE)
1866                 return 0;
1867         if (skb_queue_empty(&sk->sk_write_queue)) {
1868                 /*
1869                  * setup for corking
1870                  */
1871                 dst_hold(&rt->dst);
1872                 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1873                                      ipc6, rt);
1874                 if (err)
1875                         return err;
1876
1877                 inet->cork.fl.u.ip6 = *fl6;
1878                 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1879                 length += exthdrlen;
1880                 transhdrlen += exthdrlen;
1881         } else {
1882                 transhdrlen = 0;
1883         }
1884
1885         return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
1886                                  &np->cork, sk_page_frag(sk), getfrag,
1887                                  from, length, transhdrlen, flags, ipc6);
1888 }
1889 EXPORT_SYMBOL_GPL(ip6_append_data);
1890
1891 static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
1892 {
1893         struct dst_entry *dst = cork->base.dst;
1894
1895         cork->base.dst = NULL;
1896         cork->base.flags &= ~IPCORK_ALLFRAG;
1897         skb_dst_set(skb, dst);
1898 }
1899
1900 static void ip6_cork_release(struct inet_cork_full *cork,
1901                              struct inet6_cork *v6_cork)
1902 {
1903         if (v6_cork->opt) {
1904                 struct ipv6_txoptions *opt = v6_cork->opt;
1905
1906                 kfree(opt->dst0opt);
1907                 kfree(opt->dst1opt);
1908                 kfree(opt->hopopt);
1909                 kfree(opt->srcrt);
1910                 kfree(opt);
1911                 v6_cork->opt = NULL;
1912         }
1913
1914         if (cork->base.dst) {
1915                 dst_release(cork->base.dst);
1916                 cork->base.dst = NULL;
1917                 cork->base.flags &= ~IPCORK_ALLFRAG;
1918         }
1919 }
1920
/*
 * __ip6_make_skb - turn the skbs pending on @queue into one IPv6 packet
 * @sk:      socket the packet is built for (priority, ignore_df, stats)
 * @queue:   queue of pending skbs; it is drained by this function
 * @cork:    cork state holding the flow, dst, mark and transmit time
 * @v6_cork: IPv6 cork state holding options, traffic class and hop limit
 *
 * The first skb on @queue becomes the head; all following skbs are chained
 * on the head's frag_list and their len/truesize are folded into the head.
 * Extension headers (if any) and the IPv6 header are then pushed in front
 * of the payload, the cork's dst is moved onto the skb, output statistics
 * are updated and the cork is released.
 *
 * Return: the head skb, or NULL when @queue was empty (in that case the
 * cork is left untouched).
 */
struct sk_buff *__ip6_make_skb(struct sock *sk,
                               struct sk_buff_head *queue,
                               struct inet_cork_full *cork,
                               struct inet6_cork *v6_cork)
{
        struct sk_buff *skb, *tmp_skb;
        struct sk_buff **tail_skb;
        struct in6_addr *final_dst;
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct net *net = sock_net(sk);
        struct ipv6hdr *hdr;
        struct ipv6_txoptions *opt = v6_cork->opt;
        /* rt is read now because the cork's dst pointer is cleared below */
        struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
        struct flowi6 *fl6 = &cork->fl.u.ip6;
        unsigned char proto = fl6->flowi6_proto;

        skb = __skb_dequeue(queue);
        if (!skb)
                goto out;
        /* tail_skb always points at the link the next fragment is hung on */
        tail_skb = &(skb_shinfo(skb)->frag_list);

        /* move skb->data to ip header from ext header */
        if (skb->data < skb_network_header(skb))
                __skb_pull(skb, skb_network_offset(skb));
        while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
                /*
                 * Strip the header area from each follow-on skb; note the
                 * length is taken from the head skb, not from tmp_skb.
                 */
                __skb_pull(tmp_skb, skb_network_header_len(skb));
                *tail_skb = tmp_skb;
                tail_skb = &(tmp_skb->next);
                skb->len += tmp_skb->len;
                skb->data_len += tmp_skb->len;
                skb->truesize += tmp_skb->truesize;
                /*
                 * The fragment's truesize now lives in the head, so detach
                 * it from the socket (presumably so the memory is accounted
                 * only once, through the head - confirm).
                 */
                tmp_skb->destructor = NULL;
                tmp_skb->sk = NULL;
        }

        /* Allow local fragmentation. */
        skb->ignore_df = ip6_sk_ignore_df(sk);
        /* Strip the header area from the head too; real headers go in below */
        __skb_pull(skb, skb_network_header_len(skb));

        /*
         * Options are pushed innermost first.  Both helpers receive &proto,
         * and the non-fragmentable one also &final_dst, so either value may
         * be rewritten before it is stored in the IPv6 header below
         * (presumably next-header chaining and routing-header handling -
         * confirm against the helpers).
         */
        final_dst = &fl6->daddr;
        if (opt && opt->opt_flen)
                ipv6_push_frag_opts(skb, opt, &proto);
        if (opt && opt->opt_nflen)
                ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

        skb_push(skb, sizeof(struct ipv6hdr));
        skb_reset_network_header(skb);
        hdr = ipv6_hdr(skb);

        ip6_flow_hdr(hdr, v6_cork->tclass,
                     ip6_make_flowlabel(net, skb, fl6->flowlabel,
                                        ip6_autoflowlabel(net, np), fl6));
        hdr->hop_limit = v6_cork->hop_limit;
        hdr->nexthdr = proto;
        hdr->saddr = fl6->saddr;
        hdr->daddr = *final_dst;

        skb->priority = sk->sk_priority;
        skb->mark = cork->base.mark;
        skb->tstamp = cork->base.transmit_time;

        /* The skb takes over the cork's dst; rt (saved above) aliases it */
        ip6_cork_steal_dst(skb, cork);
        IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
        if (proto == IPPROTO_ICMPV6) {
                struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
                u8 icmp6_type;

                /*
                 * RAW sockets without hdrincl take the ICMPv6 type from the
                 * flow; everything else reads it from the packet itself.
                 */
                if (sk->sk_socket->type == SOCK_RAW && !inet_sk(sk)->hdrincl)
                        icmp6_type = fl6->fl6_icmp_type;
                else
                        icmp6_type = icmp6_hdr(skb)->icmp6_type;
                ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
                ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
        }

        /* dst was already moved to the skb, so this frees the options only */
        ip6_cork_release(cork, v6_cork);
out:
        return skb;
}
2000
2001 int ip6_send_skb(struct sk_buff *skb)
2002 {
2003         struct net *net = sock_net(skb->sk);
2004         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
2005         int err;
2006
2007         err = ip6_local_out(net, skb->sk, skb);
2008         if (err) {
2009                 if (err > 0)
2010                         err = net_xmit_errno(err);
2011                 if (err)
2012                         IP6_INC_STATS(net, rt->rt6i_idev,
2013                                       IPSTATS_MIB_OUTDISCARDS);
2014         }
2015
2016         return err;
2017 }
2018
/*
 * ip6_push_pending_frames - build and send whatever is pending on @sk
 * @sk: socket with pending data
 *
 * Return: 0 when ip6_finish_skb() produced no packet, otherwise the result
 * of ip6_send_skb().
 */
int ip6_push_pending_frames(struct sock *sk)
{
        struct sk_buff *pkt = ip6_finish_skb(sk);

        return pkt ? ip6_send_skb(pkt) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
2030
2031 static void __ip6_flush_pending_frames(struct sock *sk,
2032                                        struct sk_buff_head *queue,
2033                                        struct inet_cork_full *cork,
2034                                        struct inet6_cork *v6_cork)
2035 {
2036         struct sk_buff *skb;
2037
2038         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
2039                 if (skb_dst(skb))
2040                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
2041                                       IPSTATS_MIB_OUTDISCARDS);
2042                 kfree_skb(skb);
2043         }
2044
2045         ip6_cork_release(cork, v6_cork);
2046 }
2047
2048 void ip6_flush_pending_frames(struct sock *sk)
2049 {
2050         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
2051                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
2052 }
2053 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
2054
2055 struct sk_buff *ip6_make_skb(struct sock *sk,
2056                              int getfrag(void *from, char *to, int offset,
2057                                          int len, int odd, struct sk_buff *skb),
2058                              void *from, size_t length, int transhdrlen,
2059                              struct ipcm6_cookie *ipc6, struct rt6_info *rt,
2060                              unsigned int flags, struct inet_cork_full *cork)
2061 {
2062         struct inet6_cork v6_cork;
2063         struct sk_buff_head queue;
2064         int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
2065         int err;
2066
2067         if (flags & MSG_PROBE) {
2068                 dst_release(&rt->dst);
2069                 return NULL;
2070         }
2071
2072         __skb_queue_head_init(&queue);
2073
2074         cork->base.flags = 0;
2075         cork->base.addr = 0;
2076         cork->base.opt = NULL;
2077         v6_cork.opt = NULL;
2078         err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
2079         if (err) {
2080                 ip6_cork_release(cork, &v6_cork);
2081                 return ERR_PTR(err);
2082         }
2083         if (ipc6->dontfrag < 0)
2084                 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
2085
2086         err = __ip6_append_data(sk, &queue, cork, &v6_cork,
2087                                 &current->task_frag, getfrag, from,
2088                                 length + exthdrlen, transhdrlen + exthdrlen,
2089                                 flags, ipc6);
2090         if (err) {
2091                 __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
2092                 return ERR_PTR(err);
2093         }
2094
2095         return __ip6_make_skb(sk, &queue, cork, &v6_cork);
2096 }