// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *      IPv6 output functions
 *      Linux INET6 implementation
 *
 *      Authors:
 *      Pedro Roque             <roque@di.fc.ul.pt>
 *
 *      Based on linux/net/ipv4/ip_output.c
 *
 *      Changes:
 *      A.N.Kuznetsov   :       arithmetics in fragmentation.
 *                              extension headers are implemented.
 *                              route changes now work.
 *                              ip6_forward does not confuse sniffers.
 *                              etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *      Imran Patel     :       frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *                      :       add ip6_append_data and related functions
 *                              for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>

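/* Transmit a fully built IPv6 packet on its dst device: loop a copy of
 * multicast packets back to local listeners where required, honour
 * lightweight-tunnel xmit redirects, then resolve (or create) the
 * neighbour entry for the route's nexthop and queue the skb via
 * neigh_output().
 */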
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct net_device *dev = dst->dev;
        struct inet6_dev *idev = ip6_dst_idev(dst);
        unsigned int hh_len = LL_RESERVED_SPACE(dev);
        const struct in6_addr *daddr, *nexthop;
        struct ipv6hdr *hdr;
        struct neighbour *neigh;
        int ret;

        /* Be paranoid, rather than too clever. */
        if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
                skb = skb_expand_head(skb, hh_len);
                if (!skb) {
                        IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
                        return -ENOMEM;
                }
        }

        hdr = ipv6_hdr(skb);
        daddr = &hdr->daddr;
        if (ipv6_addr_is_multicast(daddr)) {
                if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
                    ((mroute6_is_socket(net, skb) &&
                     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
                     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

                        /* Do not check for IFF_ALLMULTI; multicast routing
                         * is not supported in any case.
                         */
                        if (newskb)
                                NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                                        net, sk, newskb, NULL, newskb->dev,
                                        dev_loopback_xmit);

                        if (hdr->hop_limit == 0) {
                                IP6_INC_STATS(net, idev,
                                              IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return 0;
                        }
                }

                IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
                if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
                    !(dev->flags & IFF_LOOPBACK)) {
                        kfree_skb(skb);
                        return 0;
                }
        }

        if (lwtunnel_xmit_redirect(dst->lwtstate)) {
                int res = lwtunnel_xmit(skb);

                if (res < 0 || res == LWTUNNEL_XMIT_DONE)
                        return res;
        }

        rcu_read_lock_bh();
        nexthop = rt6_nexthop((struct rt6_info *)dst, daddr);
        neigh = __ipv6_neigh_lookup_noref(dev, nexthop);

        if (unlikely(IS_ERR_OR_NULL(neigh))) {
                if (unlikely(!neigh))
                        neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
                if (IS_ERR(neigh)) {
                        rcu_read_unlock_bh();
                        IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
                        kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
                        return -EINVAL;
                }
        }
        sock_confirm_neigh(skb, neigh);
        ret = neigh_output(neigh, skb, false);
        rcu_read_unlock_bh();
        return ret;
}

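/* GSO packet whose segments would exceed the egress MTU: software-segment
 * it and push each resulting segment through ip6_fragment(), returning the
 * first error encountered, if any.
 */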
static int
ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
                                    struct sk_buff *skb, unsigned int mtu)
{
        struct sk_buff *segs, *nskb;
        netdev_features_t features;
        int ret = 0;

        /* Please see corresponding comment in ip_finish_output_gso
         * describing the cases where GSO segment length exceeds the
         * egress MTU.
         */
        features = netif_skb_features(skb);
        segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
        if (IS_ERR_OR_NULL(segs)) {
                kfree_skb(skb);
                return -ENOMEM;
        }

        consume_skb(skb);

        skb_list_walk_safe(segs, segs, nskb) {
                int err;

                skb_mark_not_on_list(segs);
                err = ip6_fragment(net, sk, segs, ip6_finish_output2);
                if (err && ret == 0)
                        ret = err;
        }

        return ret;
}

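/* Decide how a packet leaves the stack: re-run dst_output() if netfilter
 * SNAT installed a new xfrm policy, fragment when the packet (or its GSO
 * segments) exceeds the path MTU, and otherwise hand it straight to
 * ip6_finish_output2().
 */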
static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        unsigned int mtu;

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
        /* Policy lookup after SNAT yielded a new policy */
        if (skb_dst(skb)->xfrm) {
                IP6CB(skb)->flags |= IP6SKB_REROUTED;
                return dst_output(net, sk, skb);
        }
#endif

        mtu = ip6_skb_dst_mtu(skb);
        if (skb_is_gso(skb) &&
            !(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
            !skb_gso_validate_network_len(skb, mtu))
                return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);

        if ((skb->len > mtu && !skb_is_gso(skb)) ||
            dst_allfrag(skb_dst(skb)) ||
            (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
                return ip6_fragment(net, sk, skb, ip6_finish_output2);
        else
                return ip6_finish_output2(net, sk, skb);
}

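/* Run the cgroup BPF egress hook before the real output work; any verdict
 * other than NET_XMIT_SUCCESS or NET_XMIT_CN drops the packet.
 */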
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        int ret;

        ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
        switch (ret) {
        case NET_XMIT_SUCCESS:
        case NET_XMIT_CN:
                return __ip6_finish_output(net, sk, skb) ? : ret;
        default:
                kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
                return ret;
        }
}

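/* The IPv6 dst output function: discard if IPv6 is administratively
 * disabled on the egress device, otherwise traverse the
 * NF_INET_POST_ROUTING hook (skipped for rerouted packets) on the way to
 * ip6_finish_output().
 */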
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
        struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        if (unlikely(idev->cnf.disable_ipv6)) {
                IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
                kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
                return 0;
        }

        return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                            net, sk, skb, indev, dev,
                            ip6_finish_output,
                            !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
EXPORT_SYMBOL(ip6_output);

bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
        if (!np->autoflowlabel_set)
                return ip6_default_np_autolabel(net);
        else
                return np->autoflowlabel;
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note: the socket lock is not held for SYNACK packets, but the socket might
 * still be modified by calls to skb_set_owner_w() and ipv6_local_error(),
 * which use proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
             __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
{
        struct net *net = sock_net(sk);
        const struct ipv6_pinfo *np = inet6_sk(sk);
        struct in6_addr *first_hop = &fl6->daddr;
        struct dst_entry *dst = skb_dst(skb);
        struct net_device *dev = dst->dev;
        struct inet6_dev *idev = ip6_dst_idev(dst);
        struct hop_jumbo_hdr *hop_jumbo;
        int hoplen = sizeof(*hop_jumbo);
        unsigned int head_room;
        struct ipv6hdr *hdr;
        u8  proto = fl6->flowi6_proto;
        int seg_len = skb->len;
        int hlimit = -1;
        u32 mtu;

        head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
        if (opt)
                head_room += opt->opt_nflen + opt->opt_flen;

        if (unlikely(head_room > skb_headroom(skb))) {
                skb = skb_expand_head(skb, head_room);
                if (!skb) {
                        IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
                        return -ENOBUFS;
                }
        }

        if (opt) {
                seg_len += opt->opt_nflen + opt->opt_flen;

                if (opt->opt_flen)
                        ipv6_push_frag_opts(skb, opt, &proto);

                if (opt->opt_nflen)
                        ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
                                             &fl6->saddr);
        }

        if (unlikely(seg_len > IPV6_MAXPLEN)) {
                hop_jumbo = skb_push(skb, hoplen);

                hop_jumbo->nexthdr = proto;
                hop_jumbo->hdrlen = 0;
                hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
                hop_jumbo->tlv_len = 4;
                hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);

                proto = IPPROTO_HOPOPTS;
                seg_len = 0;
                IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
        }

        skb_push(skb, sizeof(struct ipv6hdr));
        skb_reset_network_header(skb);
        hdr = ipv6_hdr(skb);

        /*
         *      Fill in the IPv6 header
         */
        if (np)
                hlimit = np->hop_limit;
        if (hlimit < 0)
                hlimit = ip6_dst_hoplimit(dst);

        ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
                                ip6_autoflowlabel(net, np), fl6));

        hdr->payload_len = htons(seg_len);
        hdr->nexthdr = proto;
        hdr->hop_limit = hlimit;

        hdr->saddr = fl6->saddr;
        hdr->daddr = *first_hop;

        skb->protocol = htons(ETH_P_IPV6);
        skb->priority = priority;
        skb->mark = mark;

        mtu = dst_mtu(dst);
        if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
                IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);

                /* if egress device is enslaved to an L3 master device pass the
                 * skb to its handler for processing
                 */
                skb = l3mdev_ip6_out((struct sock *)sk, skb);
                if (unlikely(!skb))
                        return 0;

                /* hooks should never assume the socket lock is held,
                 * so we promote our socket to non-const
                 */
                return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
                               net, (struct sock *)sk, skb, NULL, dev,
                               dst_output);
        }

        skb->dev = dev;
        /* ipv6_local_error() does not require the socket lock,
         * so we promote our socket to non-const
         */
        ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

        IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

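/* Deliver a Router Alert packet to every raw socket registered on
 * ip6_ra_chain for the matching alert value. Returns 1 if at least one
 * socket consumed the skb, 0 otherwise.
 */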
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
        struct ip6_ra_chain *ra;
        struct sock *last = NULL;

        read_lock(&ip6_ra_lock);
        for (ra = ip6_ra_chain; ra; ra = ra->next) {
                struct sock *sk = ra->sk;
                if (sk && ra->sel == sel &&
                    (!sk->sk_bound_dev_if ||
                     sk->sk_bound_dev_if == skb->dev->ifindex)) {
                        struct ipv6_pinfo *np = inet6_sk(sk);

                        if (np && np->rtalert_isolate &&
                            !net_eq(sock_net(sk), dev_net(skb->dev))) {
                                continue;
                        }
                        if (last) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                                if (skb2)
                                        rawv6_rcv(last, skb2);
                        }
                        last = sk;
                }
        }

        if (last) {
                rawv6_rcv(last, skb);
                read_unlock(&ip6_ra_lock);
                return 1;
        }
        read_unlock(&ip6_ra_lock);
        return 0;
}

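/* For NDP-proxied destinations, decide what to do with a packet that
 * arrived for the proxied address: 1 means hand it to local input
 * (unicast neighbour discovery), -1 means drop (link-local destination),
 * 0 means keep forwarding.
 */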
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        u8 nexthdr = hdr->nexthdr;
        __be16 frag_off;
        int offset;

        if (ipv6_ext_hdr(nexthdr)) {
                offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
                if (offset < 0)
                        return 0;
        } else
                offset = sizeof(struct ipv6hdr);

        if (nexthdr == IPPROTO_ICMPV6) {
                struct icmp6hdr *icmp6;

                if (!pskb_may_pull(skb, (skb_network_header(skb) +
                                         offset + 1 - skb->data)))
                        return 0;

                icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

                switch (icmp6->icmp6_type) {
                case NDISC_ROUTER_SOLICITATION:
                case NDISC_ROUTER_ADVERTISEMENT:
                case NDISC_NEIGHBOUR_SOLICITATION:
                case NDISC_NEIGHBOUR_ADVERTISEMENT:
                case NDISC_REDIRECT:
                        /* Unicast neighbour discovery messages destined to
                         * the proxied address are passed to the input
                         * function.
                         */
                        return 1;
                default:
                        break;
                }
        }

        /*
         * The proxying router can't forward traffic sent to a link-local
         * address, so signal the sender and discard the packet. This
         * behavior is clarified by the MIPv6 specification.
         */
        if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
                dst_link_failure(skb);
                return -1;
        }

        return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
                                     struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);

        __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
        __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

#ifdef CONFIG_NET_SWITCHDEV
        if (skb->offload_l3_fwd_mark) {
                consume_skb(skb);
                return 0;
        }
#endif

        skb_clear_tstamp(skb);
        return dst_output(net, sk, skb);
}

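/* Does this packet exceed the effective MTU for forwarding, taking into
 * account conntrack defrag (frag_max_size), ignore_df and GSO segments?
 */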
static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
        if (skb->len <= mtu)
                return false;

        /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
        if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
                return true;

        if (skb->ignore_df)
                return false;

        if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
                return false;

        return true;
}

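/* Forward one IPv6 packet: validate that forwarding is enabled and the
 * packet is eligible, deliver Router Alert packets to interested sockets,
 * enforce the hop limit, handle NDP proxying, emit redirects where
 * appropriate, check the path MTU, then decrement hop_limit and pass the
 * packet through the NF_INET_FORWARD hook to ip6_forward_finish().
 */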
int ip6_forward(struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        struct inet6_skb_parm *opt = IP6CB(skb);
        struct net *net = dev_net(dst->dev);
        struct inet6_dev *idev;
        SKB_DR(reason);
        u32 mtu;

        idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
        if (net->ipv6.devconf_all->forwarding == 0)
                goto error;

        if (skb->pkt_type != PACKET_HOST)
                goto drop;

        if (unlikely(skb->sk))
                goto drop;

        if (skb_warn_if_lro(skb))
                goto drop;

        if (!net->ipv6.devconf_all->disable_policy &&
            (!idev || !idev->cnf.disable_policy) &&
            !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
                goto drop;
        }

        skb_forward_csum(skb);

        /*
         *      We DO NOT do any processing on RA packets, pushing them to
         *      user level AS IS without any warranty that the application
         *      will be able to interpret them. The reason is that we
         *      cannot make anything clever here.
         *
         *      We are not an end-node, so if the packet contains AH/ESP
         *      we cannot do anything. Defragmentation would also be a
         *      mistake: RA packets cannot be fragmented, because there is
         *      no warranty that different fragments will go along one
         *      path. --ANK
         */
        if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
                if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
                        return 0;
        }

        /*
         *      check and decrement ttl
         */
        if (hdr->hop_limit <= 1) {
                icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

                kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
                return -ETIMEDOUT;
        }

        /* XXX: idev->cnf.proxy_ndp? */
        if (net->ipv6.devconf_all->proxy_ndp &&
            pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
                int proxied = ip6_forward_proxy_check(skb);
                if (proxied > 0) {
                        hdr->hop_limit--;
                        return ip6_input(skb);
                } else if (proxied < 0) {
                        __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
                        goto drop;
                }
        }

        if (!xfrm6_route_forward(skb)) {
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
                SKB_DR_SET(reason, XFRM_POLICY);
                goto drop;
        }
        dst = skb_dst(skb);

        /* The IPv6 specs say nothing about it, but it is clear that we
         * cannot send redirects to source routed frames.
         * We don't send redirects to frames decapsulated from IPsec.
         */
        if (IP6CB(skb)->iif == dst->dev->ifindex &&
            opt->srcrt == 0 && !skb_sec_path(skb)) {
                struct in6_addr *target = NULL;
                struct inet_peer *peer;
                struct rt6_info *rt;

                /*
                 *      incoming and outgoing devices are the same:
                 *      send a redirect.
                 */

                rt = (struct rt6_info *) dst;
                if (rt->rt6i_flags & RTF_GATEWAY)
                        target = &rt->rt6i_gateway;
                else
                        target = &hdr->daddr;

                peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

                /* Limit redirects both by destination (here)
                 * and by source (inside ndisc_send_redirect)
                 */
                if (inet_peer_xrlim_allow(peer, 1*HZ))
                        ndisc_send_redirect(skb, target);
                if (peer)
                        inet_putpeer(peer);
        } else {
                int addrtype = ipv6_addr_type(&hdr->saddr);

                /* This check is security critical. */
                if (addrtype == IPV6_ADDR_ANY ||
                    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
                        goto error;
                if (addrtype & IPV6_ADDR_LINKLOCAL) {
                        icmpv6_send(skb, ICMPV6_DEST_UNREACH,
                                    ICMPV6_NOT_NEIGHBOUR, 0);
                        goto error;
                }
        }

        mtu = ip6_dst_mtu_maybe_forward(dst, true);
        if (mtu < IPV6_MIN_MTU)
                mtu = IPV6_MIN_MTU;

        if (ip6_pkt_too_big(skb, mtu)) {
                /* Again, force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_FRAGFAILS);
                kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
                return -EMSGSIZE;
        }

        if (skb_cow(skb, dst->dev->hard_header_len)) {
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_OUTDISCARDS);
                goto drop;
        }

        hdr = ipv6_hdr(skb);

        /* Mangling the hop count is delayed until after the skb COW */

        hdr->hop_limit--;

        return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
                       net, NULL, skb, skb->dev, dst->dev,
                       ip6_forward_finish);

error:
        __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
        SKB_DR_SET(reason, IP_INADDRERRORS);
drop:
        kfree_skb_reason(skb, reason);
        return -EINVAL;
}

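/* Copy the per-packet metadata (dst, device, priority, mark, hash,
 * tc index, netfilter and security state) from the original skb to a
 * freshly allocated fragment.
 */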
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        skb_dst_drop(to);
        skb_dst_set(to, dst_clone(skb_dst(from)));
        to->dev = from->dev;
        to->mark = from->mark;

        skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
        nf_copy(to, from);
        skb_ext_copy(to, from);
        skb_copy_secmark(to, from);
}

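/**
 *      ip6_fraglist_init - initialise fast-path fragmentation state
 *      @skb: packet whose frag_list members become the fragments
 *      @hlen: length of the unfragmentable header part
 *      @prevhdr: nexthdr field to rewrite to NEXTHDR_FRAGMENT
 *      @nexthdr: next header value for the fragment header
 *      @frag_id: fragment identification
 *      @iter: iterator state to initialise
 *
 *      Saves a copy of the headers, detaches the frag_list, builds the
 *      fragment header on the first fragment and prepares @iter for
 *      walking the remaining frag_list members.
 */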
int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
                      u8 nexthdr, __be32 frag_id,
                      struct ip6_fraglist_iter *iter)
{
        unsigned int first_len;
        struct frag_hdr *fh;

        /* BUILD HEADER */
        *prevhdr = NEXTHDR_FRAGMENT;
        iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
        if (!iter->tmp_hdr)
                return -ENOMEM;

        iter->frag = skb_shinfo(skb)->frag_list;
        skb_frag_list_init(skb);

        iter->offset = 0;
        iter->hlen = hlen;
        iter->frag_id = frag_id;
        iter->nexthdr = nexthdr;

        __skb_pull(skb, hlen);
        fh = __skb_push(skb, sizeof(struct frag_hdr));
        __skb_push(skb, hlen);
        skb_reset_network_header(skb);
        memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);

        fh->nexthdr = nexthdr;
        fh->reserved = 0;
        fh->frag_off = htons(IP6_MF);
        fh->identification = frag_id;

        first_len = skb_pagelen(skb);
        skb->data_len = first_len - skb_headlen(skb);
        skb->len = first_len;
        ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));

        return 0;
}
EXPORT_SYMBOL(ip6_fraglist_init);

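/**
 *      ip6_fraglist_prepare - prepare the next frag_list member for xmit
 *      @skb: fragment that was just set up (supplies the running offset)
 *      @iter: iterator state from ip6_fraglist_init()
 *
 *      Pushes a copy of the saved headers plus a fragment header onto the
 *      next frag_list skb and fills in its offset, MF flag and payload
 *      length.
 */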
void ip6_fraglist_prepare(struct sk_buff *skb,
                          struct ip6_fraglist_iter *iter)
{
        struct sk_buff *frag = iter->frag;
        unsigned int hlen = iter->hlen;
        struct frag_hdr *fh;

        frag->ip_summed = CHECKSUM_NONE;
        skb_reset_transport_header(frag);
        fh = __skb_push(frag, sizeof(struct frag_hdr));
        __skb_push(frag, hlen);
        skb_reset_network_header(frag);
        memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
        iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
        fh->nexthdr = iter->nexthdr;
        fh->reserved = 0;
        fh->frag_off = htons(iter->offset);
        if (frag->next)
                fh->frag_off |= htons(IP6_MF);
        fh->identification = iter->frag_id;
        ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
        ip6_copy_metadata(frag, skb);
}
EXPORT_SYMBOL(ip6_fraglist_prepare);

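/**
 *      ip6_frag_init - initialise slow-path fragmentation state
 *      @skb: packet to fragment
 *      @hlen: length of the unfragmentable header part
 *      @mtu: payload space available per fragment
 *      @needed_tailroom: tailroom to reserve in each fragment
 *      @hdr_room: headroom to reserve in each fragment
 *      @prevhdr: nexthdr field preceding the fragment header
 *      @nexthdr: next header value for the fragment header
 *      @frag_id: fragment identification
 *      @state: state block to initialise
 */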
void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
                   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
                   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
{
        state->prevhdr = prevhdr;
        state->nexthdr = nexthdr;
        state->frag_id = frag_id;

        state->hlen = hlen;
        state->mtu = mtu;

        state->left = skb->len - hlen;  /* Space per frame */
        state->ptr = hlen;              /* Where to start from */

        state->hroom = hdr_room;
        state->troom = needed_tailroom;

        state->offset = 0;
}
EXPORT_SYMBOL(ip6_frag_init);

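/**
 *      ip6_frag_next - allocate and fill the next slow-path fragment
 *      @skb: original packet being fragmented
 *      @state: state block initialised by ip6_frag_init()
 *
 *      Allocates a new skb, copies the headers and the next block of
 *      payload into it, builds its fragment header and advances @state.
 *      Returns the new fragment or an ERR_PTR() on allocation failure.
 */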
struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
{
        u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
        struct sk_buff *frag;
        struct frag_hdr *fh;
        unsigned int len;

        len = state->left;
        /* IF: it doesn't fit, use 'mtu' - the data space left */
        if (len > state->mtu)
                len = state->mtu;
        /* IF: we are not sending up to and including the packet end
         * then align the next start on an eight byte boundary
         */
        if (len < state->left)
                len &= ~7;

        /* Allocate buffer */
        frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
                         state->hroom + state->troom, GFP_ATOMIC);
        if (!frag)
                return ERR_PTR(-ENOMEM);

        /*
         *      Set up data on packet
         */

        ip6_copy_metadata(frag, skb);
        skb_reserve(frag, state->hroom);
        skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
        skb_reset_network_header(frag);
        fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
        frag->transport_header = (frag->network_header + state->hlen +
                                  sizeof(struct frag_hdr));

        /*
         *      Charge the memory for the fragment to any owner
         *      it might possess
         */
        if (skb->sk)
                skb_set_owner_w(frag, skb->sk);

        /*
         *      Copy the packet header into the new buffer.
         */
        skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);

        fragnexthdr_offset = skb_network_header(frag);
        fragnexthdr_offset += prevhdr - skb_network_header(skb);
        *fragnexthdr_offset = NEXTHDR_FRAGMENT;

        /*
         *      Build fragment header.
         */
        fh->nexthdr = state->nexthdr;
        fh->reserved = 0;
        fh->identification = state->frag_id;

        /*
         *      Copy a block of the IP datagram.
         */
        BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
                             len));
        state->left -= len;

        fh->frag_off = htons(state->offset);
        if (state->left > 0)
                fh->frag_off |= htons(IP6_MF);
        ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

        state->ptr += len;
        state->offset += len;

        return frag;
}
EXPORT_SYMBOL(ip6_frag_next);

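/* Fragment an IPv6 packet and hand each fragment to @output. Packets that
 * already carry a well-formed frag_list take the fast path, reusing the
 * existing buffers; everything else goes through the allocate-and-copy
 * slow path driven by ip6_frag_init()/ip6_frag_next().
 */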
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
                 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
        struct sk_buff *frag;
        struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
        struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
                                inet6_sk(skb->sk) : NULL;
        bool mono_delivery_time = skb->mono_delivery_time;
        struct ip6_frag_state state;
        unsigned int mtu, hlen, nexthdr_offset;
        ktime_t tstamp = skb->tstamp;
        int hroom, err = 0;
        __be32 frag_id;
        u8 *prevhdr, nexthdr = 0;

        err = ip6_find_1stfragopt(skb, &prevhdr);
        if (err < 0)
                goto fail;
        hlen = err;
        nexthdr = *prevhdr;
        nexthdr_offset = prevhdr - skb_network_header(skb);

        mtu = ip6_skb_dst_mtu(skb);

        /* We must not fragment if the socket is set to force MTU discovery
         * or if the skb is not generated by a local socket.
         */
        if (unlikely(!skb->ignore_df && skb->len > mtu))
                goto fail_toobig;

        if (IP6CB(skb)->frag_max_size) {
                if (IP6CB(skb)->frag_max_size > mtu)
                        goto fail_toobig;

                /* don't send fragments larger than what we received */
                mtu = IP6CB(skb)->frag_max_size;
                if (mtu < IPV6_MIN_MTU)
                        mtu = IPV6_MIN_MTU;
        }

        if (np && np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        if (mtu < hlen + sizeof(struct frag_hdr) + 8)
                goto fail_toobig;
        mtu -= hlen + sizeof(struct frag_hdr);

        frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
                                    &ipv6_hdr(skb)->saddr);

        if (skb->ip_summed == CHECKSUM_PARTIAL &&
            (err = skb_checksum_help(skb)))
                goto fail;

        prevhdr = skb_network_header(skb) + nexthdr_offset;
        hroom = LL_RESERVED_SPACE(rt->dst.dev);
        if (skb_has_frag_list(skb)) {
                unsigned int first_len = skb_pagelen(skb);
                struct ip6_fraglist_iter iter;
                struct sk_buff *frag2;

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    skb_cloned(skb) ||
                    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
                        goto slow_path;

                skb_walk_frags(skb, frag) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
                                goto slow_path_clean;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path_clean;

                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                        }
                        skb->truesize -= frag->truesize;
                }

                err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
                                        &iter);
                if (err < 0)
                        goto fail;

                /* We prevent @rt from being freed. */
                rcu_read_lock();

                for (;;) {
                        /* Prepare the header of the next frame,
                         * before the previous one goes down.
                         */
                        if (iter.frag)
                                ip6_fraglist_prepare(skb, &iter);

                        skb_set_delivery_time(skb, tstamp, mono_delivery_time);
                        err = output(net, sk, skb);
                        if (!err)
                                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                              IPSTATS_MIB_FRAGCREATES);

                        if (err || !iter.frag)
                                break;

                        skb = ip6_fraglist_next(&iter);
                }

                kfree(iter.tmp_hdr);

                if (err == 0) {
                        IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                      IPSTATS_MIB_FRAGOKS);
                        rcu_read_unlock();
                        return 0;
                }

                kfree_skb_list(iter.frag);

                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                              IPSTATS_MIB_FRAGFAILS);
                rcu_read_unlock();
                return err;

slow_path_clean:
                skb_walk_frags(skb, frag2) {
                        if (frag2 == frag)
                                break;
                        frag2->sk = NULL;
                        frag2->destructor = NULL;
                        skb->truesize += frag2->truesize;
                }
        }

slow_path:
        /*
         *      Fragment the datagram.
         */

        ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
                      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
                      &state);

        /*
         *      Keep copying data until we run out.
         */

        while (state.left > 0) {
                frag = ip6_frag_next(skb, &state);
                if (IS_ERR(frag)) {
                        err = PTR_ERR(frag);
                        goto fail;
                }

                /*
                 *      Put this fragment into the sending queue.
                 */
                skb_set_delivery_time(frag, tstamp, mono_delivery_time);
                err = output(net, sk, frag);
                if (err)
                        goto fail;

                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_FRAGCREATES);
        }
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGOKS);
        consume_skb(skb);
        return err;

fail_toobig:
        if (skb->sk && dst_allfrag(skb_dst(skb)))
                sk_gso_disable(skb->sk);

        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
        err = -EMSGSIZE;

fail:
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return err;
}

static inline int ip6_rt_check(const struct rt6key *rt_key,
                               const struct in6_addr *fl_addr,
                               const struct in6_addr *addr_cache)
{
        return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
                (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
                                          struct dst_entry *dst,
                                          const struct flowi6 *fl6)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct rt6_info *rt;

        if (!dst)
                goto out;

        if (dst->ops->family != AF_INET6) {
                dst_release(dst);
                return NULL;
        }

        rt = (struct rt6_info *)dst;
        /* Yes, checking route validity in the not connected
         * case is not very simple. Take into account
         * that we do not support routing by source, TOS,
         * and MSG_DONTROUTE            --ANK (980726)
         *
         * 1. ip6_rt_check(): If the route was a host route,
         *    check that the cached destination is current.
         *    If it is a network route, we still may
         *    check its validity using the saved pointer
         *    to the last used address: daddr_cache.
         *    We do not want to save the whole address now
         *    (because the main consumer of this service
         *    is tcp, which does not have this problem),
         *    so the last trick works only on connected
         *    sockets.
         * 2. oif should also be the same.
         */
        if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
            ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
           (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
                dst_release(dst);
                dst = NULL;
        }

out:
        return dst;
}

static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
                               struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        struct neighbour *n;
        struct rt6_info *rt;
#endif
        int err;
        int flags = 0;

        /* The correct way to handle this would be to do
         * ip6_route_get_saddr, and then ip6_route_output; however,
         * the route-specific preferred source forces the
         * ip6_route_output call _before_ ip6_route_get_saddr.
         *
         * In source specific routing (no src=any default route),
         * ip6_route_output will fail given src=any saddr, though, so
         * that's why we try it again later.
         */
        if (ipv6_addr_any(&fl6->saddr)) {
                struct fib6_info *from;
                struct rt6_info *rt;

                *dst = ip6_route_output(net, sk, fl6);
                rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;

                rcu_read_lock();
                from = rt ? rcu_dereference(rt->from) : NULL;
                err = ip6_route_get_saddr(net, from, &fl6->daddr,
                                          sk ? inet6_sk(sk)->srcprefs : 0,
                                          &fl6->saddr);
                rcu_read_unlock();

                if (err)
                        goto out_err_release;

                /* If we had an erroneous initial result, pretend it
                 * never existed and let the SA-enabled version take
                 * over.
                 */
                if ((*dst)->error) {
                        dst_release(*dst);
                        *dst = NULL;
                }

                if (fl6->flowi6_oif)
                        flags |= RT6_LOOKUP_F_IFACE;
        }

        if (!*dst)
                *dst = ip6_route_output_flags(net, sk, fl6, flags);

        err = (*dst)->error;
        if (err)
                goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        /*
         * Here if the dst entry we've looked up
         * has a neighbour entry that is in the INCOMPLETE
         * state and the src address from the flow is
         * marked as OPTIMISTIC, we release the found
         * dst entry and replace it instead with the
         * dst entry of the nexthop router
         */
        rt = (struct rt6_info *) *dst;
        rcu_read_lock_bh();
        n = __ipv6_neigh_lookup_noref(rt->dst.dev,
                                      rt6_nexthop(rt, &fl6->daddr));
        err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
        rcu_read_unlock_bh();

        if (err) {
                struct inet6_ifaddr *ifp;
                struct flowi6 fl_gw6;
                int redirect;

                ifp = ipv6_get_ifaddr(net, &fl6->saddr,
                                      (*dst)->dev, 1);

                redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
                if (ifp)
                        in6_ifa_put(ifp);

                if (redirect) {
                        /*
                         * We need to get the dst entry for the
                         * default router instead
                         */
                        dst_release(*dst);
                        memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
                        memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
                        *dst = ip6_route_output(net, sk, &fl_gw6);
                        err = (*dst)->error;
                        if (err)
                                goto out_err_release;
                }
        }
#endif
        if (ipv6_addr_v4mapped(&fl6->saddr) &&
            !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
                err = -EAFNOSUPPORT;
                goto out_err_release;
        }

        return 0;

out_err_release:
        dst_release(*dst);
        *dst = NULL;

        if (err == -ENETUNREACH)
                IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
        return err;
}

/**
 *      ip6_dst_lookup - perform route lookup on flow
 *      @net: Network namespace to perform lookup in
 *      @sk: socket which provides route info
 *      @dst: pointer to dst_entry * for result
 *      @fl6: flow to lookup
 *
 *      This function performs a route lookup on the given flow.
 *
 *      It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
                   struct flowi6 *fl6)
{
        *dst = NULL;
        return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *      @net: Network namespace to perform lookup in
 *      @sk: socket which provides route info
 *      @fl6: flow to lookup
 *      @final_dst: final destination address for ipsec lookup
 *
 *      This function performs a route lookup on the given flow.
 *
 *      It returns a valid dst pointer on success, or a pointer encoded
 *      error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
                                      const struct in6_addr *final_dst)
{
        struct dst_entry *dst = NULL;
        int err;

        err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
        if (err)
                return ERR_PTR(err);
        if (final_dst)
                fl6->daddr = *final_dst;

        return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *      @sk: socket which provides the dst cache and route info
 *      @fl6: flow to lookup
 *      @final_dst: final destination address for ipsec lookup
 *      @connected: whether @sk is connected or not
 *
 *      This function performs a route lookup on the given flow with the
 *      possibility of using the cached route in the socket if it is valid.
 *      It will take the socket dst lock when operating on the dst cache.
 *      As a result, this function can only be used in process context.
 *
 *      In addition, for a connected socket, cache the dst in the socket
 *      if the current cache is not valid.
 *
 *      It returns a valid dst pointer on success, or a pointer encoded
 *      error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
                                         const struct in6_addr *final_dst,
                                         bool connected)
{
        struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

        dst = ip6_sk_dst_check(sk, dst, fl6);
        if (dst)
                return dst;

        dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
        if (connected && !IS_ERR(dst))
                ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

        return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

/**
 *      ip6_dst_lookup_tunnel - perform route lookup on tunnel
 *      @skb: Packet for which lookup is done
 *      @dev: Tunnel device
 *      @net: Network namespace of tunnel device
 *      @sock: Socket which provides route info
 *      @saddr: Memory to store the src ip address
 *      @info: Tunnel information
 *      @protocol: IP protocol
 *      @use_cache: Flag to enable cache usage
 *
 *      This function performs a route lookup on a tunnel.
 *
 *      It returns a valid dst pointer and stores the src address to be
 *      used in the tunnel in @saddr on success, else a pointer encoded
 *      error code.
 */
struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
                                        struct net_device *dev,
                                        struct net *net,
                                        struct socket *sock,
                                        struct in6_addr *saddr,
                                        const struct ip_tunnel_info *info,
                                        u8 protocol,
                                        bool use_cache)
{
        struct dst_entry *dst = NULL;
#ifdef CONFIG_DST_CACHE
        struct dst_cache *dst_cache;
#endif
        struct flowi6 fl6;
        __u8 prio;

#ifdef CONFIG_DST_CACHE
        dst_cache = (struct dst_cache *)&info->dst_cache;
        if (use_cache) {
                dst = dst_cache_get_ip6(dst_cache, saddr);
                if (dst)
                        return dst;
        }
#endif
        memset(&fl6, 0, sizeof(fl6));
        fl6.flowi6_mark = skb->mark;
        fl6.flowi6_proto = protocol;
        fl6.daddr = info->key.u.ipv6.dst;
        fl6.saddr = info->key.u.ipv6.src;
        prio = info->key.tos;
        fl6.flowlabel = ip6_make_flowinfo(prio, info->key.label);

        dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
                                              NULL);
        if (IS_ERR(dst)) {
                netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
                return ERR_PTR(-ENETUNREACH);
        }
        if (dst->dev == dev) { /* is this necessary? */
                netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
                dst_release(dst);
                return ERR_PTR(-ELOOP);
        }
#ifdef CONFIG_DST_CACHE
        if (use_cache)
                dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
#endif
        *saddr = fl6.saddr;
        return dst;
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
                                               gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
                                                gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static void ip6_append_data_mtu(unsigned int *mtu,
                                int *maxfraglen,
                                unsigned int fragheaderlen,
                                struct sk_buff *skb,
                                struct rt6_info *rt,
                                unsigned int orig_mtu)
{
        if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
                if (!skb) {
                        /* first fragment, reserve header_len */
                        *mtu = orig_mtu - rt->dst.header_len;

                } else {
                        /*
                         * this fragment is not the first one, so the header
                         * space is regarded as data space.
                         */
                        *mtu = orig_mtu;
                }
                *maxfraglen = ((*mtu - fragheaderlen) & ~7)
                              + fragheaderlen - sizeof(struct frag_hdr);
        }
}

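/* Prepare the cork state for ip6_append_data(): take over the dst
 * reference, duplicate the tx options so they outlive the caller, and
 * record hop limit, traffic class, fragment size and timestamping flags.
 */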
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
                          struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
                          struct rt6_info *rt)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        unsigned int mtu;
        struct ipv6_txoptions *nopt, *opt = ipc6->opt;

        /* callers pass dst together with a reference, set it first so
         * ip6_cork_release() can put it down even in case of an error.
         */
        cork->base.dst = &rt->dst;

        /*
         * setup for corking
         */
        if (opt) {
                if (WARN_ON(v6_cork->opt))
                        return -EINVAL;

                nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
                if (unlikely(!nopt))
                        return -ENOBUFS;

                nopt->tot_len = sizeof(*opt);
                nopt->opt_flen = opt->opt_flen;
                nopt->opt_nflen = opt->opt_nflen;

                nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
                if (opt->dst0opt && !nopt->dst0opt)
                        return -ENOBUFS;

                nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
                if (opt->dst1opt && !nopt->dst1opt)
                        return -ENOBUFS;

                nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
                if (opt->hopopt && !nopt->hopopt)
                        return -ENOBUFS;

                nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
                if (opt->srcrt && !nopt->srcrt)
                        return -ENOBUFS;

                /* need source address above miyazawa */
        }
        v6_cork->hop_limit = ipc6->hlimit;
        v6_cork->tclass = ipc6->tclass;
        if (rt->dst.flags & DST_XFRM_TUNNEL)
                mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
                      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
        else
                mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
                        READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
        if (np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        cork->base.fragsize = mtu;
        cork->base.gso_size = ipc6->gso_size;
        cork->base.tx_flags = 0;
        cork->base.mark = ipc6->sockc.mark;
        sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);

        if (dst_allfrag(xfrm_dst_path(&rt->dst)))
                cork->base.flags |= IPCORK_ALLFRAG;
        cork->base.length = 0;

        cork->base.transmit_time = ipc6->sockc.transmit_time;

        return 0;
}

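/* Core of ip6_append_data(): grow the queued packet (or start a new one)
 * with up to @length bytes obtained through @getfrag, respecting the MTU,
 * socket send buffer and fragmentation rules established by
 * ip6_setup_cork().
 */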
static int __ip6_append_data(struct sock *sk,
			     struct sk_buff_head *queue,
			     struct inet_cork_full *cork_full,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, size_t length, int transhdrlen,
			     unsigned int flags, struct ipcm6_cookie *ipc6)
{
	struct sk_buff *skb, *skb_prev = NULL;
	struct inet_cork *cork = &cork_full->base;
	struct flowi6 *fl6 = &cork_full->fl.u.ip6;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	struct ubuf_info *uarg = NULL;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	bool zc = false;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;
	unsigned int wmem_alloc_delta = 0;
	bool paged, extra_uref = false;

	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	paged = !!cork->gso_size;
	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
	orig_mtu = mtu;

	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
		tskey = atomic_inc_return(&sk->sk_tskey) - 1;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     (dst_allfrag(&rt->dst) ?
		      sizeof(struct frag_hdr) : 0) +
		     rt->rt6i_nfheader_len;

	if (mtu <= fragheaderlen ||
	    ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
		goto emsgsize;

	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_ICMPV6 ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    (!(flags & MSG_MORE) || cork->gso_size) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	if ((flags & MSG_ZEROCOPY) && length) {
		struct msghdr *msg = from;

		if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
			if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
				return -EINVAL;

			/* Leave uarg NULL if we can't zerocopy, callers
			 * should be able to handle it.
			 */
			if ((rt->dst.dev->features & NETIF_F_SG) &&
			    csummode == CHECKSUM_PARTIAL) {
				paged = true;
				zc = true;
				uarg = msg->msg_ubuf;
			}
		} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
			uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
			if (!uarg)
				return -ENOBUFS;
			extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
			if (rt->dst.dev->features & NETIF_F_SG &&
			    csummode == CHECKSUM_PARTIAL) {
				paged = true;
				zc = true;
			} else {
				uarg_to_msgzc(uarg)->zerocopy = 0;
				skb_zcopy_set(skb, uarg, &extra_uref);
			}
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen, alloc_extra;
			unsigned int pagedlen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;
			pagedlen = 0;

			alloc_extra = hh_len;
			alloc_extra += dst_exthdrlen;
			alloc_extra += rt->dst.trailer_len;

			/* We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloc_extra += sizeof(struct frag_hdr);

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else if (!paged &&
				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
				  !(rt->dst.dev->features & NETIF_F_SG)))
				alloclen = fraglen;
			else {
				alloclen = fragheaderlen + transhdrlen;
				pagedlen = datalen - transhdrlen;
			}
			alloclen += alloc_extra;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			fraglen = datalen + fragheaderlen;

			copy = datalen - transhdrlen - fraggap - pagedlen;
			if (copy < 0) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk, alloclen,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
				    2 * sk->sk_sndbuf)
					skb = alloc_skb(alloclen,
							sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen - pagedlen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= copy + transhdrlen;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			if (!skb->destructor) {
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features & NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else if (!zc) {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			skb_zcopy_downgrade_managed(skb);
			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		} else {
			err = skb_zerocopy_iter_dgram(skb, from, copy);
			if (err < 0)
				goto error;
		}
		offset += copy;
		length -= copy;
	}

	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	net_zcopy_put_abort(uarg, extra_uref);
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return err;
}

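/*
 * getfrag() contract, for reference: copy 'len' bytes starting at
 * 'offset' within 'from' into 'to'; return 0 on success or a negative
 * errno.  'odd' is the byte offset of 'to' within the packet, so a
 * software checksum can be folded at the right parity (see
 * ip_generic_getfrag(), which real datagram senders pass in).  A
 * minimal sketch for data already in kernel memory (illustrative only,
 * not a helper defined in this file; it skips checksum accumulation):
 *
 *	static int simple_getfrag(void *from, char *to, int offset,
 *				  int len, int odd, struct sk_buff *skb)
 *	{
 *		memcpy(to, (char *)from + offset, len);
 *		return 0;
 *	}
 */
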
int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, size_t length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags & MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		dst_hold(&rt->dst);
		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
				     ipc6, rt);
		if (err)
			return err;

		inet->cork.fl.u.ip6 = *fl6;
		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, ipc6);
}
EXPORT_SYMBOL_GPL(ip6_append_data);

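/*
 * Corked usage, a sketch loosely following rawv6_sendmsg(): append under
 * the socket lock, flush on error, and push once the final chunk has
 * been queued without MSG_MORE.  Route lookup and ipc6 setup are elided.
 *
 *	lock_sock(sk);
 *	err = ip6_append_data(sk, getfrag, msg, len, 0, &ipc6,
 *			      &fl6, rt, msg->msg_flags);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!(msg->msg_flags & MSG_MORE))
 *		err = ip6_push_pending_frames(sk);
 *	release_sock(sk);
 */
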
/* Hand the cork's dst reference over to the skb about to be sent. */
static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
{
	struct dst_entry *dst = cork->base.dst;

	cork->base.dst = NULL;
	cork->base.flags &= ~IPCORK_ALLFRAG;
	skb_dst_set(skb, dst);
}

/* Free the duplicated tx options and drop any remaining dst reference. */
static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		struct ipv6_txoptions *opt = v6_cork->opt;

		kfree(opt->dst0opt);
		kfree(opt->dst1opt);
		kfree(opt->hopopt);
		kfree(opt->srcrt);
		kfree(opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
		cork->base.flags &= ~IPCORK_ALLFRAG;
	}
}

struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr *final_dst;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to the IP header, past any extension headers */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);
	__skb_pull(skb, skb_network_header_len(skb));

	final_dst = &fl6->daddr;
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = cork->base.mark;
	skb->tstamp = cork->base.transmit_time;

	ip6_cork_steal_dst(skb, cork);
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}

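/*
 * Resulting wire layout after __ip6_make_skb(), as a sketch (which
 * extension headers actually appear depends on the tx options that
 * were corked):
 *
 *	+----------+---------------------------+--------------+-----------+
 *	| IPv6 hdr | non-fragmentable ext hdrs | fragmentable | transport |
 *	|          | (hop-by-hop, dst0,        | ext hdrs     | + payload |
 *	|          |  routing) = opt_nflen     | (dst1) =     |           |
 *	|          |                           | opt_flen     |           |
 *	+----------+---------------------------+--------------+-----------+
 *
 * ipv6_push_nfrag_opts() also rewrites the destination address when a
 * routing header is present, which is why final_dst is passed by
 * reference above.
 */
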
int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	int err;

	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

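/*
 * Note on the err > 0 case above: qdiscs return positive NET_XMIT_*
 * codes.  net_xmit_errno() maps NET_XMIT_CN (congestion notification,
 * where the packet may still have been queued) to 0 and everything
 * else to -ENOBUFS, so only real drops bump OUTDISCARDS here.
 */
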
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);

struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, size_t length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct rt6_info *rt,
			     unsigned int flags, struct inet_cork_full *cork)
{
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE) {
		dst_release(&rt->dst);
		return NULL;
	}

	__skb_queue_head_init(&queue);

	cork->base.flags = 0;
	cork->base.addr = 0;
	cork->base.opt = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
	if (err) {
		ip6_cork_release(cork, &v6_cork);
		return ERR_PTR(err);
	}
	if (ipc6->dontfrag < 0)
		ipc6->dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, &queue, cork, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, ipc6);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}
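
/*
 * One-shot (uncorked) usage, a sketch loosely following udpv6_sendmsg():
 * the cork lives on the caller's stack and the socket write queue is
 * never touched.  udp_v6_send_skb() here stands in for whatever the
 * caller does with the finished skb.
 *
 *	struct inet_cork_full cork;
 *	struct sk_buff *skb;
 *
 *	skb = ip6_make_skb(sk, getfrag, msg, ulen, sizeof(struct udphdr),
 *			   &ipc6, rt, msg->msg_flags, &cork);
 *	err = PTR_ERR_OR_ZERO(skb);
 *	if (!IS_ERR_OR_NULL(skb))
 *		err = udp_v6_send_skb(skb, fl6, &cork.base);
 */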