1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * IPv6 output functions
4 * Linux INET6 implementation
7 * Pedro Roque <roque@di.fc.ul.pt>
9 * Based on linux/net/ipv4/ip_output.c
12 * A.N.Kuznetsov : arithmetics in fragmentation.
13 * extension headers are implemented.
14 * route changes now work.
15 * ip6_forward does not confuse sniffers.
18 * H. von Brand : Added missing #include <linux/string.h>
19 * Imran Patel : frag id should be in NBO
20 * Kazunori MIYAZAWA @USAGI
21 * : add ip6_append_data and related functions
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
46 #include <net/ndisc.h>
47 #include <net/protocol.h>
48 #include <net/ip6_route.h>
49 #include <net/addrconf.h>
50 #include <net/rawv6.h>
53 #include <net/checksum.h>
54 #include <linux/mroute6.h>
55 #include <net/l3mdev.h>
56 #include <net/lwtunnel.h>
57 #include <net/ip_tunnels.h>
/*
 * ip6_finish_output2 - last IPv6 output step before the neighbour layer.
 *
 * Visible flow: make sure the skb has LL_RESERVED_SPACE(dev) bytes of
 * headroom (skb_expand_head(); OUTDISCARDS is counted in that branch),
 * handle multicast destinations (a clone is passed to the POST_ROUTING
 * netfilter hook, hop_limit == 0 is accounted as OUTDISCARDS, OUTMCAST
 * statistics are updated, and there is a check for node-local-or-smaller
 * scope on a non-loopback device), give a lightweight tunnel the chance to
 * transmit, and finally look up (or create) the neighbour for the route's
 * next hop and send through neigh_output(). IPSTATS_MIB_OUTNOROUTES is
 * counted on the tail path after the neighbour branch.
 *
 * NOTE(review): this copy of the file is missing lines (braces, some
 * declarations and statements); the summary covers only what is visible.
 */
59 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
61 struct dst_entry *dst = skb_dst(skb);
62 struct net_device *dev = dst->dev;
63 struct inet6_dev *idev = ip6_dst_idev(dst);
64 unsigned int hh_len = LL_RESERVED_SPACE(dev);
65 const struct in6_addr *daddr, *nexthop;
67 struct neighbour *neigh;
70 /* Be paranoid, rather than too clever. */
71 if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
72 skb = skb_expand_head(skb, hh_len);
74 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
81 if (ipv6_addr_is_multicast(daddr)) {
82 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
83 ((mroute6_is_socket(net, skb) &&
84 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
85 ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
86 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
88 /* Do not check for IFF_ALLMULTI; multicast routing
89 is not supported in any case.
92 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
93 net, sk, newskb, NULL, newskb->dev,
96 if (hdr->hop_limit == 0) {
97 IP6_INC_STATS(net, idev,
98 IPSTATS_MIB_OUTDISCARDS);
104 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
105 if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
106 !(dev->flags & IFF_LOOPBACK)) {
112 if (lwtunnel_xmit_redirect(dst->lwtstate)) {
113 int res = lwtunnel_xmit(skb);
115 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
120 nexthop = rt6_nexthop((struct rt6_info *)dst, daddr);
121 neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
122 if (unlikely(!neigh))
123 neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
124 if (!IS_ERR(neigh)) {
125 sock_confirm_neigh(skb, neigh);
126 ret = neigh_output(neigh, skb, false);
127 rcu_read_unlock_bh();
130 rcu_read_unlock_bh();
132 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
/*
 * GSO slow path: segment the skb in software (skb_gso_segment() with the
 * NETIF_F_GSO_MASK feature bits cleared) and hand every resulting segment to
 * ip6_fragment(), using ip6_finish_output2 as the output callback.
 * IS_ERR_OR_NULL(segs) guards the segmentation result; that branch's body is
 * not visible in this copy.
 */
138 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
139 struct sk_buff *skb, unsigned int mtu)
141 struct sk_buff *segs, *nskb;
142 netdev_features_t features;
145 /* Please see corresponding comment in ip_finish_output_gso
146 * describing the cases where GSO segment length exceeds the
149 features = netif_skb_features(skb);
150 segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
151 if (IS_ERR_OR_NULL(segs)) {
158 skb_list_walk_safe(segs, segs, nskb) {
161 skb_mark_not_on_list(segs);
162 err = ip6_fragment(net, sk, segs, ip6_finish_output2);
/*
 * Choose how to emit an skb after the egress hooks:
 *  - with CONFIG_NETFILTER and CONFIG_XFRM, a dst carrying an xfrm state is
 *    flagged as rerouted and sent back through dst_output();
 *  - a GSO skb whose segments fail skb_gso_validate_network_len() for the
 *    MTU takes ip6_finish_output_gso_slowpath_drop();
 *  - an skb that is longer than the MTU (and not GSO), whose dst has
 *    dst_allfrag() set, or that exceeds a recorded frag_max_size goes to
 *    ip6_fragment();
 *  - everything else goes straight to ip6_finish_output2().
 *
 * NOTE(review): the reroute mark is written as IPCB() / IPSKB_REROUTED here,
 * while ip6_output() tests IP6CB() / IP6SKB_REROUTED - confirm these are
 * meant to refer to the same bit.
 */
170 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
174 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
175 /* Policy lookup after SNAT yielded a new policy */
176 if (skb_dst(skb)->xfrm) {
177 IPCB(skb)->flags |= IPSKB_REROUTED;
178 return dst_output(net, sk, skb);
182 mtu = ip6_skb_dst_mtu(skb);
183 if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))
184 return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
186 if ((skb->len > mtu && !skb_is_gso(skb)) ||
187 dst_allfrag(skb_dst(skb)) ||
188 (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
189 return ip6_fragment(net, sk, skb, ip6_finish_output2);
191 return ip6_finish_output2(net, sk, skb);
/*
 * Run the cgroup egress BPF program on the skb, then continue with
 * __ip6_finish_output(). On NET_XMIT_SUCCESS its result is returned as is;
 * the other visible branch returns its result when non-zero and otherwise
 * the BPF verdict held in ret.
 */
194 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
198 ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
200 case NET_XMIT_SUCCESS:
201 return __ip6_finish_output(net, sk, skb);
203 return __ip6_finish_output(net, sk, skb) ? : ret;
/*
 * ip6_output - IPv6 output entry point (exported).
 *
 * Sets skb->protocol to ETH_P_IPV6, counts IPSTATS_MIB_OUTDISCARDS in the
 * branch taken when the egress idev has disable_ipv6 set, and otherwise
 * passes the skb through the POST_ROUTING hook. The NF_HOOK_COND condition is
 * that IP6SKB_REROUTED is not set in the skb's IP6CB flags.
 */
210 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
212 struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
213 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
215 skb->protocol = htons(ETH_P_IPV6);
218 if (unlikely(idev->cnf.disable_ipv6)) {
219 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
224 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
225 net, sk, skb, indev, dev,
227 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
229 EXPORT_SYMBOL(ip6_output);
/*
 * Report whether automatic flow labels apply to this socket: the socket's
 * own autoflowlabel value when autoflowlabel_set is set, otherwise the
 * namespace default from ip6_default_np_autolabel().
 */
231 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
233 if (!np->autoflowlabel_set)
234 return ip6_default_np_autolabel(net);
236 return np->autoflowlabel;
/*
 * ip6_xmit (exported) - prepend extension headers and the IPv6 header to an
 * skb and send it through the LOCAL_OUT hook.
 *
 * Visible steps: require sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dev)
 * plus the option lengths of headroom (skb_expand_head() if short), push the
 * fragmentable and non-fragmentable options, push and fill the IPv6 header
 * from fl6, tclass and the chosen hop limit, then set skb->protocol and
 * skb->priority. If the packet fits mtu, has ignore_df set or is GSO, OUT
 * statistics are updated, l3mdev_ip6_out() is applied and the skb enters
 * NF_INET_LOCAL_OUT. The code after that return reports EMSGSIZE through
 * ipv6_local_error() and counts IPSTATS_MIB_FRAGFAILS.
 */
240 * xmit an sk_buff (used by TCP, SCTP and DCCP)
241 * Note : socket lock is not held for SYNACK packets, but might be modified
242 * by calls to skb_set_owner_w() and ipv6_local_error(),
243 * which are using proper atomic operations or spinlocks.
245 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
246 __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
248 struct net *net = sock_net(sk);
249 const struct ipv6_pinfo *np = inet6_sk(sk);
250 struct in6_addr *first_hop = &fl6->daddr;
251 struct dst_entry *dst = skb_dst(skb);
252 struct net_device *dev = dst->dev;
253 struct inet6_dev *idev = ip6_dst_idev(dst);
254 unsigned int head_room;
256 u8 proto = fl6->flowi6_proto;
257 int seg_len = skb->len;
261 head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dev);
263 head_room += opt->opt_nflen + opt->opt_flen;
265 if (unlikely(head_room > skb_headroom(skb))) {
266 skb = skb_expand_head(skb, head_room);
268 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
274 seg_len += opt->opt_nflen + opt->opt_flen;
277 ipv6_push_frag_opts(skb, opt, &proto);
280 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
284 skb_push(skb, sizeof(struct ipv6hdr));
285 skb_reset_network_header(skb);
289 * Fill in the IPv6 header
292 hlimit = np->hop_limit;
294 hlimit = ip6_dst_hoplimit(dst);
296 ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
297 ip6_autoflowlabel(net, np), fl6));
299 hdr->payload_len = htons(seg_len);
300 hdr->nexthdr = proto;
301 hdr->hop_limit = hlimit;
303 hdr->saddr = fl6->saddr;
304 hdr->daddr = *first_hop;
306 skb->protocol = htons(ETH_P_IPV6);
307 skb->priority = priority;
311 if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
312 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
314 /* if egress device is enslaved to an L3 master device pass the
315 * skb to its handler for processing
317 skb = l3mdev_ip6_out((struct sock *)sk, skb);
321 /* hooks should never assume socket lock is held.
322 * we promote our socket to non const
324 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
325 net, (struct sock *)sk, skb, NULL, dev,
330 /* ipv6_local_error() does not require socket lock,
331 * we promote our socket to non const
333 ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
335 IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
339 EXPORT_SYMBOL(ip6_xmit);
/*
 * Deliver a Router Alert packet to the raw sockets registered on
 * ip6_ra_chain. The chain is walked under read_lock(&ip6_ra_lock); an entry
 * is considered when its socket exists, its selector equals sel and the
 * socket is either unbound or bound to the receiving device. Entries with
 * rtalert_isolate set are additionally checked against the device's network
 * namespace. Clones of the skb are handed to rawv6_rcv() while iterating and
 * the original skb is handed to the socket remembered in last. The returned
 * values are not visible in this copy.
 */
341 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
343 struct ip6_ra_chain *ra;
344 struct sock *last = NULL;
346 read_lock(&ip6_ra_lock);
347 for (ra = ip6_ra_chain; ra; ra = ra->next) {
348 struct sock *sk = ra->sk;
349 if (sk && ra->sel == sel &&
350 (!sk->sk_bound_dev_if ||
351 sk->sk_bound_dev_if == skb->dev->ifindex)) {
352 struct ipv6_pinfo *np = inet6_sk(sk);
354 if (np && np->rtalert_isolate &&
355 !net_eq(sock_net(sk), dev_net(skb->dev))) {
359 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
361 rawv6_rcv(last, skb2);
368 rawv6_rcv(last, skb);
369 read_unlock(&ip6_ra_lock);
372 read_unlock(&ip6_ra_lock);
/*
 * Classify a packet whose destination matched a proxied neighbour entry.
 * Extension headers are skipped with ipv6_skip_exthdr(); for ICMPv6 the
 * first byte of the ICMPv6 header is made available with pskb_may_pull() and
 * the four neighbour discovery message types (router/neighbour solicitation
 * and advertisement) are singled out. A link-local destination triggers
 * dst_link_failure(). Return values are not visible in this copy; the caller
 * counts IPSTATS_MIB_INDISCARDS for a negative result.
 */
376 static int ip6_forward_proxy_check(struct sk_buff *skb)
378 struct ipv6hdr *hdr = ipv6_hdr(skb);
379 u8 nexthdr = hdr->nexthdr;
383 if (ipv6_ext_hdr(nexthdr)) {
384 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
388 offset = sizeof(struct ipv6hdr);
390 if (nexthdr == IPPROTO_ICMPV6) {
391 struct icmp6hdr *icmp6;
393 if (!pskb_may_pull(skb, (skb_network_header(skb) +
394 offset + 1 - skb->data)))
397 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
399 switch (icmp6->icmp6_type) {
400 case NDISC_ROUTER_SOLICITATION:
401 case NDISC_ROUTER_ADVERTISEMENT:
402 case NDISC_NEIGHBOUR_SOLICITATION:
403 case NDISC_NEIGHBOUR_ADVERTISEMENT:
405 /* For reaction involving unicast neighbor discovery
406 * message destined to the proxied address, pass it to
416 * The proxying router can't forward traffic sent to a link-local
417 * address, so signal the sender and discard the packet. This
418 * behavior is clarified by the MIPv6 specification.
420 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
421 dst_link_failure(skb);
/*
 * Final step of forwarding: account the datagram and its length in
 * OUTFORWDATAGRAMS / OUTOCTETS for the egress idev and pass the skb to
 * dst_output(). With CONFIG_NET_SWITCHDEV there is a separate branch for
 * skbs carrying offload_l3_fwd_mark (its body is not visible in this copy).
 */
428 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
431 struct dst_entry *dst = skb_dst(skb);
433 __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
434 __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
436 #ifdef CONFIG_NET_SWITCHDEV
437 if (skb->offload_l3_fwd_mark) {
444 return dst_output(net, sk, skb);
/*
 * Decide whether a forwarded skb is too big for mtu. Visible tests: a
 * recorded frag_max_size larger than mtu, and a GSO skb whose segments pass
 * skb_gso_validate_network_len(). The result returned for each test is not
 * visible in this copy.
 */
447 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
452 /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
453 if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
459 if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
/*
 * ip6_forward - forward a received IPv6 packet along its dst.
 *
 * Checks visible in this copy, in order: forwarding enabled in devconf_all,
 * pkt_type is PACKET_HOST, no socket attached, no LRO, XFRM forward policy
 * (unless disable_policy is set), Router Alert delivery via
 * ip6_call_ra_chain(), hop limit (ICMPv6 time exceeded when <= 1), NDP
 * proxying, xfrm6_route_forward(), rate-limited redirects when the packet
 * leaves through the interface it arrived on, source address sanity, and the
 * forwarding MTU (ICMPv6 packet-too-big). skb_cow() is then called with the
 * egress device's hard_header_len and the skb is handed to the
 * NF_INET_FORWARD hook. Each failed check bumps the statistic shown next to
 * it; the jump targets themselves are not visible in this copy.
 */
465 int ip6_forward(struct sk_buff *skb)
467 struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
468 struct dst_entry *dst = skb_dst(skb);
469 struct ipv6hdr *hdr = ipv6_hdr(skb);
470 struct inet6_skb_parm *opt = IP6CB(skb);
471 struct net *net = dev_net(dst->dev);
474 if (net->ipv6.devconf_all->forwarding == 0)
477 if (skb->pkt_type != PACKET_HOST)
480 if (unlikely(skb->sk))
483 if (skb_warn_if_lro(skb))
486 if (!net->ipv6.devconf_all->disable_policy &&
487 !idev->cnf.disable_policy &&
488 !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
489 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
493 skb_forward_csum(skb);
496 * We DO NOT make any processing on
497 * RA packets, pushing them to user level AS IS
498 * without ane WARRANTY that application will be able
499 * to interpret them. The reason is that we
500 * cannot make anything clever here.
502 * We are not end-node, so that if packet contains
503 * AH/ESP, we cannot make anything.
504 * Defragmentation also would be mistake, RA packets
505 * cannot be fragmented, because there is no warranty
506 * that different fragments will go along one path. --ANK
508 if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
509 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
514 * check and decrement ttl
516 if (hdr->hop_limit <= 1) {
517 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
518 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
524 /* XXX: idev->cnf.proxy_ndp? */
525 if (net->ipv6.devconf_all->proxy_ndp &&
526 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
527 int proxied = ip6_forward_proxy_check(skb);
530 return ip6_input(skb);
531 } else if (proxied < 0) {
532 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
537 if (!xfrm6_route_forward(skb)) {
538 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
543 /* IPv6 specs say nothing about it, but it is clear that we cannot
544 send redirects to source routed frames.
545 We don't send redirects to frames decapsulated from IPsec.
547 if (IP6CB(skb)->iif == dst->dev->ifindex &&
548 opt->srcrt == 0 && !skb_sec_path(skb)) {
549 struct in6_addr *target = NULL;
550 struct inet_peer *peer;
554 * incoming and outgoing devices are the same
558 rt = (struct rt6_info *) dst;
559 if (rt->rt6i_flags & RTF_GATEWAY)
560 target = &rt->rt6i_gateway;
562 target = &hdr->daddr;
564 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
566 /* Limit redirects both by destination (here)
567 and by source (inside ndisc_send_redirect)
569 if (inet_peer_xrlim_allow(peer, 1*HZ))
570 ndisc_send_redirect(skb, target);
574 int addrtype = ipv6_addr_type(&hdr->saddr);
576 /* This check is security critical. */
577 if (addrtype == IPV6_ADDR_ANY ||
578 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
580 if (addrtype & IPV6_ADDR_LINKLOCAL) {
581 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
582 ICMPV6_NOT_NEIGHBOUR, 0);
587 mtu = ip6_dst_mtu_maybe_forward(dst, true);
588 if (mtu < IPV6_MIN_MTU)
591 if (ip6_pkt_too_big(skb, mtu)) {
592 /* Again, force OUTPUT device used as source address */
594 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
595 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
596 __IP6_INC_STATS(net, ip6_dst_idev(dst),
597 IPSTATS_MIB_FRAGFAILS);
602 if (skb_cow(skb, dst->dev->hard_header_len)) {
603 __IP6_INC_STATS(net, ip6_dst_idev(dst),
604 IPSTATS_MIB_OUTDISCARDS);
610 /* Mangling hops number delayed to point after skb COW */
614 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
615 net, NULL, skb, skb->dev, dst->dev,
619 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
/*
 * Copy per-packet metadata from one skb to another: pkt_type, priority,
 * protocol, a cloned reference to the dst, mark, hash, tc_index (under
 * CONFIG_NET_SCHED), skb extensions and the security mark.
 */
625 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
627 to->pkt_type = from->pkt_type;
628 to->priority = from->priority;
629 to->protocol = from->protocol;
631 skb_dst_set(to, dst_clone(skb_dst(from)));
633 to->mark = from->mark;
635 skb_copy_hash(to, from);
637 #ifdef CONFIG_NET_SCHED
638 to->tc_index = from->tc_index;
641 skb_ext_copy(to, from);
642 skb_copy_secmark(to, from);
/*
 * ip6_fraglist_init (exported) - turn the head skb of a frag_list into the
 * first fragment and set up the iterator for the remaining ones.
 *
 * The byte at prevhdr is rewritten to NEXTHDR_FRAGMENT, the first hlen bytes
 * of the network header are saved in iter->tmp_hdr (kmemdup, GFP_ATOMIC), the
 * frag_list is detached into iter->frag, and a fragment header carrying
 * IP6_MF and frag_id is inserted after the copied headers. skb->len,
 * skb->data_len and payload_len are then recomputed from skb_pagelen().
 */
645 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
646 u8 nexthdr, __be32 frag_id,
647 struct ip6_fraglist_iter *iter)
649 unsigned int first_len;
653 *prevhdr = NEXTHDR_FRAGMENT;
654 iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
658 iter->frag = skb_shinfo(skb)->frag_list;
659 skb_frag_list_init(skb);
663 iter->frag_id = frag_id;
664 iter->nexthdr = nexthdr;
666 __skb_pull(skb, hlen);
667 fh = __skb_push(skb, sizeof(struct frag_hdr));
668 __skb_push(skb, hlen);
669 skb_reset_network_header(skb);
670 memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
672 fh->nexthdr = nexthdr;
674 fh->frag_off = htons(IP6_MF);
675 fh->identification = frag_id;
677 first_len = skb_pagelen(skb);
678 skb->data_len = first_len - skb_headlen(skb);
679 skb->len = first_len;
680 ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
684 EXPORT_SYMBOL(ip6_fraglist_init);
/*
 * ip6_fraglist_prepare (exported) - prepare the next frag_list member
 * (iter->frag) for transmission: push a fragment header and the saved
 * unfragmentable headers in front of it, advance iter->offset by the payload
 * carried in skb, fill in offset, identification and payload_len, and copy
 * the metadata from skb. IP6_MF is OR-ed in on a separate line whose
 * condition is not visible in this copy.
 */
686 void ip6_fraglist_prepare(struct sk_buff *skb,
687 struct ip6_fraglist_iter *iter)
689 struct sk_buff *frag = iter->frag;
690 unsigned int hlen = iter->hlen;
693 frag->ip_summed = CHECKSUM_NONE;
694 skb_reset_transport_header(frag);
695 fh = __skb_push(frag, sizeof(struct frag_hdr));
696 __skb_push(frag, hlen);
697 skb_reset_network_header(frag);
698 memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
699 iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
700 fh->nexthdr = iter->nexthdr;
702 fh->frag_off = htons(iter->offset);
704 fh->frag_off |= htons(IP6_MF);
705 fh->identification = iter->frag_id;
706 ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
707 ip6_copy_metadata(frag, skb);
709 EXPORT_SYMBOL(ip6_fraglist_prepare);
/*
 * ip6_frag_init (exported) - initialise the slow-path fragmentation state:
 * remembers prevhdr, nexthdr and frag_id, sets left to skb->len - hlen,
 * starts ptr at hlen and records the head room (hdr_room) and tail room
 * (needed_tailroom) values in hroom / troom.
 */
711 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
712 unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
713 u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
715 state->prevhdr = prevhdr;
716 state->nexthdr = nexthdr;
717 state->frag_id = frag_id;
722 state->left = skb->len - hlen; /* Space per frame */
723 state->ptr = hlen; /* Where to start from */
725 state->hroom = hdr_room;
726 state->troom = needed_tailroom;
730 EXPORT_SYMBOL(ip6_frag_init);
/*
 * ip6_frag_next (exported) - build the next fragment on the slow path.
 *
 * Allocates a new skb (GFP_ATOMIC) sized for the chunk plus headers and the
 * recorded head/tail room, copies metadata and the first state->hlen bytes of
 * headers from skb, marks the previous header's next-header byte as
 * NEXTHDR_FRAGMENT, fills in the fragment header and copies the data block
 * with skb_copy_bits(). state->offset is advanced by the chunk length.
 * ERR_PTR(-ENOMEM) is returned on the visible error path.
 */
732 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
734 u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
735 struct sk_buff *frag;
740 /* IF: it doesn't fit, use 'mtu' - the data space left */
741 if (len > state->mtu)
743 /* IF: we are not sending up to and including the packet end
744 then align the next start on an eight byte boundary */
745 if (len < state->left)
748 /* Allocate buffer */
749 frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
750 state->hroom + state->troom, GFP_ATOMIC);
752 return ERR_PTR(-ENOMEM);
755 * Set up data on packet
758 ip6_copy_metadata(frag, skb);
759 skb_reserve(frag, state->hroom);
760 skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
761 skb_reset_network_header(frag);
762 fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
763 frag->transport_header = (frag->network_header + state->hlen +
764 sizeof(struct frag_hdr));
767 * Charge the memory for the fragment to any owner
771 skb_set_owner_w(frag, skb->sk);
774 * Copy the packet header into the new buffer.
776 skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
778 fragnexthdr_offset = skb_network_header(frag);
779 fragnexthdr_offset += prevhdr - skb_network_header(skb);
780 *fragnexthdr_offset = NEXTHDR_FRAGMENT;
783 * Build fragment header.
785 fh->nexthdr = state->nexthdr;
787 fh->identification = state->frag_id;
790 * Copy a block of the IP datagram.
792 BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
796 fh->frag_off = htons(state->offset);
798 fh->frag_off |= htons(IP6_MF);
799 ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
802 state->offset += len;
806 EXPORT_SYMBOL(ip6_frag_next);
/*
 * ip6_fragment - split an skb into IPv6 fragments and pass each one to the
 * output callback.
 *
 * After locating the first fragmentable option (ip6_find_1stfragopt()) and
 * deriving the usable MTU (ip6_skb_dst_mtu(), frag_max_size, the socket's
 * frag_size), a frag_list with suitable geometry is sent through the fast
 * path (ip6_fraglist_init / ip6_fraglist_prepare / ip6_fraglist_next);
 * otherwise the slow path copies data into freshly allocated skbs with
 * ip6_frag_init() and ip6_frag_next(). FRAGCREATES, FRAGOKS and FRAGFAILS
 * are counted on the corresponding paths, and ICMPV6_PKT_TOOBIG is sent from
 * the visible too-big path.
 */
808 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
809 int (*output)(struct net *, struct sock *, struct sk_buff *))
811 struct sk_buff *frag;
812 struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
813 struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
814 inet6_sk(skb->sk) : NULL;
815 struct ip6_frag_state state;
816 unsigned int mtu, hlen, nexthdr_offset;
817 ktime_t tstamp = skb->tstamp;
820 u8 *prevhdr, nexthdr = 0;
822 err = ip6_find_1stfragopt(skb, &prevhdr);
827 nexthdr_offset = prevhdr - skb_network_header(skb);
829 mtu = ip6_skb_dst_mtu(skb);
831 /* We must not fragment if the socket is set to force MTU discovery
832 * or if the skb is not generated by a local socket.
834 if (unlikely(!skb->ignore_df && skb->len > mtu))
837 if (IP6CB(skb)->frag_max_size) {
838 if (IP6CB(skb)->frag_max_size > mtu)
841 /* don't send fragments larger than what we received */
842 mtu = IP6CB(skb)->frag_max_size;
843 if (mtu < IPV6_MIN_MTU)
847 if (np && np->frag_size < mtu) {
851 if (mtu < hlen + sizeof(struct frag_hdr) + 8)
853 mtu -= hlen + sizeof(struct frag_hdr);
855 frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
856 &ipv6_hdr(skb)->saddr);
858 if (skb->ip_summed == CHECKSUM_PARTIAL &&
859 (err = skb_checksum_help(skb)))
862 prevhdr = skb_network_header(skb) + nexthdr_offset;
863 hroom = LL_RESERVED_SPACE(rt->dst.dev);
864 if (skb_has_frag_list(skb)) {
865 unsigned int first_len = skb_pagelen(skb);
866 struct ip6_fraglist_iter iter;
867 struct sk_buff *frag2;
869 if (first_len - hlen > mtu ||
870 ((first_len - hlen) & 7) ||
872 skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
875 skb_walk_frags(skb, frag) {
876 /* Correct geometry. */
877 if (frag->len > mtu ||
878 ((frag->len & 7) && frag->next) ||
879 skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
880 goto slow_path_clean;
882 /* Partially cloned skb? */
883 if (skb_shared(frag))
884 goto slow_path_clean;
889 frag->destructor = sock_wfree;
891 skb->truesize -= frag->truesize;
894 err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
900 /* Prepare header of the next frame,
901 * before previous one went down. */
903 ip6_fraglist_prepare(skb, &iter);
905 skb->tstamp = tstamp;
906 err = output(net, sk, skb);
908 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
909 IPSTATS_MIB_FRAGCREATES);
911 if (err || !iter.frag)
914 skb = ip6_fraglist_next(&iter);
920 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
921 IPSTATS_MIB_FRAGOKS);
925 kfree_skb_list(iter.frag);
927 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
928 IPSTATS_MIB_FRAGFAILS);
932 skb_walk_frags(skb, frag2) {
936 frag2->destructor = NULL;
937 skb->truesize += frag2->truesize;
943 * Fragment the datagram.
946 ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
947 LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
951 * Keep copying data until we run out.
954 while (state.left > 0) {
955 frag = ip6_frag_next(skb, &state);
962 * Put this fragment into the sending queue.
964 frag->tstamp = tstamp;
965 err = output(net, sk, frag);
969 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
970 IPSTATS_MIB_FRAGCREATES);
972 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
973 IPSTATS_MIB_FRAGOKS);
978 if (skb->sk && dst_allfrag(skb_dst(skb)))
979 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
981 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
985 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
986 IPSTATS_MIB_FRAGFAILS);
/*
 * Returns non-zero when neither test vouches for fl_addr: the route key is
 * not a /128 equal to fl_addr, and there is either no cached address or the
 * cached address differs from fl_addr.
 */
991 static inline int ip6_rt_check(const struct rt6key *rt_key,
992 const struct in6_addr *fl_addr,
993 const struct in6_addr *addr_cache)
995 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
996 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
/*
 * Validate a socket's cached dst against a flow. Visible checks: the dst must
 * belong to AF_INET6, ip6_rt_check() must accept the destination (and, with
 * CONFIG_IPV6_SUBTREES, the source) address, and a non-zero flowi6_oif must
 * equal the dst device's ifindex unless FLOWI_FLAG_SKIP_NH_OIF is set. What
 * happens on a failed check is not visible in this copy.
 */
999 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1000 struct dst_entry *dst,
1001 const struct flowi6 *fl6)
1003 struct ipv6_pinfo *np = inet6_sk(sk);
1004 struct rt6_info *rt;
1009 if (dst->ops->family != AF_INET6) {
1014 rt = (struct rt6_info *)dst;
1015 /* Yes, checking route validity in not connected
1016 * case is not very simple. Take into account,
1017 * that we do not support routing by source, TOS,
1018 * and MSG_DONTROUTE --ANK (980726)
1020 * 1. ip6_rt_check(): If route was host route,
1021 * check that cached destination is current.
1022 * If it is network route, we still may
1023 * check its validity using saved pointer
1024 * to the last used address: daddr_cache.
1025 * We do not want to save whole address now,
1026 * (because main consumer of this service
1027 * is tcp, which has not this problem),
1028 * so that the last trick works only on connected
1030 * 2. oif also should be the same.
1032 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1033 #ifdef CONFIG_IPV6_SUBTREES
1034 ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1036 (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
1037 (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
/*
 * Core route lookup used by ip6_dst_lookup() and ip6_dst_lookup_flow().
 *
 * When the flow has no source address, a first ip6_route_output() supplies
 * the route whose "from" entry is given to ip6_route_get_saddr(); an
 * erroneous first result is handled separately before the lookup via
 * ip6_route_output_flags(). With CONFIG_IPV6_OPTIMISTIC_DAD the neighbour of
 * the next hop is inspected and, for a source address flagged
 * IFA_F_OPTIMISTIC, a lookup with a zeroed destination is made instead. A
 * v4-mapped source combined with a destination that is neither v4-mapped nor
 * unspecified yields -EAFNOSUPPORT. -ENETUNREACH is counted as
 * IPSTATS_MIB_OUTNOROUTES.
 */
1046 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1047 struct dst_entry **dst, struct flowi6 *fl6)
1049 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1050 struct neighbour *n;
1051 struct rt6_info *rt;
1056 /* The correct way to handle this would be to do
1057 * ip6_route_get_saddr, and then ip6_route_output; however,
1058 * the route-specific preferred source forces the
1059 * ip6_route_output call _before_ ip6_route_get_saddr.
1061 * In source specific routing (no src=any default route),
1062 * ip6_route_output will fail given src=any saddr, though, so
1063 * that's why we try it again later.
1065 if (ipv6_addr_any(&fl6->saddr)) {
1066 struct fib6_info *from;
1067 struct rt6_info *rt;
1069 *dst = ip6_route_output(net, sk, fl6);
1070 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
1073 from = rt ? rcu_dereference(rt->from) : NULL;
1074 err = ip6_route_get_saddr(net, from, &fl6->daddr,
1075 sk ? inet6_sk(sk)->srcprefs : 0,
1080 goto out_err_release;
1082 /* If we had an erroneous initial result, pretend it
1083 * never existed and let the SA-enabled version take
1086 if ((*dst)->error) {
1091 if (fl6->flowi6_oif)
1092 flags |= RT6_LOOKUP_F_IFACE;
1096 *dst = ip6_route_output_flags(net, sk, fl6, flags);
1098 err = (*dst)->error;
1100 goto out_err_release;
1102 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1104 * Here if the dst entry we've looked up
1105 * has a neighbour entry that is in the INCOMPLETE
1106 * state and the src address from the flow is
1107 * marked as OPTIMISTIC, we release the found
1108 * dst entry and replace it instead with the
1109 * dst entry of the nexthop router
1111 rt = (struct rt6_info *) *dst;
1113 n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1114 rt6_nexthop(rt, &fl6->daddr));
1115 err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
1116 rcu_read_unlock_bh();
1119 struct inet6_ifaddr *ifp;
1120 struct flowi6 fl_gw6;
1123 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1126 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1132 * We need to get the dst entry for the
1133 * default router instead
1136 memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1137 memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1138 *dst = ip6_route_output(net, sk, &fl_gw6);
1139 err = (*dst)->error;
1141 goto out_err_release;
1145 if (ipv6_addr_v4mapped(&fl6->saddr) &&
1146 !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1147 err = -EAFNOSUPPORT;
1148 goto out_err_release;
1157 if (err == -ENETUNREACH)
1158 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
/*
 * Exported (GPL) wrapper that forwards its arguments unchanged to
 * ip6_dst_lookup_tail() and returns its result.
 */
1163 * ip6_dst_lookup - perform route lookup on flow
1164 * @net: Network namespace to perform lookup in
1165 * @sk: socket which provides route info
1166 * @dst: pointer to dst_entry * for result
1167 * @fl6: flow to lookup
1169 * This function performs a route lookup on the given flow.
1171 * It returns zero on success, or a standard errno code on error.
1173 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1177 return ip6_dst_lookup_tail(net, sk, dst, fl6);
1179 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
/*
 * Exported (GPL): route lookup followed by xfrm_lookup_route(). An error from
 * ip6_dst_lookup_tail() is returned as ERR_PTR(err); fl6->daddr is replaced
 * by *final_dst before the xfrm lookup (the guarding condition, if any, is
 * not visible in this copy).
 */
1182 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1183 * @net: Network namespace to perform lookup in
1184 * @sk: socket which provides route info
1185 * @fl6: flow to lookup
1186 * @final_dst: final destination address for ipsec lookup
1188 * This function performs a route lookup on the given flow.
1190 * It returns a valid dst pointer on success, or a pointer encoded
1193 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1194 const struct in6_addr *final_dst)
1196 struct dst_entry *dst = NULL;
1199 err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1201 return ERR_PTR(err);
1203 fl6->daddr = *final_dst;
1205 return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1207 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
/*
 * Exported (GPL): first tries the dst cached in the socket (sk_dst_check()
 * with the socket's dst_cookie, then ip6_sk_dst_check()); the visible
 * fallback is ip6_dst_lookup_flow(). For a connected socket a successful
 * fresh lookup is stored back with ip6_sk_dst_store_flow().
 */
1210 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1211 * @sk: socket which provides the dst cache and route info
1212 * @fl6: flow to lookup
1213 * @final_dst: final destination address for ipsec lookup
1214 * @connected: whether @sk is connected or not
1216 * This function performs a route lookup on the given flow with the
1217 * possibility of using the cached route in the socket if it is valid.
1218 * It will take the socket dst lock when operating on the dst cache.
1219 * As a result, this function can only be used in process context.
1221 * In addition, for a connected socket, cache the dst in the socket
1222 * if the current cache is not valid.
1224 * It returns a valid dst pointer on success, or a pointer encoded
1227 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1228 const struct in6_addr *final_dst,
1231 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1233 dst = ip6_sk_dst_check(sk, dst, fl6);
1237 dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1238 if (connected && !IS_ERR(dst))
1239 ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1243 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
/*
 * Exported (GPL): route lookup for an IPv6 tunnel. Builds a zeroed flowi6
 * from the skb mark, the protocol and the tunnel key (addresses and a flow
 * label derived from key.tos), reads and updates the tunnel's dst_cache under
 * CONFIG_DST_CACHE, returns -ENETUNREACH on the "no route" path and -ELOOP
 * when the route's device is the tunnel device itself.
 */
1246 * ip6_dst_lookup_tunnel - perform route lookup on tunnel
1247 * @skb: Packet for which lookup is done
1248 * @dev: Tunnel device
1249 * @net: Network namespace of tunnel device
1250 * @sock: Socket which provides route info
1251 * @saddr: Memory to store the src ip address
1252 * @info: Tunnel information
1253 * @protocol: IP protocol
1254 * @use_cache: Flag to enable cache usage
1255 * This function performs a route lookup on a tunnel
1257 * It returns a valid dst pointer and stores src address to be used in
1258 * tunnel in param saddr on success, else a pointer encoded error code.
1261 struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
1262 struct net_device *dev,
1264 struct socket *sock,
1265 struct in6_addr *saddr,
1266 const struct ip_tunnel_info *info,
1270 struct dst_entry *dst = NULL;
1271 #ifdef CONFIG_DST_CACHE
1272 struct dst_cache *dst_cache;
1277 #ifdef CONFIG_DST_CACHE
1278 dst_cache = (struct dst_cache *)&info->dst_cache;
1280 dst = dst_cache_get_ip6(dst_cache, saddr);
1285 memset(&fl6, 0, sizeof(fl6));
1286 fl6.flowi6_mark = skb->mark;
1287 fl6.flowi6_proto = protocol;
1288 fl6.daddr = info->key.u.ipv6.dst;
1289 fl6.saddr = info->key.u.ipv6.src;
1290 prio = info->key.tos;
1291 fl6.flowlabel = ip6_make_flowinfo(RT_TOS(prio),
1294 dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
1297 netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
1298 return ERR_PTR(-ENETUNREACH);
1300 if (dst->dev == dev) { /* is this necessary? */
1301 netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
1303 return ERR_PTR(-ELOOP);
1305 #ifdef CONFIG_DST_CACHE
1307 dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
1312 EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
/*
 * Duplicate an option header with kmemdup(), copying (src->hdrlen + 1) * 8
 * bytes; a NULL src yields NULL.
 */
1314 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1317 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
/*
 * Duplicate a routing header with kmemdup(), copying (src->hdrlen + 1) * 8
 * bytes; a NULL src yields NULL.
 */
1320 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1323 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
/*
 * Recompute *mtu and *maxfraglen while appending data. The visible logic
 * applies when the dst is not flagged DST_XFRM_TUNNEL: the first-fragment
 * branch sets *mtu to orig_mtu minus the dst header_len, and *maxfraglen is
 * (*mtu - fragheaderlen) rounded down to a multiple of 8, plus
 * fragheaderlen, minus sizeof(struct frag_hdr).
 */
1326 static void ip6_append_data_mtu(unsigned int *mtu,
1328 unsigned int fragheaderlen,
1329 struct sk_buff *skb,
1330 struct rt6_info *rt,
1331 unsigned int orig_mtu)
1333 if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1335 /* first fragment, reserve header_len */
1336 *mtu = orig_mtu - rt->dst.header_len;
1340 * this fragment is not first, the headers
1341 * space is regarded as data space.
1345 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1346 + fragheaderlen - sizeof(struct frag_hdr);
/*
 * Initialise the cork state for a new corked send: allocates v6_cork->opt
 * (kzalloc) and fills it with duplicates of the caller's dst0opt, dst1opt,
 * hopopt and srcrt headers, stores the dst, flow, hop limit and traffic
 * class, and derives the fragment size from the device MTU or a dst MTU
 * (depending on pmtudisc and DST_XFRM_TUNNEL), lowered to the socket's
 * frag_size when that is smaller. GSO size, mark, tx timestamp flags and
 * transmit time are taken from ipc6; IPCORK_ALLFRAG is set when the path dst
 * has dst_allfrag().
 */
1350 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1351 struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1352 struct rt6_info *rt, struct flowi6 *fl6)
1354 struct ipv6_pinfo *np = inet6_sk(sk);
1356 struct ipv6_txoptions *opt = ipc6->opt;
1362 if (WARN_ON(v6_cork->opt))
1365 v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1366 if (unlikely(!v6_cork->opt))
1369 v6_cork->opt->tot_len = sizeof(*opt);
1370 v6_cork->opt->opt_flen = opt->opt_flen;
1371 v6_cork->opt->opt_nflen = opt->opt_nflen;
1373 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1375 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1378 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1380 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1383 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1385 if (opt->hopopt && !v6_cork->opt->hopopt)
1388 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1390 if (opt->srcrt && !v6_cork->opt->srcrt)
1393 /* need source address above miyazawa*/
1396 cork->base.dst = &rt->dst;
1397 cork->fl.u.ip6 = *fl6;
1398 v6_cork->hop_limit = ipc6->hlimit;
1399 v6_cork->tclass = ipc6->tclass;
1400 if (rt->dst.flags & DST_XFRM_TUNNEL)
1401 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1402 READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1404 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1405 READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1406 if (np->frag_size < mtu) {
1408 mtu = np->frag_size;
1410 if (mtu < IPV6_MIN_MTU)
1412 cork->base.fragsize = mtu;
1413 cork->base.gso_size = ipc6->gso_size;
1414 cork->base.tx_flags = 0;
1415 cork->base.mark = ipc6->sockc.mark;
1416 sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1418 if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1419 cork->base.flags |= IPCORK_ALLFRAG;
1420 cork->base.length = 0;
1422 cork->base.transmit_time = ipc6->sockc.transmit_time;
/*
 * __ip6_append_data - append data fetched through getfrag to the skbs
 * pending on queue.
 *
 * The function derives the per-fragment limits (fragheaderlen, maxfraglen,
 * headersize) from the cork MTU, the route header lengths and the option
 * lengths, compares the total against the allowed size and then loops while
 * data remains.  In each round it either extends the tail skb or allocates
 * a new skb and queues it.  Payload is copied into the linear area, into a
 * page fragment taken from pfrag, or handed to skb_zerocopy_iter_dgram(),
 * depending on the device features and the zerocopy state.
 *
 * cork->length is increased by length before the loop and decreased again
 * in the trailing error handling, which also counts an output discard.
 *
 * NOTE(review): several statements of this function (declarations, returns,
 * labels, some branches) are not visible in this excerpt; the description
 * covers the visible lines only.
 */
1427 static int __ip6_append_data(struct sock *sk,
1429 struct sk_buff_head *queue,
1430 struct inet_cork *cork,
1431 struct inet6_cork *v6_cork,
1432 struct page_frag *pfrag,
1433 int getfrag(void *from, char *to, int offset,
1434 int len, int odd, struct sk_buff *skb),
1435 void *from, int length, int transhdrlen,
1436 unsigned int flags, struct ipcm6_cookie *ipc6)
1438 struct sk_buff *skb, *skb_prev = NULL;
1439 unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1440 struct ubuf_info *uarg = NULL;
1442 int dst_exthdrlen = 0;
1448 struct rt6_info *rt = (struct rt6_info *)cork->dst;
1449 struct ipv6_txoptions *opt = v6_cork->opt;
1450 int csummode = CHECKSUM_NONE;
1451 unsigned int maxnonfragsize, headersize;
1452 unsigned int wmem_alloc_delta = 0;
1453 bool paged, extra_uref = false;
1455 skb = skb_peek_tail(queue);
1457 exthdrlen = opt ? opt->opt_flen : 0;
1458 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
/* With a GSO size set, paged data is used and the limit is IP6_MAX_MTU instead of the cork fragment size. */
1461 paged = !!cork->gso_size;
1462 mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1465 if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
1466 sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1467 tskey = sk->sk_tskey++;
1469 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1471 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1472 (opt ? opt->opt_nflen : 0);
/*
 * maxfraglen: the space after fragheaderlen is rounded down to a multiple
 * of 8 bytes and room for one fragment header is subtracted.
 */
1473 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1474 sizeof(struct frag_hdr);
1476 headersize = sizeof(struct ipv6hdr) +
1477 (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1478 (dst_allfrag(&rt->dst) ?
1479 sizeof(struct frag_hdr) : 0) +
1480 rt->rt6i_nfheader_len;
1482 /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1483 * the first fragment
1485 if (headersize + transhdrlen > mtu)
1488 if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1489 (sk->sk_protocol == IPPROTO_UDP ||
1490 sk->sk_protocol == IPPROTO_RAW)) {
1491 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1492 sizeof(struct ipv6hdr));
1496 if (ip6_sk_ignore_df(sk))
1497 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1499 maxnonfragsize = mtu;
1501 if (cork->length + length > maxnonfragsize - headersize) {
1503 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1504 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1508 /* CHECKSUM_PARTIAL only with no extension headers and when
1509 * we are not going to fragment
1511 if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1512 headersize == sizeof(struct ipv6hdr) &&
1513 length <= mtu - headersize &&
1514 (!(flags & MSG_MORE) || cork->gso_size) &&
1515 rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1516 csummode = CHECKSUM_PARTIAL;
1518 if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
1519 uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
1522 extra_uref = !skb_zcopy(skb); /* only ref on new uarg */
1523 if (rt->dst.dev->features & NETIF_F_SG &&
1524 csummode == CHECKSUM_PARTIAL) {
1528 skb_zcopy_set(skb, uarg, &extra_uref);
1533 * Let's try using as much space as possible.
1534 * Use MTU if total length of the message fits into the MTU.
1535 * Otherwise, we need to reserve fragment header and
1536 * fragment alignment (= 8-15 octects, in total).
1538 * Note that we may need to "move" the data from the tail
1539 * of the buffer to the new fragment when we split
1542 * FIXME: It may be fragmented into multiple chunks
1543 * at once if non-fragmentable extension headers
1548 cork->length += length;
1552 while (length > 0) {
1553 /* Check if the remaining data fits into current packet. */
1554 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1556 copy = maxfraglen - skb->len;
1560 unsigned int datalen;
1561 unsigned int fraglen;
1562 unsigned int fraggap;
1563 unsigned int alloclen, alloc_extra;
1564 unsigned int pagedlen;
1566 /* There's no room in the current skb */
1568 fraggap = skb->len - maxfraglen;
1571 /* update mtu and maxfraglen if necessary */
1572 if (!skb || !skb_prev)
1573 ip6_append_data_mtu(&mtu, &maxfraglen,
1574 fragheaderlen, skb, rt,
1580 * If remaining data exceeds the mtu,
1581 * we know we need more fragment(s).
1583 datalen = length + fraggap;
1585 if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1586 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1587 fraglen = datalen + fragheaderlen;
1590 alloc_extra = hh_len;
1591 alloc_extra += dst_exthdrlen;
1592 alloc_extra += rt->dst.trailer_len;
1594 /* We just reserve space for fragment header.
1595 * Note: this may be overallocation if the message
1596 * (without MSG_MORE) fits into the MTU.
1598 alloc_extra += sizeof(struct frag_hdr);
1600 if ((flags & MSG_MORE) &&
1601 !(rt->dst.dev->features&NETIF_F_SG))
1604 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1605 !(rt->dst.dev->features & NETIF_F_SG)))
1608 alloclen = min_t(int, fraglen, MAX_HEADER);
1609 pagedlen = fraglen - alloclen;
1611 alloclen += alloc_extra;
1613 if (datalen != length + fraggap) {
1615 * this is not the last fragment, the trailer
1616 * space is regarded as data space.
1618 datalen += rt->dst.trailer_len;
1621 fraglen = datalen + fragheaderlen;
1623 copy = datalen - transhdrlen - fraggap - pagedlen;
1629 skb = sock_alloc_send_skb(sk, alloclen,
1630 (flags & MSG_DONTWAIT), &err);
1633 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1635 skb = alloc_skb(alloclen,
1643 * Fill in the control structures
1645 skb->protocol = htons(ETH_P_IPV6);
1646 skb->ip_summed = csummode;
1648 /* reserve for fragmentation and ipsec header */
1649 skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1653 * Find where to start putting bytes
1655 data = skb_put(skb, fraglen - pagedlen);
1656 skb_set_network_header(skb, exthdrlen);
1657 data += fragheaderlen;
1658 skb->transport_header = (skb->network_header +
1661 skb->csum = skb_copy_and_csum_bits(
1662 skb_prev, maxfraglen,
1663 data + transhdrlen, fraggap);
1664 skb_prev->csum = csum_sub(skb_prev->csum,
1667 pskb_trim_unique(skb_prev, maxfraglen);
1670 getfrag(from, data + transhdrlen, offset,
1671 copy, fraggap, skb) < 0) {
1678 length -= copy + transhdrlen;
1683 /* Only the initial fragment is time stamped */
1684 skb_shinfo(skb)->tx_flags = cork->tx_flags;
1686 skb_shinfo(skb)->tskey = tskey;
1688 skb_zcopy_set(skb, uarg, &extra_uref);
1690 if ((flags & MSG_CONFIRM) && !skb_prev)
1691 skb_set_dst_pending_confirm(skb, 1);
1694 * Put the packet on the pending queue
/*
 * An skb without a destructor gets sock_wfree and its truesize is added to
 * wmem_alloc_delta; the accumulated delta is applied to sk_wmem_alloc
 * further down with refcount_add().
 */
1696 if (!skb->destructor) {
1697 skb->destructor = sock_wfree;
1699 wmem_alloc_delta += skb->truesize;
1701 __skb_queue_tail(queue, skb);
1708 if (!(rt->dst.dev->features&NETIF_F_SG) &&
1709 skb_tailroom(skb) >= copy) {
1713 if (getfrag(from, skb_put(skb, copy),
1714 offset, copy, off, skb) < 0) {
1715 __skb_trim(skb, off);
1719 } else if (!uarg || !uarg->zerocopy) {
1720 int i = skb_shinfo(skb)->nr_frags;
1723 if (!sk_page_frag_refill(sk, pfrag))
1726 if (!skb_can_coalesce(skb, i, pfrag->page,
1729 if (i == MAX_SKB_FRAGS)
1732 __skb_fill_page_desc(skb, i, pfrag->page,
1734 skb_shinfo(skb)->nr_frags = ++i;
1735 get_page(pfrag->page);
1737 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1739 page_address(pfrag->page) + pfrag->offset,
1740 offset, copy, skb->len, skb) < 0)
1743 pfrag->offset += copy;
1744 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1746 skb->data_len += copy;
1747 skb->truesize += copy;
1748 wmem_alloc_delta += copy;
1750 err = skb_zerocopy_iter_dgram(skb, from, copy);
/* Apply the write-memory charge collected during the loop in one update. */
1758 if (wmem_alloc_delta)
1759 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
/*
 * Trailing error handling: abort the zerocopy reference, take the bytes of
 * this call out of cork->length again, count an output discard and still
 * apply the collected write-memory charge.
 */
1765 net_zcopy_put_abort(uarg, extra_uref);
1766 cork->length -= length;
1767 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1768 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
/*
 * ip6_append_data - append data to the pending frames on the socket's own
 * write queue (sk->sk_write_queue).
 *
 * When the write queue is empty the cork kept in the socket is set up with
 * ip6_setup_cork(), and the opt_flen of the supplied options is added to
 * both length and transhdrlen.  The work itself is delegated to
 * __ip6_append_data(), using sk_page_frag(sk) as page fragment source.
 *
 * NOTE(review): the branch that redirects fl6 to the flow saved in the cork
 * and the handling of MSG_PROBE are only partly visible in this excerpt.
 */
1772 int ip6_append_data(struct sock *sk,
1773 int getfrag(void *from, char *to, int offset, int len,
1774 int odd, struct sk_buff *skb),
1775 void *from, int length, int transhdrlen,
1776 struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1777 struct rt6_info *rt, unsigned int flags)
1779 struct inet_sock *inet = inet_sk(sk);
1780 struct ipv6_pinfo *np = inet6_sk(sk);
1784 if (flags&MSG_PROBE)
1786 if (skb_queue_empty(&sk->sk_write_queue)) {
1790 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1795 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1796 length += exthdrlen;
1797 transhdrlen += exthdrlen;
1799 fl6 = &inet->cork.fl.u.ip6;
1803 return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1804 &np->cork, sk_page_frag(sk), getfrag,
1805 from, length, transhdrlen, flags, ipc6);
1807 EXPORT_SYMBOL_GPL(ip6_append_data);
/*
 * ip6_cork_release - release the resources held by a cork.
 *
 * Frees the option headers stored in v6_cork->opt (dst0opt, dst1opt, hopopt,
 * srcrt) and the option block itself, then clears v6_cork->opt.  If a dst is
 * attached to the cork its reference is dropped, the pointer is cleared and
 * IPCORK_ALLFRAG is removed from the cork flags.  Finally the saved flow is
 * zeroed.
 */
1809 static void ip6_cork_release(struct inet_cork_full *cork,
1810 struct inet6_cork *v6_cork)
1813 kfree(v6_cork->opt->dst0opt);
1814 kfree(v6_cork->opt->dst1opt);
1815 kfree(v6_cork->opt->hopopt);
1816 kfree(v6_cork->opt->srcrt);
1817 kfree(v6_cork->opt);
1818 v6_cork->opt = NULL;
1821 if (cork->base.dst) {
1822 dst_release(cork->base.dst);
1823 cork->base.dst = NULL;
1824 cork->base.flags &= ~IPCORK_ALLFRAG;
1826 memset(&cork->fl, 0, sizeof(cork->fl));
/*
 * __ip6_make_skb - turn the skbs pending on queue into one packet.
 *
 * The first skb dequeued becomes the head; every further skb is linked into
 * the head's frag_list while len, data_len and truesize of the head are
 * increased accordingly.  Extension headers from the corked options and the
 * fixed IPv6 header are then pushed in front of the data, and the header
 * fields are filled from the cork and the saved flow.  Priority, mark,
 * timestamp and dst are set on the skb, output statistics are updated and
 * the cork is released.
 *
 * NOTE(review): the handling of an empty queue and the return statement are
 * not visible in this excerpt.
 */
1829 struct sk_buff *__ip6_make_skb(struct sock *sk,
1830 struct sk_buff_head *queue,
1831 struct inet_cork_full *cork,
1832 struct inet6_cork *v6_cork)
1834 struct sk_buff *skb, *tmp_skb;
1835 struct sk_buff **tail_skb;
1836 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1837 struct ipv6_pinfo *np = inet6_sk(sk);
1838 struct net *net = sock_net(sk);
1839 struct ipv6hdr *hdr;
1840 struct ipv6_txoptions *opt = v6_cork->opt;
1841 struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1842 struct flowi6 *fl6 = &cork->fl.u.ip6;
1843 unsigned char proto = fl6->flowi6_proto;
1845 skb = __skb_dequeue(queue);
1848 tail_skb = &(skb_shinfo(skb)->frag_list);
1850 /* move skb->data to ip header from ext header */
1851 if (skb->data < skb_network_header(skb))
1852 __skb_pull(skb, skb_network_offset(skb));
/*
 * Chain the remaining skbs behind the head: each one has the network header
 * length pulled off, is appended through tail_skb and loses its destructor,
 * while its sizes are accounted on the head skb.
 */
1853 while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1854 __skb_pull(tmp_skb, skb_network_header_len(skb));
1855 *tail_skb = tmp_skb;
1856 tail_skb = &(tmp_skb->next);
1857 skb->len += tmp_skb->len;
1858 skb->data_len += tmp_skb->len;
1859 skb->truesize += tmp_skb->truesize;
1860 tmp_skb->destructor = NULL;
1864 /* Allow local fragmentation. */
1865 skb->ignore_df = ip6_sk_ignore_df(sk);
1867 *final_dst = fl6->daddr;
1868 __skb_pull(skb, skb_network_header_len(skb));
1869 if (opt && opt->opt_flen)
1870 ipv6_push_frag_opts(skb, opt, &proto);
1871 if (opt && opt->opt_nflen)
1872 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
/* Prepend the fixed IPv6 header and fill it in. */
1874 skb_push(skb, sizeof(struct ipv6hdr));
1875 skb_reset_network_header(skb);
1876 hdr = ipv6_hdr(skb);
1878 ip6_flow_hdr(hdr, v6_cork->tclass,
1879 ip6_make_flowlabel(net, skb, fl6->flowlabel,
1880 ip6_autoflowlabel(net, np), fl6));
1881 hdr->hop_limit = v6_cork->hop_limit;
1882 hdr->nexthdr = proto;
1883 hdr->saddr = fl6->saddr;
1884 hdr->daddr = *final_dst;
1886 skb->priority = sk->sk_priority;
1887 skb->mark = cork->base.mark;
1889 skb->tstamp = cork->base.transmit_time;
/* The skb takes its own dst reference; the one held by the cork is dropped in ip6_cork_release(). */
1891 skb_dst_set(skb, dst_clone(&rt->dst));
1892 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1893 if (proto == IPPROTO_ICMPV6) {
1894 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1896 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1897 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1900 ip6_cork_release(cork, v6_cork);
/*
 * ip6_send_skb - transmit a finished skb through ip6_local_out().
 *
 * The network namespace is taken from the skb's socket and the route from
 * the skb's dst.  The visible code converts the result with
 * net_xmit_errno() and counts an output discard against the route's
 * inet6_dev.
 *
 * NOTE(review): the conditions guarding those two steps and the return
 * statement are not visible in this excerpt.
 */
1905 int ip6_send_skb(struct sk_buff *skb)
1907 struct net *net = sock_net(skb->sk);
1908 struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1911 err = ip6_local_out(net, skb->sk, skb);
1914 err = net_xmit_errno(err);
1916 IP6_INC_STATS(net, rt->rt6i_idev,
1917 IPSTATS_MIB_OUTDISCARDS);
/*
 * ip6_push_pending_frames - build the packet from the socket's pending
 * frames with ip6_finish_skb() and pass it to ip6_send_skb().
 *
 * NOTE(review): the handling of a NULL result from ip6_finish_skb() is not
 * visible in this excerpt.
 */
1923 int ip6_push_pending_frames(struct sock *sk)
1925 struct sk_buff *skb;
1927 skb = ip6_finish_skb(sk);
1931 return ip6_send_skb(skb);
1933 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
/*
 * __ip6_flush_pending_frames - discard everything still pending on queue.
 *
 * Skbs are removed from the tail of the queue one by one and an output
 * discard is counted against the inet6_dev of each skb's dst.  Afterwards
 * the cork is released with ip6_cork_release().
 *
 * NOTE(review): the statement that frees each skb is not visible in this
 * excerpt.
 */
1935 static void __ip6_flush_pending_frames(struct sock *sk,
1936 struct sk_buff_head *queue,
1937 struct inet_cork_full *cork,
1938 struct inet6_cork *v6_cork)
1940 struct sk_buff *skb;
1942 while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1944 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1945 IPSTATS_MIB_OUTDISCARDS);
1949 ip6_cork_release(cork, v6_cork);
/*
 * ip6_flush_pending_frames - discard the frames pending on the socket.
 *
 * Wrapper around __ip6_flush_pending_frames() that operates on the socket's
 * own write queue and on the corks stored in inet_sk(sk) and inet6_sk(sk).
 */
1952 void ip6_flush_pending_frames(struct sock *sk)
1954 __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1955 &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1957 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
/*
 * ip6_make_skb - build one packet without using the socket write queue.
 *
 * A queue and an inet6_cork local to this function are used together with
 * the cork supplied by the caller.  The cork base fields are reset, the cork
 * is set up with ip6_setup_cork(), and the data is appended with
 * __ip6_append_data(), where opt_flen of the supplied options is added to
 * both length and transhdrlen and the page fragment source is the current
 * task's task_frag.  The result of __ip6_make_skb() is returned.
 *
 * The error paths visible here release the cork (after ip6_setup_cork()) or
 * flush the local queue (after __ip6_append_data()) and return ERR_PTR(err).
 *
 * NOTE(review): the guards around the error paths and the value returned
 * for MSG_PROBE are not visible in this excerpt.
 */
1959 struct sk_buff *ip6_make_skb(struct sock *sk,
1960 int getfrag(void *from, char *to, int offset,
1961 int len, int odd, struct sk_buff *skb),
1962 void *from, int length, int transhdrlen,
1963 struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1964 struct rt6_info *rt, unsigned int flags,
1965 struct inet_cork_full *cork)
1967 struct inet6_cork v6_cork;
1968 struct sk_buff_head queue;
1969 int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1972 if (flags & MSG_PROBE)
1975 __skb_queue_head_init(&queue);
1977 cork->base.flags = 0;
1978 cork->base.addr = 0;
1979 cork->base.opt = NULL;
1980 cork->base.dst = NULL;
1982 err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
1984 ip6_cork_release(cork, &v6_cork);
1985 return ERR_PTR(err);
/* A negative dontfrag in the cookie means: use the socket setting. */
1987 if (ipc6->dontfrag < 0)
1988 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1990 err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
1991 &current->task_frag, getfrag, from,
1992 length + exthdrlen, transhdrlen + exthdrlen,
1995 __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
1996 return ERR_PTR(err);
1999 return __ip6_make_skb(sk, &queue, cork, &v6_cork);