2 * IPv6 output functions
3 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * Based on linux/net/ipv4/ip_output.c
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
16 * A.N.Kuznetsov : arithmetics in fragmentation.
17 * extension headers are implemented.
18 * route changes now work.
19 * ip6_forward does not confuse sniffers.
22 * H. von Brand : Added missing #include <linux/string.h>
23 * Imran Patel : frag id should be in NBO
24 * Kazunori MIYAZAWA @USAGI
25 * : add ip6_append_data and related functions
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
/*
 * Forward declaration: ip6_fragment() is defined further down in this file
 * but is already referenced by ip6_finish_output(), which passes
 * ip6_finish_output2 to it as the output routine.
 */
59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
/*
 * __ip6_local_out - fill in the payload length and run the LOCAL_OUT hook
 * @skb: locally generated packet; skb_dst(skb) is dereferenced for the device
 *
 * The payload length is skb->len minus the fixed IPv6 header size and is
 * stored in the header in network byte order.  Lengths above IPV6_MAXPLEN
 * are special-cased, but the guarded statement is not visible in this
 * listing.  The packet is then passed to nf_hook() for NF_INET_LOCAL_OUT
 * with dst_output as the callback argument, and the nf_hook() result is
 * returned unchanged.
 */
61 int __ip6_local_out(struct sk_buff *skb)
65 len = skb->len - sizeof(struct ipv6hdr);
66 if (len > IPV6_MAXPLEN)
68 ipv6_hdr(skb)->payload_len = htons(len);
70 return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71 skb_dst(skb)->dev, dst_output);
/*
 * ip6_local_out - send a locally generated IPv6 packet
 * @skb: packet to send
 *
 * Calls __ip6_local_out() and, on a path whose condition is not visible in
 * this listing, dst_output().  Both results are stored in err.
 * NOTE(review): presumably dst_output() only runs when the hook lets the
 * packet through, and err is the return value -- confirm against the full
 * source.
 */
74 int ip6_local_out(struct sk_buff *skb)
78 err = __ip6_local_out(skb);
80 err = dst_output(skb);
84 EXPORT_SYMBOL_GPL(ip6_local_out);
/*
 * ip6_finish_output2 - last IPv6 step before the neighbour layer
 * @skb: packet with a route attached (skb_dst() is dereferenced)
 *
 * Marks the skb as ETH_P_IPV6.  For a multicast destination a clone of the
 * skb is passed to NF_HOOK() for NF_INET_POST_ROUTING when the device is
 * not a loopback device, sk_mc_loop() is true for the sending socket, and
 * either mroute6_socket() is true for a packet not flagged
 * IP6SKB_FORWARDED or ipv6_chk_mcast_addr() matches the
 * destination/source pair.  A multicast packet with hop limit 0 is counted
 * as IPSTATS_MIB_OUTDISCARDS, and multicast output is accounted under
 * IPSTATS_MIB_OUTMCAST.
 *
 * The packet is then handed to dst_neigh_output(); another path, whose
 * condition is not visible in this listing, counts
 * IPSTATS_MIB_OUTNOROUTES instead.
 * NOTE(review): neigh and rt are never declared or assigned completely in
 * the visible lines; the neighbour lookup and the failure handling must be
 * on lines missing from this listing.
 */
86 static int ip6_finish_output2(struct sk_buff *skb)
88 struct dst_entry *dst = skb_dst(skb);
89 struct net_device *dev = dst->dev;
90 struct neighbour *neigh;
93 skb->protocol = htons(ETH_P_IPV6);
96 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
97 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
99 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
100 ((mroute6_socket(dev_net(dev), skb) &&
101 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
102 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
103 &ipv6_hdr(skb)->saddr))) {
104 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
106 /* Do not check for IFF_ALLMULTI; multicast routing
107 is not supported in any case.
110 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
111 newskb, NULL, newskb->dev,
114 if (ipv6_hdr(skb)->hop_limit == 0) {
115 IP6_INC_STATS(dev_net(dev), idev,
116 IPSTATS_MIB_OUTDISCARDS);
122 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
126 rt = (struct rt6_info *) dst;
129 return dst_neigh_output(dst, neigh, skb);
131 IP6_INC_STATS_BH(dev_net(dst->dev),
132 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
/*
 * ip6_finish_output - transmit directly or fragment first
 * @skb: packet to send
 *
 * The packet goes through ip6_fragment(), with ip6_finish_output2 as the
 * output routine, when it is longer than ip6_skb_dst_mtu() and is not a
 * GSO packet, or when dst_allfrag() is true for its route.  Otherwise it
 * is passed straight to ip6_finish_output2().  The result of whichever
 * routine is called is returned.
 */
137 static int ip6_finish_output(struct sk_buff *skb)
139 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
140 dst_allfrag(skb_dst(skb)))
141 return ip6_fragment(skb, ip6_finish_output2);
143 return ip6_finish_output2(skb);
/*
 * ip6_output - output entry point that runs the POST_ROUTING hook
 * @skb: packet with a route attached
 *
 * When IPv6 is disabled on the outgoing interface (idev->cnf.disable_ipv6)
 * the packet is counted as IPSTATS_MIB_OUTDISCARDS; what happens to the skb
 * afterwards is not visible in this listing.  Otherwise the packet is
 * passed to NF_HOOK_COND() for NF_INET_POST_ROUTING, with a final argument
 * that is true only while IP6SKB_REROUTED is not set in IP6CB(skb)->flags.
 * NOTE(review): the callback argument of NF_HOOK_COND() is on a line
 * missing from this listing; presumably it is ip6_finish_output -- confirm.
 */
146 int ip6_output(struct sk_buff *skb)
148 struct net_device *dev = skb_dst(skb)->dev;
149 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
150 if (unlikely(idev->cnf.disable_ipv6)) {
151 IP6_INC_STATS(dev_net(dev), idev,
152 IPSTATS_MIB_OUTDISCARDS);
157 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
159 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
163 * xmit an sk_buff (used by TCP, SCTP and DCCP)
/*
 * ip6_xmit - build the IPv6 header for a transport packet and send it
 * @sk: sending socket; supplies hop limit, priority and mark
 * @skb: packet holding the transport payload, with a route attached
 * @fl6: flow providing addresses, flow label and protocol
 * @opt: extension header options (opt_nflen and opt_flen are read)
 * @tclass: traffic class, shifted left by 20 into the first header word
 *
 * Extension headers can need more headroom than the skb has, so the
 * required room (option lengths + IPv6 header + LL_RESERVED_SPACE()) is
 * computed and skb_realloc_headroom() is used when the skb falls short;
 * IPSTATS_MIB_OUTDISCARDS is counted on that path.  Fragmentable and
 * non-fragmentable options are pushed, followed by the IPv6 header.  The
 * first header word combines 0x60000000, tclass and fl6->flowlabel;
 * payload_len is seg_len, which starts as skb->len and grows by the option
 * lengths.  The destination written to the header is *first_hop, which
 * ipv6_push_nfrag_opts() is given the chance to change.
 *
 * A packet that fits the mtu, has local_df set, or is GSO is counted under
 * IPSTATS_MIB_OUT and sent through NF_HOOK() for NF_INET_LOCAL_OUT with
 * dst_output.  The remaining visible lines send ICMPV6_PKT_TOOBIG for the
 * skb and count IPSTATS_MIB_FRAGFAILS.
 * NOTE(review): the NULL test for opt, the assignments of mtu and hdr, and
 * the return values of the error paths are not visible in this listing.
 */
166 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
167 struct ipv6_txoptions *opt, int tclass)
169 struct net *net = sock_net(sk);
170 struct ipv6_pinfo *np = inet6_sk(sk);
171 struct in6_addr *first_hop = &fl6->daddr;
172 struct dst_entry *dst = skb_dst(skb);
174 u8 proto = fl6->flowi6_proto;
175 int seg_len = skb->len;
180 unsigned int head_room;
182 /* First: exthdrs may take lots of space (~8K for now)
183 MAX_HEADER is not enough.
185 head_room = opt->opt_nflen + opt->opt_flen;
186 seg_len += head_room;
187 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
189 if (skb_headroom(skb) < head_room) {
190 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
192 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
193 IPSTATS_MIB_OUTDISCARDS);
199 skb_set_owner_w(skb, sk);
202 ipv6_push_frag_opts(skb, opt, &proto);
204 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
207 skb_push(skb, sizeof(struct ipv6hdr));
208 skb_reset_network_header(skb);
212 * Fill in the IPv6 header
215 hlimit = np->hop_limit;
217 hlimit = ip6_dst_hoplimit(dst);
219 *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;
221 hdr->payload_len = htons(seg_len);
222 hdr->nexthdr = proto;
223 hdr->hop_limit = hlimit;
225 hdr->saddr = fl6->saddr;
226 hdr->daddr = *first_hop;
228 skb->priority = sk->sk_priority;
229 skb->mark = sk->sk_mark;
232 if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
233 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
234 IPSTATS_MIB_OUT, skb->len);
235 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
236 dst->dev, dst_output);
239 net_dbg_ratelimited("IPv6: sending pkt_too_big to self\n");
241 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
242 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
247 EXPORT_SYMBOL(ip6_xmit);
250 * To avoid extra problems ND packets are sent through this
251 * routine. It's code duplication but I really want to avoid
252 * extra checks since ipv6_build_header is used by TCP (which
253 * is for us performance critical)
/*
 * ip6_nd_hdr - write an IPv6 header for a neighbour discovery packet
 * @sk: socket whose ipv6_pinfo supplies the hop limit
 * @skb: packet under construction; extended with skb_put()
 * @dev: outgoing device (not referenced in the visible lines)
 * @saddr: source address (its use is not visible in this listing)
 * @daddr: destination address (its use is not visible in this listing)
 *
 * Sets skb->protocol to ETH_P_IPV6, resets the network header and extends
 * the skb by sizeof(struct ipv6hdr) with skb_put().  The first header word
 * is htonl(0x60000000); payload_len, nexthdr and hop_limit follow, with
 * the hop limit taken from np->hop_limit.
 * NOTE(review): the parameter list continues on a line missing from this
 * listing; proto and len presumably come from there -- confirm.
 */
256 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
257 const struct in6_addr *saddr, const struct in6_addr *daddr,
260 struct ipv6_pinfo *np = inet6_sk(sk);
263 skb->protocol = htons(ETH_P_IPV6);
266 skb_reset_network_header(skb);
267 skb_put(skb, sizeof(struct ipv6hdr));
270 *(__be32*)hdr = htonl(0x60000000);
272 hdr->payload_len = htons(len);
273 hdr->nexthdr = proto;
274 hdr->hop_limit = np->hop_limit;
/*
 * ip6_call_ra_chain - offer a packet to sockets on the router alert chain
 * @skb: received packet; skb->dev->ifindex is compared with bound devices
 * @sel: value matched against ra->sel of each chain entry
 *
 * Walks ip6_ra_chain with ip6_ra_lock read-held.  An entry matches when it
 * has a socket, its sel equals @sel, and the socket is either not bound to
 * a device or bound to the device of the packet.  Inside the loop
 * rawv6_rcv() is called with last and a clone of the skb; after the loop
 * rawv6_rcv() is called with last and the skb itself.  The lock is
 * released on both visible exit paths.
 * NOTE(review): the assignment of last, the conditions around the two
 * rawv6_rcv() calls and the return values are not visible in this listing.
 */
282 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
284 struct ip6_ra_chain *ra;
285 struct sock *last = NULL;
287 read_lock(&ip6_ra_lock);
288 for (ra = ip6_ra_chain; ra; ra = ra->next) {
289 struct sock *sk = ra->sk;
290 if (sk && ra->sel == sel &&
291 (!sk->sk_bound_dev_if ||
292 sk->sk_bound_dev_if == skb->dev->ifindex)) {
294 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
296 rawv6_rcv(last, skb2);
303 rawv6_rcv(last, skb);
304 read_unlock(&ip6_ra_lock);
307 read_unlock(&ip6_ra_lock);
/*
 * ip6_forward_proxy_check - classify a packet sent to a proxied address
 * @skb: packet being forwarded
 *
 * Locates the upper layer header: extension headers are skipped with
 * ipv6_skip_exthdr() when the first next header is one, otherwise the
 * offset is the size of the IPv6 header.  For ICMPv6, pskb_may_pull() is
 * asked for the data up to and including the first byte of the ICMPv6
 * header, and the router/neighbour solicitation and advertisement types
 * are singled out.  A destination address of link-local type triggers
 * dst_link_failure().
 * NOTE(review): the return statements are not visible in this listing.
 * In the caller, ip6_forward(), a negative result is counted as
 * IPSTATS_MIB_INDISCARDS and another branch calls ip6_input().
 */
311 static int ip6_forward_proxy_check(struct sk_buff *skb)
313 struct ipv6hdr *hdr = ipv6_hdr(skb);
314 u8 nexthdr = hdr->nexthdr;
318 if (ipv6_ext_hdr(nexthdr)) {
319 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
323 offset = sizeof(struct ipv6hdr);
325 if (nexthdr == IPPROTO_ICMPV6) {
326 struct icmp6hdr *icmp6;
328 if (!pskb_may_pull(skb, (skb_network_header(skb) +
329 offset + 1 - skb->data)))
332 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
334 switch (icmp6->icmp6_type) {
335 case NDISC_ROUTER_SOLICITATION:
336 case NDISC_ROUTER_ADVERTISEMENT:
337 case NDISC_NEIGHBOUR_SOLICITATION:
338 case NDISC_NEIGHBOUR_ADVERTISEMENT:
340 /* For reaction involving unicast neighbor discovery
341 * message destined to the proxied address, pass it to
351 * The proxying router can't forward traffic sent to a link-local
352 * address, so signal the sender and discard the packet. This
353 * behavior is clarified by the MIPv6 specification.
355 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
356 dst_link_failure(skb);
/*
 * ip6_forward_finish - hand a forwarded packet to the output path
 * @skb: packet to forward
 *
 * Returns the result of dst_output() for the packet.
 */
363 static inline int ip6_forward_finish(struct sk_buff *skb)
365 return dst_output(skb);
/*
 * ip6_forward - forward a received IPv6 packet
 * @skb: packet with the outgoing route attached
 *
 * The visible tests are, in order: whether forwarding is 0 in devconf_all,
 * skb_warn_if_lro(), xfrm6_policy_check() for XFRM_POLICY_FWD (a failure
 * is counted as IPSTATS_MIB_INDISCARDS), and whether pkt_type differs from
 * PACKET_HOST.  The packet is offered to ip6_call_ra_chain() with the
 * 16-bit value built from bytes 2 and 3 at offset opt->ra of the network
 * header.  A hop limit of 1 or less produces ICMPV6_TIME_EXCEED and an
 * IPSTATS_MIB_INHDRERRORS count.  With proxy_ndp enabled and a
 * pneigh_lookup() hit for the destination, ip6_forward_proxy_check()
 * selects between ip6_input() and an INDISCARDS count.
 *
 * When the packet would leave on the device it arrived on, opt->srcrt is 0
 * and there is no security path, ndisc_send_redirect() may be called,
 * rate limited through inet_peer_xrlim_allow(); the redirect target is the
 * gateway for RTF_GATEWAY routes and the destination address otherwise.
 * On another branch the source address type is checked: unspecified,
 * multicast and loopback types are singled out and link-local ones get
 * ICMPV6_DEST_UNREACH with ICMPV6_NOT_NEIGHBOUR.  A packet exceeding the
 * mtu gets ICMPV6_PKT_TOOBIG plus INTOOBIGERRORS and FRAGFAILS counts.
 * After skb_cow() the packet is counted under OUTFORWDATAGRAMS and
 * OUTOCTETS and sent through NF_HOOK() for NF_INET_FORWARD.
 * NOTE(review): the goto targets, the hop limit decrement, the mtu
 * assignment and the return values are on lines missing from this listing.
 */
368 int ip6_forward(struct sk_buff *skb)
370 struct dst_entry *dst = skb_dst(skb);
371 struct ipv6hdr *hdr = ipv6_hdr(skb);
372 struct inet6_skb_parm *opt = IP6CB(skb);
373 struct net *net = dev_net(dst->dev);
376 if (net->ipv6.devconf_all->forwarding == 0)
379 if (skb_warn_if_lro(skb))
382 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
383 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
387 if (skb->pkt_type != PACKET_HOST)
390 skb_forward_csum(skb);
393 * We DO NOT make any processing on
394 * RA packets, pushing them to user level AS IS
395 * without ane WARRANTY that application will be able
396 * to interpret them. The reason is that we
397 * cannot make anything clever here.
399 * We are not end-node, so that if packet contains
400 * AH/ESP, we cannot make anything.
401 * Defragmentation also would be mistake, RA packets
402 * cannot be fragmented, because there is no warranty
403 * that different fragments will go along one path. --ANK
406 u8 *ptr = skb_network_header(skb) + opt->ra;
407 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
412 * check and decrement ttl
414 if (hdr->hop_limit <= 1) {
415 /* Force OUTPUT device used as source address */
417 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
418 IP6_INC_STATS_BH(net,
419 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
425 /* XXX: idev->cnf.proxy_ndp? */
426 if (net->ipv6.devconf_all->proxy_ndp &&
427 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
428 int proxied = ip6_forward_proxy_check(skb);
430 return ip6_input(skb);
431 else if (proxied < 0) {
432 IP6_INC_STATS(net, ip6_dst_idev(dst),
433 IPSTATS_MIB_INDISCARDS);
438 if (!xfrm6_route_forward(skb)) {
439 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
444 /* IPv6 specs say nothing about it, but it is clear that we cannot
445 send redirects to source routed frames.
446 We don't send redirects to frames decapsulated from IPsec.
448 if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
449 struct in6_addr *target = NULL;
450 struct inet_peer *peer;
454 * incoming and outgoing devices are the same
458 rt = (struct rt6_info *) dst;
459 if (rt->rt6i_flags & RTF_GATEWAY)
460 target = &rt->rt6i_gateway;
462 target = &hdr->daddr;
464 peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
466 /* Limit redirects both by destination (here)
467 and by source (inside ndisc_send_redirect)
469 if (inet_peer_xrlim_allow(peer, 1*HZ))
470 ndisc_send_redirect(skb, target);
474 int addrtype = ipv6_addr_type(&hdr->saddr);
476 /* This check is security critical. */
477 if (addrtype == IPV6_ADDR_ANY ||
478 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
480 if (addrtype & IPV6_ADDR_LINKLOCAL) {
481 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
482 ICMPV6_NOT_NEIGHBOUR, 0);
488 if (mtu < IPV6_MIN_MTU)
491 if ((!skb->local_df && skb->len > mtu && !skb_is_gso(skb)) ||
492 (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)) {
493 /* Again, force OUTPUT device used as source address */
495 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
496 IP6_INC_STATS_BH(net,
497 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
498 IP6_INC_STATS_BH(net,
499 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
504 if (skb_cow(skb, dst->dev->hard_header_len)) {
505 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
511 /* Mangling hops number delayed to point after skb COW */
515 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
516 IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
517 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
521 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
/*
 * ip6_copy_metadata - copy per-packet metadata from one skb to another
 * @to: skb receiving the metadata
 * @from: skb the metadata is taken from
 *
 * Copies pkt_type, priority, protocol and mark, sets the route of @to to
 * dst_clone() of the route of @from, and copies tc_index under
 * CONFIG_NET_SCHED, nf_trace when CONFIG_NETFILTER_XT_TARGET_TRACE is
 * enabled, and the security mark via skb_copy_secmark().
 * NOTE(review): the embedded line numbers skip values, so further copies
 * may exist on lines missing from this listing.
 */
527 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
529 to->pkt_type = from->pkt_type;
530 to->priority = from->priority;
531 to->protocol = from->protocol;
533 skb_dst_set(to, dst_clone(skb_dst(from)));
535 to->mark = from->mark;
537 #ifdef CONFIG_NET_SCHED
538 to->tc_index = from->tc_index;
541 #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
542 to->nf_trace = from->nf_trace;
544 skb_copy_secmark(to, from);
/*
 * ip6_fragment - split a packet into IPv6 fragments
 * @skb: packet to fragment, with a route attached
 * @output: routine for transmitting fragments (the calls themselves are
 *          not visible in this listing)
 *
 * hlen is the value returned by ip6_find_1stfragopt(), which also sets
 * prevhdr; *prevhdr is later overwritten with NEXTHDR_FRAGMENT.
 * Fragmentation is refused, with ICMPV6_PKT_TOOBIG and a FRAGFAILS count,
 * when the packet exceeds the mtu without local_df or when frag_max_size
 * is set and exceeds the mtu.  The per-fragment payload budget is the mtu
 * minus hlen and the fragment header.
 *
 * Frag list path: when the skb has a frag list, the geometry of the head
 * and of every listed skb is checked (size within the budget, multiple of
 * 8 unless last, enough headroom, not shared).  If it fits, the first hlen
 * bytes are saved in tmp_hdr with kmemdup() and a fragment header plus a
 * copy of tmp_hdr are pushed in front of each buffer.  A geometry failure
 * inside the walk jumps to slow_path_clean, where the destructor and
 * truesize changes made so far are undone.
 *
 * Copy path: a new skb is allocated per fragment and receives the
 * metadata, the first hlen bytes of the original, a fragment header and a
 * block of payload copied with skb_copy_bits().  The identification is
 * chosen once with ipv6_select_ident() and reused through frag_id.
 *
 * FRAGCREATES, FRAGOKS and FRAGFAILS are counted on the respective paths.
 * NOTE(review): the loops, labels, calls to @output and return statements
 * are largely on lines missing from this listing; this summary covers
 * only what the visible lines show.
 */
547 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
549 struct sk_buff *frag;
550 struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
551 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
552 struct ipv6hdr *tmp_hdr;
554 unsigned int mtu, hlen, left, len;
557 int ptr, offset = 0, err=0;
558 u8 *prevhdr, nexthdr = 0;
559 struct net *net = dev_net(skb_dst(skb)->dev);
561 hlen = ip6_find_1stfragopt(skb, &prevhdr);
564 mtu = ip6_skb_dst_mtu(skb);
566 /* We must not fragment if the socket is set to force MTU discovery
567 * or if the skb it not generated by a local socket.
569 if (unlikely(!skb->local_df && skb->len > mtu) ||
570 (IP6CB(skb)->frag_max_size &&
571 IP6CB(skb)->frag_max_size > mtu)) {
572 if (skb->sk && dst_allfrag(skb_dst(skb)))
573 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
575 skb->dev = skb_dst(skb)->dev;
576 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
577 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
578 IPSTATS_MIB_FRAGFAILS);
583 if (np && np->frag_size < mtu) {
587 mtu -= hlen + sizeof(struct frag_hdr);
589 if (skb_has_frag_list(skb)) {
590 int first_len = skb_pagelen(skb);
591 struct sk_buff *frag2;
593 if (first_len - hlen > mtu ||
594 ((first_len - hlen) & 7) ||
598 skb_walk_frags(skb, frag) {
599 /* Correct geometry. */
600 if (frag->len > mtu ||
601 ((frag->len & 7) && frag->next) ||
602 skb_headroom(frag) < hlen)
603 goto slow_path_clean;
605 /* Partially cloned skb? */
606 if (skb_shared(frag))
607 goto slow_path_clean;
612 frag->destructor = sock_wfree;
614 skb->truesize -= frag->truesize;
619 frag = skb_shinfo(skb)->frag_list;
620 skb_frag_list_init(skb);
623 *prevhdr = NEXTHDR_FRAGMENT;
624 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
626 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
627 IPSTATS_MIB_FRAGFAILS);
631 __skb_pull(skb, hlen);
632 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
633 __skb_push(skb, hlen);
634 skb_reset_network_header(skb);
635 memcpy(skb_network_header(skb), tmp_hdr, hlen);
637 ipv6_select_ident(fh, rt);
638 fh->nexthdr = nexthdr;
640 fh->frag_off = htons(IP6_MF);
641 frag_id = fh->identification;
643 first_len = skb_pagelen(skb);
644 skb->data_len = first_len - skb_headlen(skb);
645 skb->len = first_len;
646 ipv6_hdr(skb)->payload_len = htons(first_len -
647 sizeof(struct ipv6hdr));
652 /* Prepare header of the next frame,
653 * before previous one went down. */
655 frag->ip_summed = CHECKSUM_NONE;
656 skb_reset_transport_header(frag);
657 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
658 __skb_push(frag, hlen);
659 skb_reset_network_header(frag);
660 memcpy(skb_network_header(frag), tmp_hdr,
662 offset += skb->len - hlen - sizeof(struct frag_hdr);
663 fh->nexthdr = nexthdr;
665 fh->frag_off = htons(offset);
666 if (frag->next != NULL)
667 fh->frag_off |= htons(IP6_MF);
668 fh->identification = frag_id;
669 ipv6_hdr(frag)->payload_len =
671 sizeof(struct ipv6hdr));
672 ip6_copy_metadata(frag, skb);
677 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
678 IPSTATS_MIB_FRAGCREATES);
691 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
692 IPSTATS_MIB_FRAGOKS);
703 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
704 IPSTATS_MIB_FRAGFAILS);
709 skb_walk_frags(skb, frag2) {
713 frag2->destructor = NULL;
714 skb->truesize += frag2->truesize;
719 if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
720 skb_checksum_help(skb))
723 left = skb->len - hlen; /* Space per frame */
724 ptr = hlen; /* Where to start from */
727 * Fragment the datagram.
730 *prevhdr = NEXTHDR_FRAGMENT;
731 hroom = LL_RESERVED_SPACE(rt->dst.dev);
732 troom = rt->dst.dev->needed_tailroom;
735 * Keep copying data until we run out.
739 /* IF: it doesn't fit, use 'mtu' - the data space left */
742 /* IF: we are not sending up to and including the packet end
743 then align the next start on an eight byte boundary */
751 if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
752 hroom + troom, GFP_ATOMIC)) == NULL) {
753 NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
754 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
755 IPSTATS_MIB_FRAGFAILS);
761 * Set up data on packet
764 ip6_copy_metadata(frag, skb);
765 skb_reserve(frag, hroom);
766 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
767 skb_reset_network_header(frag);
768 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
769 frag->transport_header = (frag->network_header + hlen +
770 sizeof(struct frag_hdr));
773 * Charge the memory for the fragment to any owner
777 skb_set_owner_w(frag, skb->sk);
780 * Copy the packet header into the new buffer.
782 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
785 * Build fragment header.
787 fh->nexthdr = nexthdr;
790 ipv6_select_ident(fh, rt);
791 frag_id = fh->identification;
793 fh->identification = frag_id;
796 * Copy a block of the IP datagram.
798 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
802 fh->frag_off = htons(offset);
804 fh->frag_off |= htons(IP6_MF);
805 ipv6_hdr(frag)->payload_len = htons(frag->len -
806 sizeof(struct ipv6hdr));
812 * Put this fragment into the sending queue.
818 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
819 IPSTATS_MIB_FRAGCREATES);
821 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
822 IPSTATS_MIB_FRAGOKS);
827 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
828 IPSTATS_MIB_FRAGFAILS);
/*
 * ip6_rt_check - test whether a cached route key fails to cover an address
 * @rt_key: destination or source key of the cached route
 * @fl_addr: address taken from the flow being sent
 * @addr_cache: cached address to compare against, may be NULL
 *
 * Returns zero when the key is a /128 entry equal to @fl_addr, or when
 * @addr_cache is non-NULL and equal to @fl_addr.  Returns nonzero in every
 * other case, i.e. when neither of those two validations succeeds.
 */
833 static inline int ip6_rt_check(const struct rt6key *rt_key,
834 const struct in6_addr *fl_addr,
835 const struct in6_addr *addr_cache)
837 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
838 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
/*
 * ip6_sk_dst_check - validate a socket cached route against a flow
 * @sk: socket whose ipv6_pinfo holds daddr_cache and saddr_cache
 * @dst: cached route to validate
 * @fl6: flow about to be sent
 *
 * The visible condition is true when ip6_rt_check() returns nonzero for
 * the destination key, when (under CONFIG_IPV6_SUBTREES) it returns
 * nonzero for the source key, or when the flow names an output interface
 * that differs from the ifindex of the route device.
 * NOTE(review): what is done when the condition is true, and the return
 * statement, are not visible in this listing.  The caller,
 * ip6_sk_dst_lookup_flow(), continues with the returned pointer.
 */
841 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
842 struct dst_entry *dst,
843 const struct flowi6 *fl6)
845 struct ipv6_pinfo *np = inet6_sk(sk);
846 struct rt6_info *rt = (struct rt6_info *)dst;
851 /* Yes, checking route validity in not connected
852 * case is not very simple. Take into account,
853 * that we do not support routing by source, TOS,
854 * and MSG_DONTROUTE --ANK (980726)
856 * 1. ip6_rt_check(): If route was host route,
857 * check that cached destination is current.
858 * If it is network route, we still may
859 * check its validity using saved pointer
860 * to the last used address: daddr_cache.
861 * We do not want to save whole address now,
862 * (because main consumer of this service
863 * is tcp, which has not this problem),
864 * so that the last trick works only on connected
866 * 2. oif also should be the same.
868 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
869 #ifdef CONFIG_IPV6_SUBTREES
870 ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
872 (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
/*
 * ip6_dst_lookup_tail - common tail of the route lookup helpers
 * @sk: socket supplying the network namespace; when non-NULL on the
 *      source selection path its srcprefs are used as well
 * @dst: in/out pointer to the route
 * @fl6: flow to look up; fl6->saddr is passed to source selection
 *
 * Calls ip6_route_output() and jumps to out_err_release when the resulting
 * route carries an error.  If the flow source address is unspecified,
 * ip6_route_get_saddr() is called for it.  Under
 * CONFIG_IPV6_OPTIMISTIC_DAD, when the neighbour n is not NUD_VALID and
 * the source address is flagged IFA_F_OPTIMISTIC, the flow is copied, its
 * destination zeroed and ip6_route_output() called again, which per the
 * in-code comment yields the route of the default router.  An
 * -ENETUNREACH error is counted as IPSTATS_MIB_OUTNOROUTES.
 * NOTE(review): the condition guarding the first ip6_route_output() call,
 * the lookup of n, the release of the old route and the code at
 * out_err_release are not visible in this listing.
 */
881 static int ip6_dst_lookup_tail(struct sock *sk,
882 struct dst_entry **dst, struct flowi6 *fl6)
884 struct net *net = sock_net(sk);
885 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
892 *dst = ip6_route_output(net, sk, fl6);
894 if ((err = (*dst)->error))
895 goto out_err_release;
897 if (ipv6_addr_any(&fl6->saddr)) {
898 struct rt6_info *rt = (struct rt6_info *) *dst;
899 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
900 sk ? inet6_sk(sk)->srcprefs : 0,
903 goto out_err_release;
906 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
908 * Here if the dst entry we've looked up
909 * has a neighbour entry that is in the INCOMPLETE
910 * state and the src address from the flow is
911 * marked as OPTIMISTIC, we release the found
912 * dst entry and replace it instead with the
913 * dst entry of the nexthop router
915 rt = (struct rt6_info *) *dst;
917 if (n && !(n->nud_state & NUD_VALID)) {
918 struct inet6_ifaddr *ifp;
919 struct flowi6 fl_gw6;
922 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
925 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
931 * We need to get the dst entry for the
932 * default router instead
935 memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
936 memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
937 *dst = ip6_route_output(net, sk, &fl_gw6);
938 if ((err = (*dst)->error))
939 goto out_err_release;
947 if (err == -ENETUNREACH)
948 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
955 * ip6_dst_lookup - perform route lookup on flow
956 * @sk: socket which provides route info
957 * @dst: pointer to dst_entry * for result
958 * @fl6: flow to lookup
960 * This function performs a route lookup on the given flow.
962 * It returns zero on success, or a standard errno code on error.
/*
 * Thin wrapper: passes its arguments unchanged to ip6_dst_lookup_tail()
 * and returns its result.
 * NOTE(review): the embedded line numbers skip a value before the call,
 * so a statement preceding it may be missing from this listing.
 */
964 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
967 return ip6_dst_lookup_tail(sk, dst, fl6);
969 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
972 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
973 * @sk: socket which provides route info
974 * @fl6: flow to lookup
975 * @final_dst: final destination address for ipsec lookup
976 * @can_sleep: we are in a sleepable context
978 * This function performs a route lookup on the given flow.
980 * It returns a valid dst pointer on success, or a pointer encoded error code.
/*
 * Looks the flow up with ip6_dst_lookup_tail(), starting from a NULL
 * route.  The visible lines then copy *final_dst into fl6->daddr, set
 * FLOWI_FLAG_CAN_SLEEP in the flow flags and return the result of
 * xfrm_lookup() on the route.
 * NOTE(review): the conditions guarding the daddr and flag updates, the
 * handling of a lookup error, and the last parameter of the signature are
 * on lines missing from this listing.
 */
983 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
984 const struct in6_addr *final_dst,
987 struct dst_entry *dst = NULL;
990 err = ip6_dst_lookup_tail(sk, &dst, fl6);
994 fl6->daddr = *final_dst;
996 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
998 return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1000 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1003 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1004 * @sk: socket which provides the dst cache and route info
1005 * @fl6: flow to lookup
1006 * @final_dst: final destination address for ipsec lookup
1007 * @can_sleep: we are in a sleepable context
1009 * This function performs a route lookup on the given flow with the
1010 * possibility of using the cached route in the socket if it is valid.
1011 * It will take the socket dst lock when operating on the dst cache.
1012 * As a result, this function can only be used in process context.
1014 * It returns a valid dst pointer on success, or a pointer encoded error code.
/*
 * Starts from the route returned by sk_dst_check() for the dst_cookie
 * stored in the socket, passes it through ip6_sk_dst_check() and completes
 * the lookup with ip6_dst_lookup_tail().  A lookup error is returned as
 * ERR_PTR(err).  The remaining visible lines copy *final_dst into
 * fl6->daddr, set FLOWI_FLAG_CAN_SLEEP in the flow flags and return the
 * result of xfrm_lookup() on the route.
 * NOTE(review): the conditions guarding those updates and the last
 * parameter of the signature are on lines missing from this listing.
 */
1017 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1018 const struct in6_addr *final_dst,
1021 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1024 dst = ip6_sk_dst_check(sk, dst, fl6);
1026 err = ip6_dst_lookup_tail(sk, &dst, fl6);
1028 return ERR_PTR(err);
1030 fl6->daddr = *final_dst;
1032 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1034 return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1036 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
/*
 * ip6_ufo_append_data - queue data as one large skb for UDP fragmentation
 *                       offload
 * @sk: sending socket; the skb lives on sk->sk_write_queue
 * @getfrag: callback handed to skb_append_datato_frags() to copy the data
 * @from: opaque source cookie for @getfrag
 * @length: bytes to append, including @transhdrlen
 * @hh_len: room reserved at the front for the hardware header
 * @fragheaderlen: header length in front of the transport header
 * @transhdrlen: transport header length
 * @mtu: mtu used to derive the segment size
 * @flags: message flags; (flags & MSG_DONTWAIT) is passed to
 *         sock_alloc_send_skb()
 * @rt: route passed to ipv6_select_ident()
 *
 * If the write queue has no tail skb, a new one is allocated with room for
 * the headers, hh_len is reserved, fragheaderlen + transhdrlen bytes are
 * claimed with skb_put(), the network and transport header offsets are set
 * and ip_summed becomes CHECKSUM_PARTIAL.  The payload (length minus
 * transhdrlen) is attached with skb_append_datato_frags().  gso_size is
 * mtu - fragheaderlen - sizeof(struct frag_hdr) rounded down to a multiple
 * of 8, gso_type is SKB_GSO_UDP, the fragment id comes from
 * ipv6_select_ident(), and the skb is queued with __skb_queue_tail().
 * NOTE(review): error handling, the condition around the GSO setup and
 * the return values are not visible in this listing.
 */
1038 static inline int ip6_ufo_append_data(struct sock *sk,
1039 int getfrag(void *from, char *to, int offset, int len,
1040 int odd, struct sk_buff *skb),
1041 void *from, int length, int hh_len, int fragheaderlen,
1042 int transhdrlen, int mtu,unsigned int flags,
1043 struct rt6_info *rt)
1046 struct sk_buff *skb;
1049 /* There is support for UDP large send offload by network
1050 * device, so create one single skb packet containing complete
1053 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1054 skb = sock_alloc_send_skb(sk,
1055 hh_len + fragheaderlen + transhdrlen + 20,
1056 (flags & MSG_DONTWAIT), &err);
1060 /* reserve space for Hardware header */
1061 skb_reserve(skb, hh_len);
1063 /* create space for UDP/IP header */
1064 skb_put(skb,fragheaderlen + transhdrlen);
1066 /* initialize network header pointer */
1067 skb_reset_network_header(skb);
1069 /* initialize protocol header pointer */
1070 skb->transport_header = skb->network_header + fragheaderlen;
1072 skb->ip_summed = CHECKSUM_PARTIAL;
1076 err = skb_append_datato_frags(sk,skb, getfrag, from,
1077 (length - transhdrlen));
1079 struct frag_hdr fhdr;
1081 /* Specify the length of each IPv6 datagram fragment.
1082 * It has to be a multiple of 8.
1084 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1085 sizeof(struct frag_hdr)) & ~7;
1086 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1087 ipv6_select_ident(&fhdr, rt);
1088 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1089 __skb_queue_tail(&sk->sk_write_queue, skb);
1093 /* There is not enough support do UPD LSO,
1094 * so follow normal path
/*
 * ip6_opt_dup - duplicate an IPv6 options header
 * @src: header to copy, may be NULL
 *
 * Returns a kmemdup() copy of (src->hdrlen + 1) * 8 bytes, or NULL when
 * @src is NULL.  The result of kmemdup() is returned as is.
 * NOTE(review): the gfp parameter is declared on a line missing from this
 * listing.
 */
1101 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1104 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
/*
 * ip6_rthdr_dup - duplicate an IPv6 routing header
 * @src: header to copy, may be NULL
 *
 * Returns a kmemdup() copy of (src->hdrlen + 1) * 8 bytes, or NULL when
 * @src is NULL.  The result of kmemdup() is returned as is.
 * NOTE(review): the gfp parameter is declared on a line missing from this
 * listing.
 */
1107 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1110 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
/*
 * ip6_append_data_mtu - adjust mtu and maxfraglen while appending data
 * @mtu: in/out mtu
 * @fragheaderlen: header length in front of the fragmentable part
 * @skb: current skb (its use is not visible in this listing)
 * @rt: route providing header_len, flags and the path mtu
 *
 * Acts only when the route is not flagged DST_XFRM_TUNNEL.  Per the
 * in-code comments, for the first fragment rt->dst.header_len is
 * subtracted from *mtu, and for later fragments *mtu is set to
 * dst_mtu(rt->dst.path) because the header space counts as data space.
 * *maxfraglen is then recomputed as
 * ((*mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr).
 * NOTE(review): the maxfraglen parameter declaration and the test that
 * tells the first fragment from later ones are on lines missing from
 * this listing.
 */
1113 static void ip6_append_data_mtu(int *mtu,
1115 unsigned int fragheaderlen,
1116 struct sk_buff *skb,
1117 struct rt6_info *rt)
1119 if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1121 /* first fragment, reserve header_len */
1122 *mtu = *mtu - rt->dst.header_len;
1126 * this fragment is not first, the headers
1127 * space is regarded as data space.
1129 *mtu = dst_mtu(rt->dst.path);
1131 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1132 + fragheaderlen - sizeof(struct frag_hdr);
/*
 * ip6_append_data - append data to the pending packet queue of a socket
 * @sk: sending socket; data is queued on sk->sk_write_queue
 * @getfrag: callback that copies @from data into the skb
 * @from: opaque source cookie for @getfrag
 * @length: number of bytes to append
 * @transhdrlen: transport header length
 * @hlimit: hop limit stored in the cork for the first call
 * @tclass: traffic class stored in the cork for the first call
 * @opt: extension header options, deep-copied into the cork
 * @fl6: flow, copied into the cork for the first call
 * @rt: route, stored in the cork for the first call
 * @flags: message flags (MSG_PROBE, MSG_MORE and MSG_DONTWAIT are tested)
 * @dontfrag: when set, UDP and RAW sockets get ipv6_local_rxpmtu()
 *
 * First call (write queue empty): the options are duplicated into
 * np->cork.opt one header at a time, the route, flow, hop limit and
 * traffic class are stored, and the mtu is taken from the device mtu
 * under IPV6_PMTUDISC_PROBE or from dst_mtu() otherwise, then possibly
 * replaced by np->frag_size.  The result is kept in cork->fragsize and
 * IPCORK_ALLFRAG is set when dst_allfrag() is true.  Later calls reload
 * rt, fl6 and mtu from the cork.
 *
 * fragheaderlen is the IPv6 header plus rt6i_nfheader_len plus the
 * non-fragmentable option length; maxfraglen is the largest length whose
 * fragmentable part is a multiple of 8, minus the fragment header.  A
 * total length beyond what IPV6_MAXPLEN allows raises EMSGSIZE through
 * ipv6_local_error().  UDP over a device with NETIF_F_UFO goes through
 * ip6_ufo_append_data().
 *
 * Main loop (while length > 0): when the tail skb has no room, a new skb
 * is allocated, any bytes beyond maxfraglen (fraggap) are moved over from
 * the previous skb with the checksums adjusted, and new data is copied in
 * with @getfrag.  When there is room, data is appended either linearly
 * (device without NETIF_F_SG) or as page fragments via sk_page_frag().
 *
 * On the visible error path length is subtracted from cork->length again
 * and IPSTATS_MIB_OUTDISCARDS is counted.
 * NOTE(review): many declarations, conditions, labels and all return
 * statements are on lines missing from this listing; this summary covers
 * only what the visible lines show.
 */
1136 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1137 int offset, int len, int odd, struct sk_buff *skb),
1138 void *from, int length, int transhdrlen,
1139 int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1140 struct rt6_info *rt, unsigned int flags, int dontfrag)
1142 struct inet_sock *inet = inet_sk(sk);
1143 struct ipv6_pinfo *np = inet6_sk(sk);
1144 struct inet_cork *cork;
1145 struct sk_buff *skb, *skb_prev = NULL;
1146 unsigned int maxfraglen, fragheaderlen;
1156 if (flags&MSG_PROBE)
1158 cork = &inet->cork.base;
1159 if (skb_queue_empty(&sk->sk_write_queue)) {
1164 if (WARN_ON(np->cork.opt))
1167 np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1168 if (unlikely(np->cork.opt == NULL))
1171 np->cork.opt->tot_len = opt->tot_len;
1172 np->cork.opt->opt_flen = opt->opt_flen;
1173 np->cork.opt->opt_nflen = opt->opt_nflen;
1175 np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1177 if (opt->dst0opt && !np->cork.opt->dst0opt)
1180 np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1182 if (opt->dst1opt && !np->cork.opt->dst1opt)
1185 np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1187 if (opt->hopopt && !np->cork.opt->hopopt)
1190 np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1192 if (opt->srcrt && !np->cork.opt->srcrt)
1195 /* need source address above miyazawa*/
1198 cork->dst = &rt->dst;
1199 inet->cork.fl.u.ip6 = *fl6;
1200 np->cork.hop_limit = hlimit;
1201 np->cork.tclass = tclass;
1202 if (rt->dst.flags & DST_XFRM_TUNNEL)
1203 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1204 rt->dst.dev->mtu : dst_mtu(&rt->dst);
1206 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1207 rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1208 if (np->frag_size < mtu) {
1210 mtu = np->frag_size;
1212 cork->fragsize = mtu;
1213 if (dst_allfrag(rt->dst.path))
1214 cork->flags |= IPCORK_ALLFRAG;
1216 exthdrlen = (opt ? opt->opt_flen : 0) - rt->rt6i_nfheader_len;
1217 length += exthdrlen;
1218 transhdrlen += exthdrlen;
1219 dst_exthdrlen = rt->dst.header_len;
1221 rt = (struct rt6_info *)cork->dst;
1222 fl6 = &inet->cork.fl.u.ip6;
1227 mtu = cork->fragsize;
1230 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1232 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1233 (opt ? opt->opt_nflen : 0);
1234 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1236 if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1237 if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1238 ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
1243 /* For UDP, check if TX timestamp is enabled */
1244 if (sk->sk_type == SOCK_DGRAM) {
1245 err = sock_tx_timestamp(sk, &tx_flags);
1251 * Let's try using as much space as possible.
1252 * Use MTU if total length of the message fits into the MTU.
1253 * Otherwise, we need to reserve fragment header and
1254 * fragment alignment (= 8-15 octects, in total).
1256 * Note that we may need to "move" the data from the tail of
1257 * of the buffer to the new fragment when we split
1260 * FIXME: It may be fragmented into multiple chunks
1261 * at once if non-fragmentable extension headers
1266 cork->length += length;
1268 int proto = sk->sk_protocol;
1269 if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){
1270 ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
1274 if (proto == IPPROTO_UDP &&
1275 (rt->dst.dev->features & NETIF_F_UFO)) {
1277 err = ip6_ufo_append_data(sk, getfrag, from, length,
1278 hh_len, fragheaderlen,
1279 transhdrlen, mtu, flags, rt);
1286 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1289 while (length > 0) {
1290 /* Check if the remaining data fits into current packet. */
1291 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1293 copy = maxfraglen - skb->len;
1297 unsigned int datalen;
1298 unsigned int fraglen;
1299 unsigned int fraggap;
1300 unsigned int alloclen;
1302 /* There's no room in the current skb */
1304 fraggap = skb->len - maxfraglen;
1307 /* update mtu and maxfraglen if necessary */
1308 if (skb == NULL || skb_prev == NULL)
1309 ip6_append_data_mtu(&mtu, &maxfraglen,
1310 fragheaderlen, skb, rt);
1315 * If remaining data exceeds the mtu,
1316 * we know we need more fragment(s).
1318 datalen = length + fraggap;
1320 if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1321 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1322 if ((flags & MSG_MORE) &&
1323 !(rt->dst.dev->features&NETIF_F_SG))
1326 alloclen = datalen + fragheaderlen;
1328 alloclen += dst_exthdrlen;
1330 if (datalen != length + fraggap) {
1332 * this is not the last fragment, the trailer
1333 * space is regarded as data space.
1335 datalen += rt->dst.trailer_len;
1338 alloclen += rt->dst.trailer_len;
1339 fraglen = datalen + fragheaderlen;
1342 * We just reserve space for fragment header.
1343 * Note: this may be overallocation if the message
1344 * (without MSG_MORE) fits into the MTU.
1346 alloclen += sizeof(struct frag_hdr);
1349 skb = sock_alloc_send_skb(sk,
1351 (flags & MSG_DONTWAIT), &err);
1354 if (atomic_read(&sk->sk_wmem_alloc) <=
1356 skb = sock_wmalloc(sk,
1357 alloclen + hh_len, 1,
1359 if (unlikely(skb == NULL))
1362 /* Only the initial fragment
1371 * Fill in the control structures
1373 skb->ip_summed = CHECKSUM_NONE;
1375 /* reserve for fragmentation and ipsec header */
1376 skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1379 if (sk->sk_type == SOCK_DGRAM)
1380 skb_shinfo(skb)->tx_flags = tx_flags;
1383 * Find where to start putting bytes
1385 data = skb_put(skb, fraglen);
1386 skb_set_network_header(skb, exthdrlen);
1387 data += fragheaderlen;
1388 skb->transport_header = (skb->network_header +
1391 skb->csum = skb_copy_and_csum_bits(
1392 skb_prev, maxfraglen,
1393 data + transhdrlen, fraggap, 0);
1394 skb_prev->csum = csum_sub(skb_prev->csum,
1397 pskb_trim_unique(skb_prev, maxfraglen);
1399 copy = datalen - transhdrlen - fraggap;
1405 } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1412 length -= datalen - fraggap;
1418 * Put the packet on the pending queue
1420 __skb_queue_tail(&sk->sk_write_queue, skb);
1427 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1431 if (getfrag(from, skb_put(skb, copy),
1432 offset, copy, off, skb) < 0) {
1433 __skb_trim(skb, off);
1438 int i = skb_shinfo(skb)->nr_frags;
1439 struct page_frag *pfrag = sk_page_frag(sk);
1442 if (!sk_page_frag_refill(sk, pfrag))
1445 if (!skb_can_coalesce(skb, i, pfrag->page,
1448 if (i == MAX_SKB_FRAGS)
1451 __skb_fill_page_desc(skb, i, pfrag->page,
1453 skb_shinfo(skb)->nr_frags = ++i;
1454 get_page(pfrag->page);
1456 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1458 page_address(pfrag->page) + pfrag->offset,
1459 offset, copy, skb->len, skb) < 0)
1462 pfrag->offset += copy;
1463 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1465 skb->data_len += copy;
1466 skb->truesize += copy;
1467 atomic_add(copy, &sk->sk_wmem_alloc);
1478 cork->length -= length;
1479 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1482 EXPORT_SYMBOL_GPL(ip6_append_data);
/*
 * ip6_cork_release - free the state kept for a corked socket
 * @inet: inet part of the socket (route, cork flags and flow)
 * @np: IPv6 part of the socket (duplicated options)
 *
 * Frees the four duplicated option headers and the option block itself
 * and clears np->cork.opt.  If a route is stored in inet->cork.base.dst
 * it is released with dst_release(), the pointer is cleared and
 * IPCORK_ALLFRAG is removed from the cork flags.  The stored flow is
 * zeroed.
 * NOTE(review): np->cork.opt is dereferenced in the visible lines; a NULL
 * test presumably sits on a line missing from this listing -- confirm.
 */
1484 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1487 kfree(np->cork.opt->dst0opt);
1488 kfree(np->cork.opt->dst1opt);
1489 kfree(np->cork.opt->hopopt);
1490 kfree(np->cork.opt->srcrt);
1491 kfree(np->cork.opt);
1492 np->cork.opt = NULL;
1495 if (inet->cork.base.dst) {
1496 dst_release(inet->cork.base.dst);
1497 inet->cork.base.dst = NULL;
1498 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1500 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
/*
 * ip6_push_pending_frames - combine the queued skbs into one packet and
 *                           send it
 * @sk: socket whose write queue and cork state are consumed
 *
 * The first queued skb becomes the head; every further skb is dequeued,
 * stripped of its network header with __skb_pull() and chained onto the
 * frag_list of the head, whose len, data_len and truesize grow
 * accordingly.  The destructor of each chained skb is cleared.
 *
 * The head is then pulled past its network header, fragmentable and
 * non-fragmentable options from the cork are pushed, and the IPv6 header
 * is pushed and filled in from the corked flow: flow label and traffic
 * class in the first word, hop limit, next header, source address and the
 * destination in *final_dst (which ipv6_push_nfrag_opts() may change).
 * Priority and mark come from the socket and the corked route is attached
 * with dst_clone().  IPSTATS_MIB_OUT is updated, ICMPv6 output counters
 * too for IPPROTO_ICMPV6, and the packet is sent with ip6_local_out().
 * The cork state is released through ip6_cork_release().
 * NOTE(review): the action taken under the pmtudisc test, the handling of
 * err around net_xmit_errno(), the labels and the return statements are
 * on lines missing from this listing.
 */
1503 int ip6_push_pending_frames(struct sock *sk)
1505 struct sk_buff *skb, *tmp_skb;
1506 struct sk_buff **tail_skb;
1507 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1508 struct inet_sock *inet = inet_sk(sk);
1509 struct ipv6_pinfo *np = inet6_sk(sk);
1510 struct net *net = sock_net(sk);
1511 struct ipv6hdr *hdr;
1512 struct ipv6_txoptions *opt = np->cork.opt;
1513 struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1514 struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1515 unsigned char proto = fl6->flowi6_proto;
1518 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1520 tail_skb = &(skb_shinfo(skb)->frag_list);
1522 /* move skb->data to ip header from ext header */
1523 if (skb->data < skb_network_header(skb))
1524 __skb_pull(skb, skb_network_offset(skb));
1525 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1526 __skb_pull(tmp_skb, skb_network_header_len(skb));
1527 *tail_skb = tmp_skb;
1528 tail_skb = &(tmp_skb->next);
1529 skb->len += tmp_skb->len;
1530 skb->data_len += tmp_skb->len;
1531 skb->truesize += tmp_skb->truesize;
1532 tmp_skb->destructor = NULL;
1536 /* Allow local fragmentation. */
1537 if (np->pmtudisc < IPV6_PMTUDISC_DO)
1540 *final_dst = fl6->daddr;
1541 __skb_pull(skb, skb_network_header_len(skb));
1542 if (opt && opt->opt_flen)
1543 ipv6_push_frag_opts(skb, opt, &proto);
1544 if (opt && opt->opt_nflen)
1545 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1547 skb_push(skb, sizeof(struct ipv6hdr));
1548 skb_reset_network_header(skb);
1549 hdr = ipv6_hdr(skb);
1551 *(__be32*)hdr = fl6->flowlabel |
1552 htonl(0x60000000 | ((int)np->cork.tclass << 20));
1554 hdr->hop_limit = np->cork.hop_limit;
1555 hdr->nexthdr = proto;
1556 hdr->saddr = fl6->saddr;
1557 hdr->daddr = *final_dst;
1559 skb->priority = sk->sk_priority;
1560 skb->mark = sk->sk_mark;
1562 skb_dst_set(skb, dst_clone(&rt->dst));
1563 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1564 if (proto == IPPROTO_ICMPV6) {
1565 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1567 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1568 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1571 err = ip6_local_out(skb);
1574 err = net_xmit_errno(err);
1580 ip6_cork_release(inet, np);
1583 IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1586 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
/*
 * ip6_flush_pending_frames - discard everything queued for a corked socket
 * @sk: socket whose write queue is emptied
 *
 * Dequeues skbs from the tail of sk->sk_write_queue until it is empty,
 * counting IPSTATS_MIB_OUTDISCARDS against the device of the skb route,
 * and then releases the cork state with ip6_cork_release().
 * NOTE(review): the condition around the statistics update and the
 * freeing of each skb are on lines missing from this listing.
 */
1588 void ip6_flush_pending_frames(struct sock *sk)
1590 struct sk_buff *skb;
1592 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1594 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1595 IPSTATS_MIB_OUTDISCARDS);
1599 ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1601 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);