1 // SPDX-License-Identifier: GPL-2.0-only
3 * Copyright (c) 2013 Nicira, Inc.
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
8 #include <linux/capability.h>
9 #include <linux/module.h>
10 #include <linux/types.h>
11 #include <linux/kernel.h>
12 #include <linux/slab.h>
13 #include <linux/uaccess.h>
14 #include <linux/skbuff.h>
15 #include <linux/netdevice.h>
17 #include <linux/tcp.h>
18 #include <linux/udp.h>
19 #include <linux/if_arp.h>
20 #include <linux/init.h>
21 #include <linux/in6.h>
22 #include <linux/inetdevice.h>
23 #include <linux/igmp.h>
24 #include <linux/netfilter_ipv4.h>
25 #include <linux/etherdevice.h>
26 #include <linux/if_ether.h>
27 #include <linux/if_vlan.h>
28 #include <linux/rculist.h>
29 #include <linux/err.h>
34 #include <net/protocol.h>
35 #include <net/ip_tunnels.h>
37 #include <net/checksum.h>
38 #include <net/dsfield.h>
39 #include <net/inet_ecn.h>
41 #include <net/net_namespace.h>
42 #include <net/netns/generic.h>
43 #include <net/rtnetlink.h>
45 #include <net/dst_metadata.h>
47 #if IS_ENABLED(CONFIG_IPV6)
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
/* Hash a tunnel's (key, remote address) pair into the tunnel hash table
 * index space.  NOTE(review): this chunk is elided — the hash-width
 * constant and closing brace of the function are not visible here.
 */
53 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
55 return hash_32((__force u32)key ^ (__force u32)remote,
/* Decide whether a received packet's (flags, key) pair matches a
 * tunnel's configured parameters: a tunnel with TUNNEL_KEY set accepts
 * only packets carrying the identical key, while a keyless tunnel
 * accepts only keyless packets.
 */
59 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
60 __be16 flags, __be32 key)
62 if (p->i_flags & TUNNEL_KEY) {
63 if (flags & TUNNEL_KEY)
64 return key == p->i_key;
66 /* key expected, none present */
/* Tunnel expects no key: match only keyless packets. */
69 return !(flags & TUNNEL_KEY);
72 /* Fallback tunnel: no source, no destination, no key, no options
75 We require an exact key match, i.e. if a key is present in the packet
76 it will match only a tunnel with the same key; if it is not present,
77 it will match only a keyless tunnel.
79 All keyless packets, if not matched against configured keyless tunnels,
80 will match the fallback tunnel.
81 Given src, dst and key, find the appropriate tunnel for the input packet.
/* Find the input tunnel for a received packet; runs under RCU.
 *
 * Scans the hash table in decreasing order of specificity:
 *   1) exact (remote, local) address match,
 *   2) remote-only match (tunnel bound to any source),
 *   3) local-address / multicast match (re-hashed with remote == 0),
 *   4) fully wildcard tunnels (keyed or keyless),
 * then falls back to the collect_md tunnel and finally the per-netns
 * fallback device.
 * NOTE(review): the "cand" best-candidate bookkeeping between passes
 * and the pass-exit returns are elided in this chunk; only the
 * exact-link comparisons are visible.
 */
83 struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
84 int link, __be16 flags,
85 __be32 remote, __be32 local,
88 struct ip_tunnel *t, *cand = NULL;
89 struct hlist_head *head;
90 struct net_device *ndev;
93 hash = ip_tunnel_hash(key, remote);
94 head = &itn->tunnels[hash];
/* Pass 1: exact (remote, local) match on an UP device. */
96 hlist_for_each_entry_rcu(t, head, hash_node) {
97 if (local != t->parms.iph.saddr ||
98 remote != t->parms.iph.daddr ||
99 !(t->dev->flags & IFF_UP))
102 if (!ip_tunnel_key_match(&t->parms, flags, key))
105 if (t->parms.link == link)
/* Pass 2: remote matches and the tunnel has no configured source. */
111 hlist_for_each_entry_rcu(t, head, hash_node) {
112 if (remote != t->parms.iph.daddr ||
113 t->parms.iph.saddr != 0 ||
114 !(t->dev->flags & IFF_UP))
117 if (!ip_tunnel_key_match(&t->parms, flags, key))
120 if (t->parms.link == link)
/* Passes 3-4: re-hash with a wildcard (zero) remote address. */
126 hash = ip_tunnel_hash(key, 0);
127 head = &itn->tunnels[hash];
/* Pass 3: match on our local address, or a multicast destination. */
129 hlist_for_each_entry_rcu(t, head, hash_node) {
130 if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
131 (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
134 if (!(t->dev->flags & IFF_UP))
137 if (!ip_tunnel_key_match(&t->parms, flags, key))
140 if (t->parms.link == link)
/* Pass 4: fully wildcard tunnels; keyed packets need an equal i_key. */
146 hlist_for_each_entry_rcu(t, head, hash_node) {
147 if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
148 t->parms.iph.saddr != 0 ||
149 t->parms.iph.daddr != 0 ||
150 !(t->dev->flags & IFF_UP))
153 if (t->parms.link == link)
/* No configured tunnel matched: try collect_md, then the fallback dev. */
162 t = rcu_dereference(itn->collect_md_tun);
163 if (t && t->dev->flags & IFF_UP)
166 ndev = READ_ONCE(itn->fb_tunnel_dev);
167 if (ndev && ndev->flags & IFF_UP)
168 return netdev_priv(ndev);
172 EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
/* Select the hash-table bucket for a tunnel's configured parameters.
 * Only a unicast destination address participates in the hash; for a
 * VTI tunnel without TUNNEL_KEY the i_key is presumably zeroed before
 * hashing (the assignment line is elided in this chunk — confirm
 * against the full source).
 */
174 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
175 struct ip_tunnel_parm *parms)
179 __be32 i_key = parms->i_key;
181 if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
182 remote = parms->iph.daddr;
186 if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
189 h = ip_tunnel_hash(i_key, remote);
190 return &itn->tunnels[h];
/* Link a tunnel into its netns hash table and publish it via RCU.
 * The collect_md pointer assignment is presumably guarded by a
 * t->collect_md check — the guarding condition is elided in this chunk.
 */
193 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
195 struct hlist_head *head = ip_bucket(itn, &t->parms);
198 rcu_assign_pointer(itn->collect_md_tun, t);
199 hlist_add_head_rcu(&t->hash_node, head);
/* Unlink a tunnel from the hash table (RCU-safe) and clear the
 * collect_md pointer; as in ip_tunnel_add(), the clearing is
 * presumably conditional on t->collect_md (condition elided here).
 */
202 static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
205 rcu_assign_pointer(itn->collect_md_tun, NULL);
206 hlist_del_init_rcu(&t->hash_node);
/* Find a tunnel whose configuration exactly matches @parms: addresses,
 * link ifindex, device type and key.  Used by the ioctl/netlink control
 * paths.  Presumably returns the match or NULL (the loop-exit return
 * is elided in this chunk).
 */
209 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
210 struct ip_tunnel_parm *parms,
213 __be32 remote = parms->iph.daddr;
214 __be32 local = parms->iph.saddr;
215 __be32 key = parms->i_key;
216 __be16 flags = parms->i_flags;
217 int link = parms->link;
218 struct ip_tunnel *t = NULL;
219 struct hlist_head *head = ip_bucket(itn, parms);
221 hlist_for_each_entry_rcu(t, head, hash_node) {
222 if (local == t->parms.iph.saddr &&
223 remote == t->parms.iph.daddr &&
224 link == t->parms.link &&
225 type == t->dev->type &&
226 ip_tunnel_key_match(&t->parms, flags, key))
/* Allocate and register a tunnel net_device.
 * The interface name comes from @parms when set (validated with
 * dev_valid_name()), otherwise it is derived from ops->kind; the "%d"
 * template-suffix line is elided in this chunk.  On success the
 * device's ip_tunnel private area holds a copy of @parms.  Error
 * unwinding after register_netdevice() is also elided here.
 */
232 static struct net_device *__ip_tunnel_create(struct net *net,
233 const struct rtnl_link_ops *ops,
234 struct ip_tunnel_parm *parms)
237 struct ip_tunnel *tunnel;
238 struct net_device *dev;
242 if (parms->name[0]) {
243 if (!dev_valid_name(parms->name))
245 strscpy(name, parms->name, IFNAMSIZ);
/* Reserve room in the kind-derived name (presumably for a "%d" suffix). */
247 if (strlen(ops->kind) > (IFNAMSIZ - 3))
249 strcpy(name, ops->kind);
254 dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
259 dev_net_set(dev, net);
261 dev->rtnl_link_ops = ops;
263 tunnel = netdev_priv(dev);
264 tunnel->parms = *parms;
267 err = register_netdevice(dev);
/* Probe the likely underlying output device for this tunnel to derive a
 * reasonable MTU and needed_headroom.  Routes toward the configured
 * destination when possible, otherwise falls back to the device bound
 * by parms.link.  Presumably returns the computed MTU clamped to at
 * least IPV4_MIN_MTU (the final clamp/return is elided in this chunk).
 */
279 static int ip_tunnel_bind_dev(struct net_device *dev)
281 struct net_device *tdev = NULL;
282 struct ip_tunnel *tunnel = netdev_priv(dev);
283 const struct iphdr *iph;
284 int hlen = LL_MAX_HEADER;
285 int mtu = ETH_DATA_LEN;
286 int t_hlen = tunnel->hlen + sizeof(struct iphdr);
288 iph = &tunnel->parms.iph;
290 /* Guess output device to choose reasonable mtu and needed_headroom */
295 ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
296 iph->saddr, tunnel->parms.o_key,
297 RT_TOS(iph->tos), dev_net(dev),
298 tunnel->parms.link, tunnel->fwmark, 0, 0);
299 rt = ip_route_output_key(tunnel->net, &fl4);
305 if (dev->type != ARPHRD_ETHER)
306 dev->flags |= IFF_POINTOPOINT;
308 dst_cache_reset(&tunnel->dst_cache);
/* No route found: fall back to the explicitly bound lower device. */
311 if (!tdev && tunnel->parms.link)
312 tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
315 hlen = tdev->hard_header_len + tdev->needed_headroom;
316 mtu = min(tdev->mtu, IP_MAX_MTU);
319 dev->needed_headroom = t_hlen + hlen;
/* Ethernet-type tunnels also pay for their own link-layer header. */
320 mtu -= t_hlen + (dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0);
322 if (mtu < IPV4_MIN_MTU)
/* Create a new tunnel device from @parms: allocate/register the netdev,
 * bind it to an underlying device to size its MTU, set min/max MTU
 * limits and insert it into the netns hash table.  Error unwinding
 * (unregister on dev_set_mtu failure) is partially elided in this chunk.
 */
328 static struct ip_tunnel *ip_tunnel_create(struct net *net,
329 struct ip_tunnel_net *itn,
330 struct ip_tunnel_parm *parms)
332 struct ip_tunnel *nt;
333 struct net_device *dev;
338 dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
340 return ERR_CAST(dev);
342 mtu = ip_tunnel_bind_dev(dev);
343 err = dev_set_mtu(dev, mtu);
345 goto err_dev_set_mtu;
347 nt = netdev_priv(dev);
348 t_hlen = nt->hlen + sizeof(struct iphdr);
349 dev->min_mtu = ETH_MIN_MTU;
/* Max MTU leaves room for the tunnel + outer IPv4 headers. */
350 dev->max_mtu = IP_MAX_MTU - t_hlen;
351 if (dev->type == ARPHRD_ETHER)
352 dev->max_mtu -= dev->hard_header_len;
354 ip_tunnel_add(itn, nt);
358 unregister_netdevice(dev);
/* For metadata-mode tunnels: if the outer packet is UDP, record its
 * source and destination ports into the tunnel info's encap fields so
 * replies can reuse them.  Non-UDP packets are left untouched.
 */
362 void ip_tunnel_md_udp_encap(struct sk_buff *skb, struct ip_tunnel_info *info)
364 const struct iphdr *iph = ip_hdr(skb);
365 const struct udphdr *udph;
367 if (iph->protocol != IPPROTO_UDP)
/* UDP header starts right after the IPv4 header (ihl is in 32-bit words). */
370 udph = (struct udphdr *)((__u8 *)iph + (iph->ihl << 2));
371 info->encap.sport = udph->source;
372 info->encap.dport = udph->dest;
374 EXPORT_SYMBOL(ip_tunnel_md_udp_encap);
/* Common receive path for IPv4 tunnels: validate the parsed tunnel
 * header (@tpi) against the tunnel's configuration, decapsulate ECN,
 * update stats and hand the inner packet to the device's GRO cells.
 * NOTE(review): the drop/error labels and their returns are elided in
 * this chunk; only the stat bumps on each error path are visible.
 */
376 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
377 const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
380 const struct iphdr *iph = ip_hdr(skb);
383 #ifdef CONFIG_NET_IPGRE_BROADCAST
384 if (ipv4_is_multicast(iph->daddr)) {
385 DEV_STATS_INC(tunnel->dev, multicast);
386 skb->pkt_type = PACKET_BROADCAST;
/* Checksum presence must match the tunnel's TUNNEL_CSUM setting. */
390 if ((!(tpi->flags&TUNNEL_CSUM) && (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
391 ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
392 DEV_STATS_INC(tunnel->dev, rx_crc_errors);
393 DEV_STATS_INC(tunnel->dev, rx_errors);
/* Enforce in-order delivery when TUNNEL_SEQ is configured. */
397 if (tunnel->parms.i_flags&TUNNEL_SEQ) {
398 if (!(tpi->flags&TUNNEL_SEQ) ||
399 (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
400 DEV_STATS_INC(tunnel->dev, rx_fifo_errors);
401 DEV_STATS_INC(tunnel->dev, rx_errors);
404 tunnel->i_seqno = ntohl(tpi->seq) + 1;
407 skb_set_network_header(skb, (tunnel->dev->type == ARPHRD_ETHER) ? ETH_HLEN : 0);
/* Propagate outer ECN bits into the inner header; drop on failure. */
409 err = IP_ECN_decapsulate(iph, skb);
412 net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
413 &iph->saddr, iph->tos);
415 DEV_STATS_INC(tunnel->dev, rx_frame_errors);
416 DEV_STATS_INC(tunnel->dev, rx_errors);
421 dev_sw_netstats_rx_add(tunnel->dev, skb->len);
422 skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
424 if (tunnel->dev->type == ARPHRD_ETHER) {
425 skb->protocol = eth_type_trans(skb, tunnel->dev);
426 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
428 skb->dev = tunnel->dev;
/* Metadata mode: attach the collected tun_dst to the skb
 * (presumably guarded by a tun_dst check — guard elided here).
 */
432 skb_dst_set(skb, (struct dst_entry *)tun_dst);
434 gro_cells_receive(&tunnel->gro_cells, skb);
439 dst_release((struct dst_entry *)tun_dst);
443 EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
/* Register an encapsulation ops table in slot @num of the global
 * iptun_encaps array; the cmpxchg succeeds only if the slot was empty.
 * The array operand of the cmpxchg is elided in this chunk.
 */
445 int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
448 if (num >= MAX_IPTUN_ENCAP_OPS)
451 return !cmpxchg((const struct ip_tunnel_encap_ops **)
455 EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
/* Unregister encapsulation ops from slot @num: the cmpxchg clears the
 * slot only if it still holds @ops.  Returns 0 on success, -1 if the
 * slot held something else.  The synchronization after a successful
 * removal (presumably synchronize_net) is elided in this chunk.
 */
457 int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
462 if (num >= MAX_IPTUN_ENCAP_OPS)
465 ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
467 ops, NULL) == ops) ? 0 : -1;
473 EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
/* Apply a new encapsulation configuration to tunnel @t: reset the old
 * state, copy the requested type/ports/flags and recompute the total
 * header length (encap + tunnel header).  The error check on the
 * computed hlen is elided in this chunk.
 */
475 int ip_tunnel_encap_setup(struct ip_tunnel *t,
476 struct ip_tunnel_encap *ipencap)
480 memset(&t->encap, 0, sizeof(t->encap));
482 hlen = ip_encap_hlen(ipencap);
486 t->encap.type = ipencap->type;
487 t->encap.sport = ipencap->sport;
488 t->encap.dport = ipencap->dport;
489 t->encap.flags = ipencap->flags;
491 t->encap_hlen = hlen;
492 t->hlen = t->encap_hlen + t->tun_hlen;
496 EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
/* Path-MTU handling on transmit: compute the effective MTU toward the
 * tunnel destination, update the skb's dst PMTU, and emit an ICMP
 * "fragmentation needed" / ICMPv6 "packet too big" back to the sender
 * when the inner packet does not fit.  NOTE(review): several guard
 * conditions and the return statements are elided in this chunk.
 */
498 static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
499 struct rtable *rt, __be16 df,
500 const struct iphdr *inner_iph,
501 int tunnel_hlen, __be32 dst, bool md)
503 struct ip_tunnel *tunnel = netdev_priv(dev);
/* Metadata tunnels carry their header length explicitly. */
507 tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
508 pkt_size = skb->len - tunnel_hlen;
509 pkt_size -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;
/* DF set: MTU comes from the route minus outer headers; otherwise
 * from the skb's own dst (or the device MTU).
 */
512 mtu = dst_mtu(&rt->dst) - (sizeof(struct iphdr) + tunnel_hlen);
513 mtu -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;
515 mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
518 if (skb_valid_dst(skb))
519 skb_dst_update_pmtu_no_confirm(skb, mtu);
521 if (skb->protocol == htons(ETH_P_IP)) {
522 if (!skb_is_gso(skb) &&
523 (inner_iph->frag_off & htons(IP_DF)) &&
525 icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
529 #if IS_ENABLED(CONFIG_IPV6)
530 else if (skb->protocol == htons(ETH_P_IPV6)) {
531 struct rt6_info *rt6;
534 rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) :
536 daddr = md ? dst : tunnel->parms.iph.daddr;
538 if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
539 mtu >= IPV6_MIN_MTU) {
/* Record the lowered MTU on host routes / unicast destinations. */
540 if ((daddr && !ipv4_is_multicast(daddr)) ||
541 rt6->rt6i_dst.plen == 128) {
542 rt6->rt6i_flags |= RTF_MODIFIED;
543 dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
547 if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
549 icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
/* Transmit path for metadata-mode (collect_md) tunnels: the outer
 * header parameters come from the skb's attached tunnel info rather
 * than from the device configuration.  Routes the packet, applies
 * PMTU/ECN/TTL policy, grows headroom as needed and hands off to
 * iptunnel_xmit().  NOTE(review): several guards, goto labels and
 * kfree_skb calls on the error paths are elided in this chunk.
 */
557 void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
558 u8 proto, int tunnel_hlen)
560 struct ip_tunnel *tunnel = netdev_priv(dev);
561 u32 headroom = sizeof(struct iphdr);
562 struct ip_tunnel_info *tun_info;
563 const struct ip_tunnel_key *key;
564 const struct iphdr *inner_iph;
565 struct rtable *rt = NULL;
/* Sanity: require TX-direction IPv4 tunnel metadata on the skb. */
571 tun_info = skb_tunnel_info(skb);
572 if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
573 ip_tunnel_info_af(tun_info) != AF_INET))
575 key = &tun_info->key;
576 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
577 inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
/* Inherit TOS/DSCP from the inner packet when possible. */
580 if (skb->protocol == htons(ETH_P_IP))
581 tos = inner_iph->tos;
582 else if (skb->protocol == htons(ETH_P_IPV6))
583 tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
585 ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
586 tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
587 dev_net(dev), 0, skb->mark, skb_get_hash(skb),
591 tunnel_hlen = ip_encap_hlen(&tun_info->encap);
593 if (ip_tunnel_encap(skb, &tun_info->encap, &proto, &fl4) < 0)
/* Route lookup, with an optional per-tun_info dst cache. */
596 use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
598 rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
600 rt = ip_route_output_key(tunnel->net, &fl4);
602 DEV_STATS_INC(dev, tx_carrier_errors);
606 dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
/* Routing back out of the same device would loop forever. */
609 if (rt->dst.dev == dev) {
611 DEV_STATS_INC(dev, collisions);
615 if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
617 if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
618 key->u.ipv4.dst, true)) {
623 tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
/* TTL: inherit from the inner packet, else from the route. */
626 if (skb->protocol == htons(ETH_P_IP))
627 ttl = inner_iph->ttl;
628 else if (skb->protocol == htons(ETH_P_IPV6))
629 ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
631 ttl = ip4_dst_hoplimit(&rt->dst);
634 headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
635 if (headroom > READ_ONCE(dev->needed_headroom))
636 WRITE_ONCE(dev->needed_headroom, headroom);
638 if (skb_cow_head(skb, READ_ONCE(dev->needed_headroom))) {
642 iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
643 df, !net_eq(tunnel->net, dev_net(dev)));
646 DEV_STATS_INC(dev, tx_errors);
649 DEV_STATS_INC(dev, tx_dropped);
653 EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
/* Main transmit path for classically configured IPv4 tunnels: resolve
 * the outer destination (NBMA tunnels derive it from the inner packet
 * or neighbour entry), route, apply PMTU/ECN/TTL policy, ensure
 * headroom, and emit via iptunnel_xmit().  NOTE(review): many guard
 * conditions, goto labels and the tx_error cleanup path are elided in
 * this chunk — comments below describe only the visible statements.
 */
655 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
656 const struct iphdr *tnl_params, u8 protocol)
658 struct ip_tunnel *tunnel = netdev_priv(dev);
659 struct ip_tunnel_info *tun_info = NULL;
660 const struct iphdr *inner_iph;
661 unsigned int max_headroom; /* The extra header space needed */
662 struct rtable *rt = NULL; /* Route to the other host */
663 __be16 payload_protocol;
664 bool use_cache = false;
672 inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
/* "connected" == tunnel has a fixed remote endpoint configured. */
673 connected = (tunnel->parms.iph.daddr != 0);
674 payload_protocol = skb_protocol(skb, true);
676 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
678 dst = tnl_params->daddr;
683 DEV_STATS_INC(dev, tx_fifo_errors);
/* No configured remote (NBMA mode): derive the outer destination
 * from tunnel metadata, the inner IPv4 route, or an IPv6 neighbour.
 */
687 tun_info = skb_tunnel_info(skb);
688 if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
689 ip_tunnel_info_af(tun_info) == AF_INET &&
690 tun_info->key.u.ipv4.dst) {
691 dst = tun_info->key.u.ipv4.dst;
694 } else if (payload_protocol == htons(ETH_P_IP)) {
695 rt = skb_rtable(skb);
696 dst = rt_nexthop(rt, inner_iph->daddr);
698 #if IS_ENABLED(CONFIG_IPV6)
699 else if (payload_protocol == htons(ETH_P_IPV6)) {
700 const struct in6_addr *addr6;
701 struct neighbour *neigh;
702 bool do_tx_error_icmp;
705 neigh = dst_neigh_lookup(skb_dst(skb),
706 &ipv6_hdr(skb)->daddr);
710 addr6 = (const struct in6_addr *)&neigh->primary_key;
711 addr_type = ipv6_addr_type(addr6);
713 if (addr_type == IPV6_ADDR_ANY) {
714 addr6 = &ipv6_hdr(skb)->daddr;
715 addr_type = ipv6_addr_type(addr6);
/* Only IPv4-compatible IPv6 addresses embed a usable v4 dst. */
718 if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
719 do_tx_error_icmp = true;
721 do_tx_error_icmp = false;
722 dst = addr6->s6_addr32[3];
724 neigh_release(neigh);
725 if (do_tx_error_icmp)
/* TOS: use the configured value, or inherit from the inner packet. */
736 tos = tnl_params->tos;
739 if (payload_protocol == htons(ETH_P_IP)) {
740 tos = inner_iph->tos;
742 } else if (payload_protocol == htons(ETH_P_IPV6)) {
743 tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
748 ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
749 tunnel->parms.o_key, RT_TOS(tos),
750 dev_net(dev), tunnel->parms.link,
751 tunnel->fwmark, skb_get_hash(skb), 0);
753 if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0)
/* Route lookup with dst caching: per-tun_info cache for md mode,
 * per-tunnel cache for connected tunnels.
 */
756 if (connected && md) {
757 use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
759 rt = dst_cache_get_ip4(&tun_info->dst_cache,
762 rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
767 rt = ip_route_output_key(tunnel->net, &fl4);
770 DEV_STATS_INC(dev, tx_carrier_errors);
774 dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
776 else if (!md && connected)
777 dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
/* Routing back out of the same device would loop forever. */
781 if (rt->dst.dev == dev) {
783 DEV_STATS_INC(dev, collisions);
787 df = tnl_params->frag_off;
788 if (payload_protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
789 df |= (inner_iph->frag_off & htons(IP_DF));
791 if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) {
/* Rate-limit link-failure notifications after repeated errors. */
796 if (tunnel->err_count > 0) {
797 if (time_before(jiffies,
798 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
801 dst_link_failure(skb);
803 tunnel->err_count = 0;
806 tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
807 ttl = tnl_params->ttl;
/* TTL 0 in config means inherit from inner packet / route. */
809 if (payload_protocol == htons(ETH_P_IP))
810 ttl = inner_iph->ttl;
811 #if IS_ENABLED(CONFIG_IPV6)
812 else if (payload_protocol == htons(ETH_P_IPV6))
813 ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
816 ttl = ip4_dst_hoplimit(&rt->dst);
819 max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
820 + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
821 if (max_headroom > READ_ONCE(dev->needed_headroom))
822 WRITE_ONCE(dev->needed_headroom, max_headroom);
824 if (skb_cow_head(skb, READ_ONCE(dev->needed_headroom))) {
826 DEV_STATS_INC(dev, tx_dropped);
831 iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
832 df, !net_eq(tunnel->net, dev_net(dev)));
835 #if IS_ENABLED(CONFIG_IPV6)
837 dst_link_failure(skb);
840 DEV_STATS_INC(dev, tx_errors);
843 EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
/* Apply a new parameter set to an existing tunnel: rehash it (del/add
 * around the address/key change), refresh device addresses for
 * non-Ethernet tunnels, optionally rebind to recompute the MTU when
 * link or fwmark changed, then reset the dst cache and notify
 * userspace.  The set_mtu handling between the bind and the cache
 * reset is elided in this chunk.
 */
845 static void ip_tunnel_update(struct ip_tunnel_net *itn,
847 struct net_device *dev,
848 struct ip_tunnel_parm *p,
/* Remove from the hash table before mutating hashed fields. */
852 ip_tunnel_del(itn, t);
853 t->parms.iph.saddr = p->iph.saddr;
854 t->parms.iph.daddr = p->iph.daddr;
855 t->parms.i_key = p->i_key;
856 t->parms.o_key = p->o_key;
857 if (dev->type != ARPHRD_ETHER) {
858 __dev_addr_set(dev, &p->iph.saddr, 4);
859 memcpy(dev->broadcast, &p->iph.daddr, 4);
861 ip_tunnel_add(itn, t);
863 t->parms.iph.ttl = p->iph.ttl;
864 t->parms.iph.tos = p->iph.tos;
865 t->parms.iph.frag_off = p->iph.frag_off;
867 if (t->parms.link != p->link || t->fwmark != fwmark) {
870 t->parms.link = p->link;
872 mtu = ip_tunnel_bind_dev(dev);
876 dst_cache_reset(&t->dst_cache);
877 netdev_state_change(dev);
/* Generic ioctl backend for tunnel devices: handles SIOCGETTUNNEL
 * (report parms), SIOCADDTUNNEL/SIOCCHGTUNNEL (create or update after
 * a CAP_NET_ADMIN check) and SIOCDELTUNNEL (unregister, refusing to
 * delete the fallback device).  NOTE(review): the switch/case
 * structure, several error assignments and returns are elided in this
 * chunk; comments describe the visible statements only.
 */
880 int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
883 struct ip_tunnel *t = netdev_priv(dev);
884 struct net *net = t->net;
885 struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
/* GET: on the fallback device, look up the tunnel described by @p. */
889 if (dev == itn->fb_tunnel_dev) {
890 t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
892 t = netdev_priv(dev);
894 memcpy(p, &t->parms, sizeof(*p));
/* ADD/CHG: privileged; normalize DF and key flags for non-VTI. */
900 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
903 p->iph.frag_off |= htons(IP_DF);
904 if (!(p->i_flags & VTI_ISVTI)) {
905 if (!(p->i_flags & TUNNEL_KEY))
907 if (!(p->o_flags & TUNNEL_KEY))
911 t = ip_tunnel_find(itn, p, itn->type);
913 if (cmd == SIOCADDTUNNEL) {
915 t = ip_tunnel_create(net, itn, p);
916 err = PTR_ERR_OR_ZERO(t);
/* CHG on a real device must not flip its broadcast/p2p nature. */
923 if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
930 unsigned int nflags = 0;
932 if (ipv4_is_multicast(p->iph.daddr))
933 nflags = IFF_BROADCAST;
934 else if (p->iph.daddr)
935 nflags = IFF_POINTOPOINT;
937 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
942 t = netdev_priv(dev);
948 ip_tunnel_update(itn, t, dev, p, true, 0);
/* DEL: privileged; never delete the per-netns fallback tunnel. */
956 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
959 if (dev == itn->fb_tunnel_dev) {
961 t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
965 if (t == netdev_priv(itn->fb_tunnel_dev))
969 unregister_netdevice(dev);
980 EXPORT_SYMBOL_GPL(ip_tunnel_ctl);
/* Private-ioctl wrapper: copy the ip_tunnel_parm from userspace, invoke
 * the device's ndo_tunnel_ctl handler, and copy the (possibly updated)
 * parms back on success.  The -EFAULT returns are elided in this chunk.
 */
982 int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
983 void __user *data, int cmd)
985 struct ip_tunnel_parm p;
988 if (copy_from_user(&p, data, sizeof(p)))
990 err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd);
991 if (!err && copy_to_user(data, &p, sizeof(p)))
995 EXPORT_SYMBOL_GPL(ip_tunnel_siocdevprivate);
/* Validate and apply a new MTU, accounting for tunnel + outer IPv4
 * header overhead (and the link-layer header on Ethernet-type
 * tunnels).  When @strict, an over-limit MTU is presumably rejected;
 * otherwise clamped (the strict branch and final assignment are elided
 * in this chunk).
 */
997 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
999 struct ip_tunnel *tunnel = netdev_priv(dev);
1000 int t_hlen = tunnel->hlen + sizeof(struct iphdr);
1001 int max_mtu = IP_MAX_MTU - t_hlen;
1003 if (dev->type == ARPHRD_ETHER)
1004 max_mtu -= dev->hard_header_len;
1006 if (new_mtu < ETH_MIN_MTU)
1009 if (new_mtu > max_mtu) {
1019 EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
/* ndo_change_mtu-style entry point: strict MTU validation. */
1021 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1023 return __ip_tunnel_change_mtu(dev, new_mtu, true);
1025 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
/* priv_destructor for tunnel devices: tear down the GRO cells, the dst
 * cache and the per-cpu stats allocated in ip_tunnel_init().
 */
1027 static void ip_tunnel_dev_free(struct net_device *dev)
1029 struct ip_tunnel *tunnel = netdev_priv(dev);
1031 gro_cells_destroy(&tunnel->gro_cells);
1032 dst_cache_destroy(&tunnel->dst_cache);
1033 free_percpu(dev->tstats);
/* rtnl dellink handler: remove the tunnel from the hash table and queue
 * it for unregistration — except for the per-netns fallback device,
 * which is never deleted this way.
 */
1036 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
1038 struct ip_tunnel *tunnel = netdev_priv(dev);
1039 struct ip_tunnel_net *itn;
1041 itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
1043 if (itn->fb_tunnel_dev != dev) {
1044 ip_tunnel_del(itn, netdev_priv(dev));
1045 unregister_netdevice_queue(dev, head);
1048 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
/* Report the netns the tunnel transports packets over (the return
 * statement, presumably reading tunnel->net, is elided in this chunk).
 */
1050 struct net *ip_tunnel_get_link_net(const struct net_device *dev)
1052 struct ip_tunnel *tunnel = netdev_priv(dev);
1056 EXPORT_SYMBOL(ip_tunnel_get_link_net);
/* Report the ifindex of the underlying device the tunnel is bound to. */
1058 int ip_tunnel_get_iflink(const struct net_device *dev)
1060 struct ip_tunnel *tunnel = netdev_priv(dev);
1062 return tunnel->parms.link;
1064 EXPORT_SYMBOL(ip_tunnel_get_iflink);
/* Per-netns initialization for a tunnel type: set up the hash table
 * and, unless fallback tunnels are disabled for this netns, create the
 * fallback device named @devname.  The fallback device is pinned to
 * its netns (NETIF_F_NETNS_LOCAL).
 */
1066 int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
1067 struct rtnl_link_ops *ops, char *devname)
1069 struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
1070 struct ip_tunnel_parm parms;
1073 itn->rtnl_link_ops = ops;
1074 for (i = 0; i < IP_TNL_HASH_SIZE; i++)
1075 INIT_HLIST_HEAD(&itn->tunnels[i]);
/* No fallback tunnel here: inherit the device type from init_net. */
1077 if (!ops || !net_has_fallback_tunnels(net)) {
1078 struct ip_tunnel_net *it_init_net;
1080 it_init_net = net_generic(&init_net, ip_tnl_net_id);
1081 itn->type = it_init_net->type;
1082 itn->fb_tunnel_dev = NULL;
1086 memset(&parms, 0, sizeof(parms));
1088 strscpy(parms.name, devname, IFNAMSIZ);
1091 itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
1092 /* FB netdevice is special: we have one, and only one per netns.
1093 * Allowing to move it to another netns is clearly unsafe.
1095 if (!IS_ERR(itn->fb_tunnel_dev)) {
1096 itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
1097 itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
1098 ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
1099 itn->type = itn->fb_tunnel_dev->type;
1103 return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
/* Queue every tunnel device of this type for unregistration on netns
 * teardown: first all devices living in @net with matching link ops,
 * then hash-table entries whose device lives in a different netns
 * (cross-netns tunnels the first loop missed).
 */
1107 static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
1108 struct list_head *head,
1109 struct rtnl_link_ops *ops)
1111 struct net_device *dev, *aux;
1114 for_each_netdev_safe(net, dev, aux)
1115 if (dev->rtnl_link_ops == ops)
1116 unregister_netdevice_queue(dev, head);
1118 for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
1119 struct ip_tunnel *t;
1120 struct hlist_node *n;
1121 struct hlist_head *thead = &itn->tunnels[h];
1123 hlist_for_each_entry_safe(t, n, thead, hash_node)
1124 /* If dev is in the same netns, it has already
1125 * been added to the list by the previous loop.
1127 if (!net_eq(dev_net(t->dev), net))
1128 unregister_netdevice_queue(t->dev, head);
/* Batch netns-exit handler: collect the tunnels of every netns on
 * @net_list into one list and unregister them together, minimizing
 * rtnl / synchronization overhead.  Presumably runs under rtnl_lock
 * (the lock/unlock lines are elided in this chunk).
 */
1132 void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
1133 struct rtnl_link_ops *ops)
1135 struct ip_tunnel_net *itn;
1140 list_for_each_entry(net, net_list, exit_list) {
1141 itn = net_generic(net, id);
1142 ip_tunnel_destroy(net, itn, &list, ops);
1144 unregister_netdevice_many(&list);
1147 EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);
/* rtnl newlink handler: reject duplicates (including a second
 * collect_md tunnel per netns), register the device, assign a random
 * MAC for Ethernet tunnels without an explicit address, compute/clamp
 * the MTU (respecting an explicit IFLA_MTU — that check is elided
 * here) and insert the tunnel into the hash table.
 */
1149 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
1150 struct ip_tunnel_parm *p, __u32 fwmark)
1152 struct ip_tunnel *nt;
1153 struct net *net = dev_net(dev);
1154 struct ip_tunnel_net *itn;
1158 nt = netdev_priv(dev);
1159 itn = net_generic(net, nt->ip_tnl_net_id);
/* Only one collect_md tunnel is allowed per netns. */
1161 if (nt->collect_md) {
1162 if (rtnl_dereference(itn->collect_md_tun))
1165 if (ip_tunnel_find(itn, p, dev->type))
1171 nt->fwmark = fwmark;
1172 err = register_netdevice(dev);
1174 goto err_register_netdevice;
1176 if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1177 eth_hw_addr_random(dev);
1179 mtu = ip_tunnel_bind_dev(dev);
/* An explicit MTU request is clamped to the valid range. */
1181 unsigned int max = IP_MAX_MTU - (nt->hlen + sizeof(struct iphdr));
1183 if (dev->type == ARPHRD_ETHER)
1184 max -= dev->hard_header_len;
1186 mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, max);
1189 err = dev_set_mtu(dev, mtu);
1191 goto err_dev_set_mtu;
1193 ip_tunnel_add(itn, nt);
1197 unregister_netdevice(dev);
1198 err_register_netdevice:
1201 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
/* rtnl changelink handler: refuse to reconfigure the fallback device,
 * reject parameter sets that collide with another tunnel or would flip
 * the device between broadcast and point-to-point modes, then apply
 * the change via ip_tunnel_update().  Several error returns are elided
 * in this chunk.
 */
1203 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1204 struct ip_tunnel_parm *p, __u32 fwmark)
1206 struct ip_tunnel *t;
1207 struct ip_tunnel *tunnel = netdev_priv(dev);
1208 struct net *net = tunnel->net;
1209 struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1211 if (dev == itn->fb_tunnel_dev)
1214 t = ip_tunnel_find(itn, p, dev->type);
/* Non-Ethernet tunnels must keep their broadcast/p2p nature. */
1222 if (dev->type != ARPHRD_ETHER) {
1223 unsigned int nflags = 0;
1225 if (ipv4_is_multicast(p->iph.daddr))
1226 nflags = IFF_BROADCAST;
1227 else if (p->iph.daddr)
1228 nflags = IFF_POINTOPOINT;
1230 if ((dev->flags ^ nflags) &
1231 (IFF_POINTOPOINT | IFF_BROADCAST))
1236 ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
1239 EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
/* ndo_init for tunnel devices: allocate per-cpu stats, the dst cache
 * and GRO cells (unwinding earlier allocations on failure), record the
 * owning netns and device name.  collect_md tunnels keep their skb
 * dst.  NULL-check of tstats and the final return are elided here.
 */
1241 int ip_tunnel_init(struct net_device *dev)
1243 struct ip_tunnel *tunnel = netdev_priv(dev);
1244 struct iphdr *iph = &tunnel->parms.iph;
1247 dev->needs_free_netdev = true;
1248 dev->priv_destructor = ip_tunnel_dev_free;
1249 dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1253 err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
1255 free_percpu(dev->tstats);
1259 err = gro_cells_init(&tunnel->gro_cells, dev);
1261 dst_cache_destroy(&tunnel->dst_cache);
1262 free_percpu(dev->tstats);
1267 tunnel->net = dev_net(dev);
1268 strcpy(tunnel->parms.name, dev->name);
1272 if (tunnel->collect_md)
1273 netif_keep_dst(dev);
1276 EXPORT_SYMBOL_GPL(ip_tunnel_init);
/* ndo_uninit: remove the tunnel from the hash table, clear the
 * fallback-device pointer if this was the fallback (lockless readers
 * use READ_ONCE in ip_tunnel_lookup), and reset the dst cache.
 */
1278 void ip_tunnel_uninit(struct net_device *dev)
1280 struct ip_tunnel *tunnel = netdev_priv(dev);
1281 struct net *net = tunnel->net;
1282 struct ip_tunnel_net *itn;
1284 itn = net_generic(net, tunnel->ip_tnl_net_id);
1285 ip_tunnel_del(itn, netdev_priv(dev));
1286 if (itn->fb_tunnel_dev == dev)
1287 WRITE_ONCE(itn->fb_tunnel_dev, NULL);
1289 dst_cache_reset(&tunnel->dst_cache);
1291 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1293 /* Do least required initialization, rest of init is done in tunnel_init call */
/* Record the per-netns id for this tunnel type in the device's private
 * area; the rest of initialization happens later in ip_tunnel_init().
 */
1294 void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
1296 struct ip_tunnel *tunnel = netdev_priv(dev);
1297 tunnel->ip_tnl_net_id = net_id;
1299 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1301 MODULE_LICENSE("GPL");