// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2013 Nicira, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>
#include <net/dst_metadata.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif
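
/* Tunnels are hashed into itn->tunnels[] by (key ^ remote address), so a
 * lookup can probe the most specific bucket first and then fall back to
 * wildcard buckets.
 */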
static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
	return hash_32((__force u32)key ^ (__force u32)remote,
		       IP_TNL_HASH_BITS);
}
static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
				__be16 flags, __be32 key)
{
	if (p->i_flags & TUNNEL_KEY) {
		if (flags & TUNNEL_KEY)
			return key == p->i_key;
		/* key expected, none present */
		return false;
	}

	return !(flags & TUNNEL_KEY);
}
/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a keyed tunnel with the same key; if it is not
   present, it will match only a keyless tunnel.

   All keyless packets, if not matched against a configured keyless tunnel,
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for input.
*/
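
/* The lookup below makes up to four passes of decreasing specificity:
 * (saddr, daddr) exact match, daddr only, saddr only or local multicast,
 * and finally key-only wildcard. A match on the wrong link is kept as a
 * candidate; if nothing better turns up, the candidate, then the
 * collect_md tunnel, then the fallback device is returned.
 */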
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;
	struct net_device *ndev;
	unsigned int hash;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;
		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;
		if (t->parms.link == link)
			return t;
		cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;
		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;
		if (t->parms.link == link)
			return t;
		if (!cand)
			cand = t;
	}

	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;
		if (!(t->dev->flags & IFF_UP))
			continue;
		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;
		if (t->parms.link == link)
			return t;
		if (!cand)
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;
		if (t->parms.link == link)
			return t;
		if (!cand)
			cand = t;
	}

	if (cand)
		return cand;

	t = rcu_dereference(itn->collect_md_tun);
	if (t && t->dev->flags & IFF_UP)
		return t;

	ndev = READ_ONCE(itn->fb_tunnel_dev);
	if (ndev && ndev->flags & IFF_UP)
		return netdev_priv(ndev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
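
/* Select the hash bucket for a set of tunnel parameters. Multicast
 * destinations hash with a zero remote address, and VTI tunnels without
 * TUNNEL_KEY hash with a zero key.
 */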
static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
				    struct ip_tunnel_parm *parms)
{
	unsigned int h;
	__be32 remote;
	__be32 i_key = parms->i_key;

	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
		remote = parms->iph.daddr;
	else
		remote = 0;

	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
		i_key = 0;

	h = ip_tunnel_hash(i_key, remote);
	return &itn->tunnels[h];
}
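
/* Link a tunnel into (or unlink it from) its hash bucket. A collect_md
 * tunnel is additionally published via itn->collect_md_tun so the receive
 * path can reach it without walking the table.
 */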
static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, t);
	hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, NULL);
	hlist_del_init_rcu(&t->hash_node);
}
static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	__be16 flags = parms->i_flags;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    link == t->parms.link &&
		    type == t->dev->type &&
		    ip_tunnel_key_match(&t->parms, flags, key))
			break;
	}
	return t;
}
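
/* Allocate and register a tunnel netdevice for the given parameters. The
 * name comes from parms, or is derived from ops->kind plus "%d" when none
 * was supplied.
 */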
static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	err = -E2BIG;
	if (parms->name[0]) {
		if (!dev_valid_name(parms->name))
			goto failed;
		strlcpy(name, parms->name, IFNAMSIZ);
	} else {
		if (strlen(ops->kind) > (IFNAMSIZ - 3))
			goto failed;
		strcpy(name, ops->kind);
		strcat(name, "%d");
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}
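
/* Do a route lookup toward the tunnel destination to guess the underlying
 * output device, then derive needed_headroom and a usable MTU from it.
 */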
static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
				    iph->saddr, tunnel->parms.o_key,
				    RT_TOS(iph->tos), tunnel->parms.link,
				    tunnel->fwmark, 0);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;

		dst_cache_reset(&tunnel->dst_cache);
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = min(tdev->mtu, IP_MAX_MTU);
	}

	dev->needed_headroom = t_hlen + hlen;
	mtu -= t_hlen;

	if (mtu < IPV4_MIN_MTU)
		mtu = IPV4_MIN_MTU;

	return mtu;
}
static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm *parms)
{
	struct ip_tunnel *nt;
	struct net_device *dev;
	int t_hlen;
	int mtu;
	int err;

	dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

	mtu = ip_tunnel_bind_dev(dev);
	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	nt = netdev_priv(dev);
	t_hlen = nt->hlen + sizeof(struct iphdr);
	dev->min_mtu = ETH_MIN_MTU;
	dev->max_mtu = IP_MAX_MTU - t_hlen;
	ip_tunnel_add(itn, nt);
	return nt;

err_dev_set_mtu:
	unregister_netdevice(dev);
	return ERR_PTR(err);
}
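
/* Receive path: validate checksum/sequence flags against the tunnel
 * configuration, decapsulate ECN, update stats and hand the packet to
 * GRO. Consumes skb (and tun_dst on error); always returns 0.
 */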
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
		  bool log_ecn_error)
{
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	if ((!(tpi->flags & TUNNEL_CSUM) && (tunnel->parms.i_flags & TUNNEL_CSUM)) ||
	    ((tpi->flags & TUNNEL_CSUM) && !(tunnel->parms.i_flags & TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	if (tunnel->parms.i_flags & TUNNEL_SEQ) {
		if (!(tpi->flags & TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	skb_reset_network_header(skb);

	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					     &iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	dev_sw_netstats_rx_add(tunnel->dev, skb->len);
	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	if (tun_dst)
		skb_dst_set(skb, (struct dst_entry *)tun_dst);

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	if (tun_dst)
		dst_release((struct dst_entry *)tun_dst);
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
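
/* Registration of per-slot encapsulation handlers (such as FOU/GUE).
 * Slots are claimed and released with cmpxchg(), so no lock is needed.
 */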
int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	return !cmpxchg((const struct ip_tunnel_encap_ops **)
			&iptun_encaps[num],
			NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);

int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	int ret;

	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
		       &iptun_encaps[num],
		       ops, NULL) == ops) ? 0 : -1;

	synchronize_net();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
int ip_tunnel_encap_setup(struct ip_tunnel *t,
			  struct ip_tunnel_encap *ipencap)
{
	int hlen;

	memset(&t->encap, 0, sizeof(t->encap));

	hlen = ip_encap_hlen(ipencap);
	if (hlen < 0)
		return hlen;

	t->encap.type = ipencap->type;
	t->encap.sport = ipencap->sport;
	t->encap.dport = ipencap->dport;
	t->encap.flags = ipencap->flags;

	t->encap_hlen = hlen;
	t->hlen = t->encap_hlen + t->tun_hlen;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
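
/* Check the inner packet against the route MTU and send an ICMP
 * "fragmentation needed" / "packet too big" error back when the tunnel
 * would otherwise have to fragment.
 */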
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			   struct rtable *rt, __be16 df,
			   const struct iphdr *inner_iph,
			   int tunnel_hlen, __be32 dst, bool md)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size;
	int mtu;

	tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
	pkt_size = skb->len - tunnel_hlen;

	if (df)
		mtu = dst_mtu(&rt->dst) - (sizeof(struct iphdr) + tunnel_hlen);
	else
		mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_valid_dst(skb))
		skb_dst_update_pmtu_no_confirm(skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6;
		__be32 daddr;

		rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) :
					   NULL;
		daddr = md ? dst : tunnel->parms.iph.daddr;

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
		    mtu >= IPV6_MIN_MTU) {
			if ((daddr && !ipv4_is_multicast(daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
		    mtu < pkt_size) {
			icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}
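
/* Transmit helper for metadata-based (collect_md) tunnels: all addressing
 * comes from the skb's tunnel metadata rather than from the device
 * configuration.
 */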
void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		       u8 proto, int tunnel_hlen)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	u32 headroom = sizeof(struct iphdr);
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	const struct iphdr *inner_iph;
	struct rtable *rt = NULL;
	struct flowi4 fl4;
	__be16 df = 0;
	u8 tos, ttl;
	bool use_cache;

	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto tx_error;
	key = &tun_info->key;
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	tos = key->tos;
	if (tos == 1) {
		if (skb->protocol == htons(ETH_P_IP))
			tos = inner_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
	}
	ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
			    tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
			    0, skb->mark, skb_get_hash(skb));
	if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
		goto tx_error;

	use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
	if (use_cache)
		rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);
		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl4.saddr);
	}
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
		df = htons(IP_DF);
	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
			    key->u.ipv4.dst, true)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = key->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
	if (headroom > dev->needed_headroom)
		dev->needed_headroom = headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		goto tx_dropped;
	}
	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;
tx_error:
	dev->stats.tx_errors++;
	goto kfree;
tx_dropped:
	dev->stats.tx_dropped++;
kfree:
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
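
/* Transmit path for classically configured tunnels. tnl_params normally
 * points at tunnel->parms.iph; a zero daddr there means an NBMA tunnel
 * whose destination is taken from metadata, the inner IPv4 route, or an
 * IPv6 IPv4-compatible address.
 */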
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_info *tun_info = NULL;
	const struct iphdr *inner_iph;
	unsigned int max_headroom;	/* The extra header space needed */
	struct rtable *rt = NULL;	/* Route to the other host */
	bool use_cache = false;
	struct flowi4 fl4;
	bool md = false;
	bool connected;
	u8 tos, ttl;
	__be32 dst;
	__be16 df;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	connected = (tunnel->parms.iph.daddr != 0);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (!skb_dst(skb)) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		tun_info = skb_tunnel_info(skb);
		if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
		    ip_tunnel_info_af(tun_info) == AF_INET &&
		    tun_info->key.u.ipv4.dst) {
			dst = tun_info->key.u.ipv4.dst;
			md = true;
			connected = true;
		}
		else if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		if (!md)
			connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
			    tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
			    tunnel->fwmark, skb_get_hash(skb));

	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
		goto tx_error;

	if (connected && md) {
		use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
		if (use_cache)
			rt = dst_cache_get_ip4(&tun_info->dst_cache,
					       &fl4.saddr);
	} else {
		rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
						   &fl4.saddr) : NULL;
	}

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl4.saddr);
		else if (!md && connected)
			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
					  fl4.saddr);
	}

	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
		df |= (inner_iph->frag_off & htons(IP_DF));

	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu,
			     __u32 fwmark)
{
	ip_tunnel_del(itn, t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link || t->fwmark != fwmark) {
		int mtu;

		t->parms.link = p->link;
		t->fwmark = fwmark;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	dst_cache_reset(&t->dst_cache);
	netdev_state_change(dev);
}
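
/* Kernel-side implementation of the SIOC{GET,ADD,CHG,DEL}TUNNEL ioctls;
 * tunnel drivers wrap it or use it directly as their ndo_tunnel_ctl.
 */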
int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!(p->i_flags & VTI_ISVTI)) {
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags ^ nflags) &
				    (IFF_POINTOPOINT | IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true, 0);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ctl);
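
/* Copy the user's ip_tunnel_parm in and out around ndo_tunnel_ctl. */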
int ip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
{
	struct ip_tunnel_parm p;
	int err;

	if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
		return -EFAULT;
	err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd);
	if (!err && copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
		return -EFAULT;
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
	int max_mtu = IP_MAX_MTU - t_hlen;

	if (new_mtu < ETH_MIN_MTU)
		return -EINVAL;

	if (new_mtu > max_mtu) {
		if (strict)
			return -EINVAL;

		new_mtu = max_mtu;
	}

	dev->mtu = new_mtu;
	return 0;
}
EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	return __ip_tunnel_change_mtu(dev, new_mtu, true);
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	dst_cache_destroy(&tunnel->dst_cache);
	free_percpu(dev->tstats);
}
void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn;

	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

	if (itn->fb_tunnel_dev != dev) {
		ip_tunnel_del(itn, netdev_priv(dev));
		unregister_netdevice_queue(dev, head);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->net;
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);

int ip_tunnel_get_iflink(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->parms.link;
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);
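
/* Per-netns initialization: set up the hash table and, unless fallback
 * tunnels are disabled for this netns, create the fallback device.
 */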
int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
		       struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	itn->rtnl_link_ops = ops;
	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops || !net_has_fallback_tunnels(net)) {
		struct ip_tunnel_net *it_init_net;

		it_init_net = net_generic(&init_net, ip_tnl_net_id);
		itn->type = it_init_net->type;
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing to move it to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
		itn->type = itn->fb_tunnel_dev->type;
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
			      struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}

void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
			   struct rtnl_link_ops *ops)
{
	struct ip_tunnel_net *itn;
	struct net *net;
	LIST_HEAD(list);

	rtnl_lock();
	list_for_each_entry(net, net_list, exit_list) {
		itn = net_generic(net, id);
		ip_tunnel_destroy(net, itn, &list, ops);
	}
	unregister_netdevice_many(&list);
	rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);
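
/* rtnetlink newlink helper shared by tunnel drivers such as ipip/gre/vti:
 * reject duplicates, register the device and set an MTU honouring IFLA_MTU.
 */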
int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (nt->collect_md) {
		if (rtnl_dereference(itn->collect_md_tun))
			return -EEXIST;
	} else {
		if (ip_tunnel_find(itn, p, dev->type))
			return -EEXIST;
	}

	nt->net = net;
	nt->parms = *p;
	nt->fwmark = fwmark;
	err = register_netdevice(dev);
	if (err)
		goto err_register_netdevice;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (tb[IFLA_MTU]) {
		unsigned int max = IP_MAX_MTU - (nt->hlen + sizeof(struct iphdr));

		mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, max);
	}

	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	ip_tunnel_add(itn, nt);
	return 0;

err_dev_set_mtu:
	unregister_netdevice(dev);
err_register_netdevice:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
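
/* ndo_init/ndo_uninit helpers shared by ip_tunnel users: allocate per-cpu
 * stats, the dst cache and GRO cells, and tear them down in reverse order
 * on failure or removal.
 */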
int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int err;

	dev->needs_free_netdev = true;
	dev->priv_destructor = ip_tunnel_dev_free;
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
	if (err) {
		free_percpu(dev->tstats);
		return err;
	}

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		dst_cache_destroy(&tunnel->dst_cache);
		free_percpu(dev->tstats);
		return err;
	}

	tunnel->dev = dev;
	tunnel->net = dev_net(dev);
	strcpy(tunnel->parms.name, dev->name);
	iph->version = 4;
	iph->ihl = 5;

	if (tunnel->collect_md)
		netif_keep_dst(dev);

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);
void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	ip_tunnel_del(itn, netdev_priv(dev));
	if (itn->fb_tunnel_dev == dev)
		WRITE_ONCE(itn->fb_tunnel_dev, NULL);

	dst_cache_reset(&tunnel->dst_cache);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
/* Do least required initialization, rest of init is done in tunnel_init call */
void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);

MODULE_LICENSE("GPL");