2 * Linux NET3: GRE over IP protocol decoder.
4 * Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/slab.h>
18 #include <asm/uaccess.h>
19 #include <linux/skbuff.h>
20 #include <linux/netdevice.h>
22 #include <linux/tcp.h>
23 #include <linux/udp.h>
24 #include <linux/if_arp.h>
25 #include <linux/mroute.h>
26 #include <linux/init.h>
27 #include <linux/in6.h>
28 #include <linux/inetdevice.h>
29 #include <linux/igmp.h>
30 #include <linux/netfilter_ipv4.h>
31 #include <linux/etherdevice.h>
32 #include <linux/if_ether.h>
37 #include <net/protocol.h>
40 #include <net/checksum.h>
41 #include <net/dsfield.h>
42 #include <net/inet_ecn.h>
44 #include <net/net_namespace.h>
45 #include <net/netns/generic.h>
46 #include <net/rtnetlink.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
58 1. The most important issue is detecting local dead loops.
59 They would cause complete host lockup in transmit, which
60 would be "resolved" by stack overflow or, if queueing is enabled,
61 with infinite looping in net_bh.
63 We cannot track such dead loops during route installation,
64 it is infeasible task. The most general solutions would be
65 to keep skb->encapsulation counter (sort of local ttl),
66 and silently drop packet when it expires. It is the best
67 solution, but it supposes maintaining a new variable in ALL
68 skb, even if no tunneling is used.
70 Current solution: HARD_TX_LOCK lock breaks dead loops.
74 2. Networking dead loops would not kill routers, but would really
75 kill network. IP hop limit plays role of "t->recursion" in this case,
76 if we copy it from packet being encapsulated to upper header.
77 It is very good solution, but it introduces two problems:
79 - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
80 do not work over tunnels.
81 - traceroute does not work. I planned to relay ICMP from tunnel,
82 so that this problem would be solved and traceroute output
83 would be even more informative. This idea appeared to be wrong:
84 only Linux complies to rfc1812 now (yes, guys, Linux is the only
85 true router now :-)), all routers (at least, in neighbourhood of mine)
86 return only 8 bytes of payload. It is the end.
88 Hence, if we want that OSPF worked or traceroute said something reasonable,
89 we should search for another solution.
91 One of them is to parse packet trying to detect inner encapsulation
92 made by our node. It is difficult or even impossible, especially,
93 taking into account fragmentation. To be short, it is not a solution at all.
95 Current solution: The solution was UNEXPECTEDLY SIMPLE.
96 We force DF flag on tunnels with preconfigured hop limit,
97 that is ALL. :-) Well, it does not remove the problem completely,
98 but exponential growth of network traffic is changed to linear
99 (branches, that exceed pmtu are pruned) and tunnel mtu
100 fastly degrades to value <68, where looping stops.
101 Yes, it is not good if there exists a router in the loop,
102 which does not force DF, even when encapsulating packets have DF set.
103 But it is not our problem! Nobody could accuse us, we made
104 all that we could make. Even if it is your gated who injected
105 fatal route to network, even if it were you who configured
106 fatal static route: you are innocent. :-)
110 3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
111 practically identical code. It would be good to glue them
112 together, but it is not very evident, how to make them modular.
113 sit is integral part of IPv6, ipip and gre are naturally modular.
114 We could extract common parts (hash table, ioctl etc)
115 to a separate module (ip_tunnel.c).
/* Forward declarations for the GRE tunnel netdev callbacks defined below. */
120 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
121 static int ipgre_tunnel_init(struct net_device *dev);
122 static void ipgre_tunnel_setup(struct net_device *dev);
123 static int ipgre_tunnel_bind_dev(struct net_device *dev);
125 /* Fallback tunnel: no source, no destination, no key, no options */
/* Per-network-namespace id used with net_generic() to locate our state. */
129 static int ipgre_net_id __read_mostly;
/* 4 chain classes x HASH_SIZE buckets; classes selected by the
 * tunnels_* macros below.
 * NOTE(review): the enclosing struct ipgre_net definition is not
 * visible in this excerpt — these two fields appear to be its members. */
131 struct ip_tunnel *tunnels[4][HASH_SIZE];
133 struct net_device *fb_tunnel_dev;
136 /* Tunnel hash table */
146 We require exact key match i.e. if a key is present in packet
147 it will match only tunnel with the same key; if it is not present,
148 it will match only keyless tunnel.
150 All keyless packets, if not matching any configured keyless tunnel,
151 will match fallback tunnel.
/* 4-bit hash over an IPv4 address or key: xor of the low two nibbles. */
154 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
/* Chain classes, most to least specific: remote+local, remote, local, wildcard. */
156 #define tunnels_r_l tunnels[3]
157 #define tunnels_r tunnels[2]
158 #define tunnels_l tunnels[1]
159 #define tunnels_wc tunnels[0]
161 * Locking : hash tables are protected by RCU and a spinlock
163 static DEFINE_SPINLOCK(ipgre_lock);
/* Iterate an RCU-protected tunnel chain; expects a local 'struct ip_tunnel *t'. */
165 #define for_each_ip_tunnel_rcu(start) \
166 for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
168 /* Given src, dst and key, find appropriate for input tunnel. */
/*
 * Walk the four hash chains from most specific (remote+local) to least
 * specific (wildcard), keeping the best-scoring live (IFF_UP) candidate;
 * if nothing matches, fall back to the per-netns fallback device when it
 * is up.  An ETH_P_TEB inner protocol selects ARPHRD_ETHER (gretap)
 * devices, anything else plain ARPHRD_IPGRE.
 * NOTE(review): the score computation, 'continue' statements and returns
 * are elided in this excerpt; comments describe only the visible code.
 */
170 static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
171 __be32 remote, __be32 local,
172 __be32 key, __be16 gre_proto)
174 struct net *net = dev_net(dev);
175 int link = dev->ifindex;
176 unsigned h0 = HASH(remote);
177 unsigned h1 = HASH(key);
178 struct ip_tunnel *t, *cand = NULL;
179 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
180 int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
181 ARPHRD_ETHER : ARPHRD_IPGRE;
/* cand_score starts above any achievable score so the first match wins. */
182 int score, cand_score = 4;
/* Pass 1: exact remote + local + key. */
184 for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
185 if (local != t->parms.iph.saddr ||
186 remote != t->parms.iph.daddr ||
187 key != t->parms.i_key ||
188 !(t->dev->flags & IFF_UP))
191 if (t->dev->type != ARPHRD_IPGRE &&
192 t->dev->type != dev_type)
196 if (t->parms.link != link)
198 if (t->dev->type != dev_type)
203 if (score < cand_score) {
/* Pass 2: remote + key only. */
209 for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
210 if (remote != t->parms.iph.daddr ||
211 key != t->parms.i_key ||
212 !(t->dev->flags & IFF_UP))
215 if (t->dev->type != ARPHRD_IPGRE &&
216 t->dev->type != dev_type)
220 if (t->parms.link != link)
222 if (t->dev->type != dev_type)
227 if (score < cand_score) {
/* Pass 3: local address (or multicast destination) + key. */
233 for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
234 if ((local != t->parms.iph.saddr &&
235 (local != t->parms.iph.daddr ||
236 !ipv4_is_multicast(local))) ||
237 key != t->parms.i_key ||
238 !(t->dev->flags & IFF_UP))
241 if (t->dev->type != ARPHRD_IPGRE &&
242 t->dev->type != dev_type)
246 if (t->parms.link != link)
248 if (t->dev->type != dev_type)
253 if (score < cand_score) {
/* Pass 4: wildcard tunnels, matched by key alone. */
259 for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
260 if (t->parms.i_key != key ||
261 !(t->dev->flags & IFF_UP))
264 if (t->dev->type != ARPHRD_IPGRE &&
265 t->dev->type != dev_type)
269 if (t->parms.link != link)
271 if (t->dev->type != dev_type)
276 if (score < cand_score) {
/* No candidate: hand the packet to the fallback device if it is up. */
285 dev = ign->fb_tunnel_dev;
286 if (dev->flags & IFF_UP)
287 return netdev_priv(dev);
/*
 * Pick the hash chain head for a tunnel's parameters: bucket index comes
 * from HASH(key); the chain class (prio) depends on which of remote/local
 * are set.  NOTE(review): the prio computation lines are elided here.
 */
292 static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
293 struct ip_tunnel_parm *parms)
295 __be32 remote = parms->iph.daddr;
296 __be32 local = parms->iph.saddr;
297 __be32 key = parms->i_key;
298 unsigned h = HASH(key);
/* Unicast remote bumps specificity; multicast remote is treated as local. */
303 if (remote && !ipv4_is_multicast(remote)) {
308 return &ign->tunnels[prio][h];
/* Convenience wrapper: bucket for an existing tunnel's own parms. */
311 static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
314 return __ipgre_bucket(ign, &t->parms);
/* Insert tunnel at its chain head; writers serialize on ipgre_lock,
 * readers walk the chain under RCU (rcu_assign_pointer publishes t). */
317 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
319 struct ip_tunnel **tp = ipgre_bucket(ign, t);
321 spin_lock_bh(&ipgre_lock);
323 rcu_assign_pointer(*tp, t);
324 spin_unlock_bh(&ipgre_lock);
/* Remove tunnel from its chain (walk to find the link, unsplice under lock).
 * NOTE(review): the unlink assignment inside the loop is elided here. */
327 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
329 struct ip_tunnel **tp;
331 for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
333 spin_lock_bh(&ipgre_lock);
335 spin_unlock_bh(&ipgre_lock);
/*
 * Exact-match lookup by configuration (not by received packet): all of
 * local, remote, key, link and device type must match.  Used to detect
 * duplicates before creating a tunnel.
 */
341 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
342 struct ip_tunnel_parm *parms,
345 __be32 remote = parms->iph.daddr;
346 __be32 local = parms->iph.saddr;
347 __be32 key = parms->i_key;
348 int link = parms->link;
349 struct ip_tunnel *t, **tp;
350 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
352 for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
353 if (local == t->parms.iph.saddr &&
354 remote == t->parms.iph.daddr &&
355 key == t->parms.i_key &&
356 link == t->parms.link &&
357 type == t->dev->type)
/*
 * Find a tunnel matching parms or, when 'create' is set, allocate,
 * register and hash a new one (named after parms->name, else "gre%d").
 * NOTE(review): error-unwind paths and several branches are elided in
 * this excerpt.
 */
363 static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
364 struct ip_tunnel_parm *parms, int create)
366 struct ip_tunnel *t, *nt;
367 struct net_device *dev;
369 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
371 t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
376 strlcpy(name, parms->name, IFNAMSIZ);
378 sprintf(name, "gre%%d");
380 dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
384 dev_net_set(dev, net);
/* "%d" template: let the core pick a free index. */
386 if (strchr(name, '%')) {
387 if (dev_alloc_name(dev, name) < 0)
391 nt = netdev_priv(dev);
393 dev->rtnl_link_ops = &ipgre_link_ops;
395 dev->mtu = ipgre_tunnel_bind_dev(dev);
397 if (register_netdevice(dev) < 0)
401 ipgre_tunnel_link(ign, nt);
/* ndo_uninit: drop the tunnel from the per-netns hash on unregister. */
409 static void ipgre_tunnel_uninit(struct net_device *dev)
411 struct net *net = dev_net(dev);
412 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
414 ipgre_tunnel_unlink(ign, netdev_priv(dev));
/*
 * ICMP error handler for IPPROTO_GRE: map errors on the outer header back
 * to the matching tunnel and rate-limit err_count via err_time.
 * NOTE(review): the flags extraction, switch scaffolding and several
 * returns are elided in this excerpt.
 */
419 static void ipgre_err(struct sk_buff *skb, u32 info)
422 /* All the routers (except for Linux) return only
423 8 bytes of packet payload. It means, that precise relaying of
424 ICMP in the real Internet is absolutely infeasible.
426 Moreover, Cisco "wise men" put GRE key to the third word
427 in GRE header. It makes impossible maintaining even soft state for keyed
428 GRE tunnels with enabled checksum. Tell them "thank you".
430 Well, I wonder, rfc1812 was written by Cisco employee,
431 what the hell these idiots break standards established
435 struct iphdr *iph = (struct iphdr *)skb->data;
436 __be16 *p = (__be16*)(skb->data+(iph->ihl<<2));
437 int grehlen = (iph->ihl<<2) + 4;
438 const int type = icmp_hdr(skb)->type;
439 const int code = icmp_hdr(skb)->code;
444 if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
445 if (flags&(GRE_VERSION|GRE_ROUTING))
454 /* If only 8 bytes returned, keyed message will be dropped here */
455 if (skb_headlen(skb) < grehlen)
460 case ICMP_PARAMETERPROB:
463 case ICMP_DEST_UNREACH:
466 case ICMP_PORT_UNREACH:
467 /* Impossible event. */
469 case ICMP_FRAG_NEEDED:
470 /* Soft state for pmtu is maintained by IP core. */
473 /* All others are translated to HOST_UNREACH.
474 rfc2003 contains "deep thoughts" about NET_UNREACH,
475 I believe they are just ether pollution. --ANK
480 case ICMP_TIME_EXCEEDED:
481 if (code != ICMP_EXC_TTL)
/* Note reversed addresses: the ICMP quotes our transmitted packet. */
487 t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
489 *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
491 if (t == NULL || t->parms.iph.daddr == 0 ||
492 ipv4_is_multicast(t->parms.iph.daddr))
/* TTL 0 means "copy inner TTL": TIME_EXCEEDED is then expected. */
495 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
498 if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
502 t->err_time = jiffies;
/* On decap: if the outer header carried ECN CE, propagate the CE mark
 * into the inner IPv4 or IPv6 header. */
508 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
510 if (INET_ECN_is_ce(iph->tos)) {
511 if (skb->protocol == htons(ETH_P_IP)) {
512 IP_ECN_set_ce(ip_hdr(skb));
513 } else if (skb->protocol == htons(ETH_P_IPV6)) {
514 IP6_ECN_set_ce(ipv6_hdr(skb));
/* On encap: combine the configured outer TOS with the inner packet's
 * ECN bits (inner DSCP/traffic class read per protocol). */
520 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
523 if (skb->protocol == htons(ETH_P_IP))
524 inner = old_iph->tos;
525 else if (skb->protocol == htons(ETH_P_IPV6))
526 inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
527 return INET_ECN_encapsulate(tos, inner);
/*
 * GRE receive handler (registered as the IPPROTO_GRE net_protocol
 * handler).  Parse the GRE header (flags, optional checksum/key/seqno),
 * look up the owning tunnel, validate checksum/sequence, strip the
 * headers and re-inject the inner packet on the tunnel device.
 * NOTE(review): many interior lines (offset bookkeeping, drops, the
 * final netif_rx and rcu_read_unlock) are elided in this excerpt.
 */
530 static int ipgre_rcv(struct sk_buff *skb)
538 struct ip_tunnel *tunnel;
/* 16 = max outer-visible GRE header: 4 base + csum + key + seq words. */
543 if (!pskb_may_pull(skb, 16))
550 if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
551 /* - Version must be 0.
552 - We do not support routing headers.
554 if (flags&(GRE_VERSION|GRE_ROUTING))
557 if (flags&GRE_CSUM) {
558 switch (skb->ip_summed) {
559 case CHECKSUM_COMPLETE:
560 csum = csum_fold(skb->csum);
/* Hardware did not verify: compute the checksum ourselves. */
566 csum = __skb_checksum_complete(skb);
567 skb->ip_summed = CHECKSUM_COMPLETE;
572 key = *(__be32*)(h + offset);
576 seqno = ntohl(*(__be32*)(h + offset));
581 gre_proto = *(__be16 *)(h + 2);
584 if ((tunnel = ipgre_tunnel_lookup(skb->dev,
585 iph->saddr, iph->daddr, key,
587 struct net_device_stats *stats = &tunnel->dev->stats;
591 skb->protocol = gre_proto;
592 /* WCCP version 1 and 2 protocol decoding.
593 * - Change protocol to IP
594 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
596 if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
597 skb->protocol = htons(ETH_P_IP);
/* 0x40 in the top nibble = inner IPv4; otherwise WCCPv2 redirect hdr. */
598 if ((*(h + offset) & 0xF0) != 0x40)
602 skb->mac_header = skb->network_header;
603 __pskb_pull(skb, offset);
604 skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
605 skb->pkt_type = PACKET_HOST;
606 #ifdef CONFIG_NET_IPGRE_BROADCAST
607 if (ipv4_is_multicast(iph->daddr)) {
608 /* Looped back packet, drop it! */
609 if (skb_rtable(skb)->fl.iif == 0)
612 skb->pkt_type = PACKET_BROADCAST;
/* Drop on checksum failure, or when the tunnel demands a checksum
 * but the packet carried none. */
616 if (((flags&GRE_CSUM) && csum) ||
617 (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
618 stats->rx_crc_errors++;
/* In-order enforcement for tunnels configured with GRE_SEQ. */
622 if (tunnel->parms.i_flags&GRE_SEQ) {
623 if (!(flags&GRE_SEQ) ||
624 (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
625 stats->rx_fifo_errors++;
629 tunnel->i_seqno = seqno + 1;
634 /* Warning: All skb pointers will be invalidated! */
635 if (tunnel->dev->type == ARPHRD_ETHER) {
636 if (!pskb_may_pull(skb, ETH_HLEN)) {
637 stats->rx_length_errors++;
643 skb->protocol = eth_type_trans(skb, tunnel->dev);
644 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
648 stats->rx_bytes += len;
649 skb->dev = tunnel->dev;
653 skb_reset_network_header(skb);
654 ipgre_ecn_decapsulate(iph, skb);
/* No tunnel claimed the packet: report port unreachable to the sender. */
660 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
/*
 * Transmit path: resolve the outer destination, route it, handle PMTU
 * for inner IPv4/IPv6, ensure headroom, then prepend the outer IP+GRE
 * header and send.  Returns NETDEV_TX_OK (error paths free the skb and
 * bump stats).
 * NOTE(review): a large number of interior lines (variable declarations
 * such as tiph/dst/tos/df/mtu/gre_hlen, error labels, ip_select_ident
 * and the final IPTUNNEL_XMIT) are elided in this excerpt; comments
 * describe only the visible code.
 */
669 static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
671 struct ip_tunnel *tunnel = netdev_priv(dev);
672 struct net_device_stats *stats = &dev->stats;
673 struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
674 struct iphdr *old_iph = ip_hdr(skb);
678 struct rtable *rt; /* Route to the other host */
679 struct net_device *tdev; /* Device to other host */
680 struct iphdr *iph; /* Our new IP header */
681 unsigned int max_headroom; /* The extra header space needed */
686 if (dev->type == ARPHRD_ETHER)
687 IPCB(skb)->flags = 0;
/* header_ops path (NBMA-style): the outer header was pre-built by
 * ipgre_header() and sits at skb->data. */
689 if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
691 tiph = (struct iphdr *)skb->data;
693 gre_hlen = tunnel->hlen;
694 tiph = &tunnel->parms.iph;
/* No configured remote: derive the outer destination from the route
 * (IPv4 gateway) or an IPv4-compatible IPv6 neighbour address. */
697 if ((dst = tiph->daddr) == 0) {
700 if (skb_dst(skb) == NULL) {
701 stats->tx_fifo_errors++;
705 if (skb->protocol == htons(ETH_P_IP)) {
706 rt = skb_rtable(skb);
707 if ((dst = rt->rt_gateway) == 0)
711 else if (skb->protocol == htons(ETH_P_IPV6)) {
712 struct in6_addr *addr6;
714 struct neighbour *neigh = skb_dst(skb)->neighbour;
719 addr6 = (struct in6_addr *)&neigh->primary_key;
720 addr_type = ipv6_addr_type(addr6);
722 if (addr_type == IPV6_ADDR_ANY) {
723 addr6 = &ipv6_hdr(skb)->daddr;
724 addr_type = ipv6_addr_type(addr6);
727 if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
/* ::a.b.c.d — the v4 address lives in the last 32 bits. */
730 dst = addr6->s6_addr32[3];
740 if (skb->protocol == htons(ETH_P_IP))
/* Route the outer packet. */
745 struct flowi fl = { .oif = tunnel->parms.link,
748 .saddr = tiph->saddr,
749 .tos = RT_TOS(tos) } },
750 .proto = IPPROTO_GRE };
751 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
752 stats->tx_carrier_errors++;
756 tdev = rt->u.dst.dev;
/* Path MTU bookkeeping for the inner packet. */
766 mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen;
768 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
771 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
773 if (skb->protocol == htons(ETH_P_IP)) {
774 df |= (old_iph->frag_off&htons(IP_DF));
776 if ((old_iph->frag_off&htons(IP_DF)) &&
777 mtu < ntohs(old_iph->tot_len)) {
/* Inner packet has DF and exceeds the path: report FRAG_NEEDED. */
778 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
784 else if (skb->protocol == htons(ETH_P_IPV6)) {
785 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
787 if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
788 if ((tunnel->parms.iph.daddr &&
789 !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
790 rt6->rt6i_dst.plen == 128) {
791 rt6->rt6i_flags |= RTF_MODIFIED;
792 skb_dst(skb)->metrics[RTAX_MTU-1] = mtu;
796 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
797 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
/* Tunnel recently saw ICMP errors: propagate link failure for a while. */
804 if (tunnel->err_count > 0) {
805 if (time_before(jiffies,
806 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
809 dst_link_failure(skb);
811 tunnel->err_count = 0;
/* Ensure enough headroom for link-layer + GRE + outer-dst headers,
 * reallocating (and re-owning) the skb if needed. */
814 max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->u.dst.header_len;
816 if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
817 (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
818 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
819 if (max_headroom > dev->needed_headroom)
820 dev->needed_headroom = max_headroom;
828 skb_set_owner_w(new_skb, skb->sk);
831 old_iph = ip_hdr(skb);
834 skb_reset_transport_header(skb);
835 skb_push(skb, gre_hlen);
836 skb_reset_network_header(skb);
837 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
838 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
841 skb_dst_set(skb, &rt->u.dst);
844 * Push down and install the IPIP header.
849 iph->ihl = sizeof(struct iphdr) >> 2;
851 iph->protocol = IPPROTO_GRE;
852 iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb);
853 iph->daddr = rt->rt_dst;
854 iph->saddr = rt->rt_src;
/* Configured TTL 0 = inherit from inner packet (or route metric). */
856 if ((iph->ttl = tiph->ttl) == 0) {
857 if (skb->protocol == htons(ETH_P_IP))
858 iph->ttl = old_iph->ttl;
860 else if (skb->protocol == htons(ETH_P_IPV6))
861 iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
864 iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
/* GRE base header: flags word then protocol type. */
867 ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
868 ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
869 htons(ETH_P_TEB) : skb->protocol;
/* Optional fields, filled back-to-front: seq, then key, then csum. */
871 if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
872 __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
874 if (tunnel->parms.o_flags&GRE_SEQ) {
876 *ptr = htonl(tunnel->o_seqno);
879 if (tunnel->parms.o_flags&GRE_KEY) {
880 *ptr = tunnel->parms.o_key;
883 if (tunnel->parms.o_flags&GRE_CSUM) {
885 *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
895 dst_link_failure(skb);
/*
 * (Re)compute needed_headroom, hlen and a reasonable MTU for the tunnel
 * from the route toward its configured remote (or the bound link).
 * Returns the computed MTU.
 * NOTE(review): the hlen clamp, ip_rt_put and final return are elided
 * in this excerpt.
 */
903 static int ipgre_tunnel_bind_dev(struct net_device *dev)
905 struct net_device *tdev = NULL;
906 struct ip_tunnel *tunnel;
908 int hlen = LL_MAX_HEADER;
909 int mtu = ETH_DATA_LEN;
/* Base overhead: outer IP header + 4-byte GRE base header. */
910 int addend = sizeof(struct iphdr) + 4;
912 tunnel = netdev_priv(dev);
913 iph = &tunnel->parms.iph;
915 /* Guess output device to choose reasonable mtu and needed_headroom */
918 struct flowi fl = { .oif = tunnel->parms.link,
920 { .daddr = iph->daddr,
922 .tos = RT_TOS(iph->tos) } },
923 .proto = IPPROTO_GRE };
925 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
926 tdev = rt->u.dst.dev;
930 if (dev->type != ARPHRD_ETHER)
931 dev->flags |= IFF_POINTOPOINT;
934 if (!tdev && tunnel->parms.link)
935 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link)
938 hlen = tdev->hard_header_len + tdev->needed_headroom;
941 dev->iflink = tunnel->parms.link;
943 /* Precalculate GRE options length */
944 if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
945 if (tunnel->parms.o_flags&GRE_CSUM)
947 if (tunnel->parms.o_flags&GRE_KEY)
949 if (tunnel->parms.o_flags&GRE_SEQ)
952 dev->needed_headroom = addend + hlen;
953 mtu -= dev->hard_header_len + addend;
958 tunnel->hlen = addend;
/*
 * ndo_do_ioctl: SIOCGETTUNNEL / SIOCADDTUNNEL / SIOCCHGTUNNEL /
 * SIOCDELTUNNEL handling via struct ip_tunnel_parm copied to/from
 * userspace.  Add/change/delete require CAP_NET_ADMIN.
 * NOTE(review): switch/case scaffolding, error labels and the return
 * are elided in this excerpt.
 */
964 ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
967 struct ip_tunnel_parm p;
969 struct net *net = dev_net(dev);
970 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
/* GET: on the fallback device look up by the user-supplied parms,
 * otherwise report this device's own parms. */
975 if (dev == ign->fb_tunnel_dev) {
976 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
980 t = ipgre_tunnel_locate(net, &p, 0);
983 t = netdev_priv(dev);
984 memcpy(&p, &t->parms, sizeof(p));
985 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
/* ADD/CHG path. */
992 if (!capable(CAP_NET_ADMIN))
996 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
/* Sanity-check the outer header template and GRE flag words. */
1000 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
1001 p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
1002 ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
1005 p.iph.frag_off |= htons(IP_DF);
1007 if (!(p.i_flags&GRE_KEY))
1009 if (!(p.o_flags&GRE_KEY))
1012 t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
1014 if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
1016 if (t->dev != dev) {
1021 unsigned nflags = 0;
1023 t = netdev_priv(dev);
1025 if (ipv4_is_multicast(p.iph.daddr))
1026 nflags = IFF_BROADCAST;
1027 else if (p.iph.daddr)
1028 nflags = IFF_POINTOPOINT;
/* Refuse changes that would flip broadcast/p2p device semantics. */
1030 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
/* Re-hash under the new endpoints/keys. */
1034 ipgre_tunnel_unlink(ign, t);
1035 t->parms.iph.saddr = p.iph.saddr;
1036 t->parms.iph.daddr = p.iph.daddr;
1037 t->parms.i_key = p.i_key;
1038 t->parms.o_key = p.o_key;
1039 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1040 memcpy(dev->broadcast, &p.iph.daddr, 4);
1041 ipgre_tunnel_link(ign, t);
1042 netdev_state_change(dev);
1048 if (cmd == SIOCCHGTUNNEL) {
1049 t->parms.iph.ttl = p.iph.ttl;
1050 t->parms.iph.tos = p.iph.tos;
1051 t->parms.iph.frag_off = p.iph.frag_off;
1052 if (t->parms.link != p.link) {
1053 t->parms.link = p.link;
1054 dev->mtu = ipgre_tunnel_bind_dev(dev);
1055 netdev_state_change(dev);
1058 if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
1061 err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
/* DEL path. */
1066 if (!capable(CAP_NET_ADMIN))
1069 if (dev == ign->fb_tunnel_dev) {
1071 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1074 if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
/* The fallback device itself must never be deleted. */
1077 if (t == netdev_priv(ign->fb_tunnel_dev))
1081 unregister_netdevice(dev);
/* ndo_change_mtu: bound new_mtu by the encapsulation overhead.
 * NOTE(review): the lower-bound check and returns are elided here. */
1093 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1095 struct ip_tunnel *tunnel = netdev_priv(dev);
1097 new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1103 /* Nice toy. Unfortunately, useless in real life :-)
1104 It allows to construct virtual multiprotocol broadcast "LAN"
1105 over the Internet, provided multicast routing is tuned.
1108 I have no idea whether this bicycle was invented before me,
1109 so that I had to set ARPHRD_IPGRE to a random value.
1110 I have an impression, that Cisco could make something similar,
1111 but this feature is apparently missing in IOS<=11.2(8).
1113 I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1114 with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1116 ping -t 255 224.66.66.66
1118 If nobody answers, mbone does not work.
1120 ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1121 ip addr add 10.66.66.<somewhat>/24 dev Universe
1122 ifconfig Universe up
1123 ifconfig Universe add fe80::<Your_real_addr>/10
1124 ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1127 ftp fec0:6666:6666::193.233.7.65
/*
 * header_ops.create: pre-build the outer IP + GRE header (used for the
 * broadcast/NBMA mode described above).  daddr/saddr, when given,
 * override the template addresses.
 * NOTE(review): the returns and some assignments are elided here.
 */
1132 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1133 unsigned short type,
1134 const void *daddr, const void *saddr, unsigned len)
1136 struct ip_tunnel *t = netdev_priv(dev);
1137 struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1138 __be16 *p = (__be16*)(iph+1);
1140 memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1141 p[0] = t->parms.o_flags;
1145 * Set the source hardware address.
1149 memcpy(&iph->saddr, saddr, 4);
1151 memcpy(&iph->daddr, daddr, 4);
/* header_ops.parse: the "hardware address" of a GRE device is the outer
 * IPv4 source address. */
1158 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1160 struct iphdr *iph = (struct iphdr *) skb_mac_header(skb);
1161 memcpy(haddr, &iph->saddr, 4);
1165 static const struct header_ops ipgre_header_ops = {
1166 .create = ipgre_header,
1167 .parse = ipgre_header_parse,
1170 #ifdef CONFIG_NET_IPGRE_BROADCAST
/* ndo_open for multicast tunnels: join the multicast group on the
 * device the route to the group resolves to, remembering it in mlink. */
1171 static int ipgre_open(struct net_device *dev)
1173 struct ip_tunnel *t = netdev_priv(dev);
1175 if (ipv4_is_multicast(t->parms.iph.daddr)) {
1176 struct flowi fl = { .oif = t->parms.link,
1178 { .daddr = t->parms.iph.daddr,
1179 .saddr = t->parms.iph.saddr,
1180 .tos = RT_TOS(t->parms.iph.tos) } },
1181 .proto = IPPROTO_GRE };
1183 if (ip_route_output_key(dev_net(dev), &rt, &fl))
1184 return -EADDRNOTAVAIL;
/* 'dev' is rebound to the underlying output device from here on. */
1185 dev = rt->u.dst.dev;
1187 if (__in_dev_get_rtnl(dev) == NULL)
1188 return -EADDRNOTAVAIL;
1189 t->mlink = dev->ifindex;
1190 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
/* ndo_stop: leave the multicast group joined in ipgre_open(). */
1195 static int ipgre_close(struct net_device *dev)
1197 struct ip_tunnel *t = netdev_priv(dev);
1199 if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1200 struct in_device *in_dev;
1201 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1203 ip_mc_dec_group(in_dev, t->parms.iph.daddr);
/* netdev ops for plain (ARPHRD_IPGRE) tunnel devices. */
1212 static const struct net_device_ops ipgre_netdev_ops = {
1213 .ndo_init = ipgre_tunnel_init,
1214 .ndo_uninit = ipgre_tunnel_uninit,
1215 #ifdef CONFIG_NET_IPGRE_BROADCAST
1216 .ndo_open = ipgre_open,
1217 .ndo_stop = ipgre_close,
1219 .ndo_start_xmit = ipgre_tunnel_xmit,
1220 .ndo_do_ioctl = ipgre_tunnel_ioctl,
1221 .ndo_change_mtu = ipgre_tunnel_change_mtu,
/* alloc_netdev setup callback: defaults for a GRE tunnel device
 * (MTU leaves room for outer IP + 4-byte GRE base header). */
1224 static void ipgre_tunnel_setup(struct net_device *dev)
1226 dev->netdev_ops = &ipgre_netdev_ops;
1227 dev->destructor = free_netdev;
1229 dev->type = ARPHRD_IPGRE;
1230 dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1231 dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1232 dev->flags = IFF_NOARP;
1235 dev->features |= NETIF_F_NETNS_LOCAL;
1236 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
/*
 * ndo_init for regular tunnels: record the name, publish the outer
 * addresses as dev_addr/broadcast, and pick header_ops (broadcast mode
 * for multicast destinations).
 */
1239 static int ipgre_tunnel_init(struct net_device *dev)
1241 struct ip_tunnel *tunnel;
1244 tunnel = netdev_priv(dev);
1245 iph = &tunnel->parms.iph;
1248 strcpy(tunnel->parms.name, dev->name);
1250 memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1251 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1254 #ifdef CONFIG_NET_IPGRE_BROADCAST
1255 if (ipv4_is_multicast(iph->daddr)) {
1258 dev->flags = IFF_BROADCAST;
1259 dev->header_ops = &ipgre_header_ops;
1263 dev->header_ops = &ipgre_header_ops;
/* Initialize the per-netns fallback "gre0" device: keyless, wildcard
 * endpoints, permanently linked into the wildcard chain. */
1268 static void ipgre_fb_tunnel_init(struct net_device *dev)
1270 struct ip_tunnel *tunnel = netdev_priv(dev);
1271 struct iphdr *iph = &tunnel->parms.iph;
1272 struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);
1275 strcpy(tunnel->parms.name, dev->name);
1278 iph->protocol = IPPROTO_GRE;
1280 tunnel->hlen = sizeof(struct iphdr) + 4;
1283 ign->tunnels_wc[0] = tunnel;
/* Protocol hooks registered for IPPROTO_GRE in ipgre_init(). */
1287 static const struct net_protocol ipgre_protocol = {
1288 .handler = ipgre_rcv,
1289 .err_handler = ipgre_err,
/* Queue every tunnel device of this netns for unregistration
 * (batched via unregister_netdevice_many in ipgre_exit_net). */
1293 static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1297 for (prio = 0; prio < 4; prio++) {
1299 for (h = 0; h < HASH_SIZE; h++) {
1300 struct ip_tunnel *t = ign->tunnels[prio][h];
1303 unregister_netdevice_queue(t->dev, head);
/* Per-netns init: create and register the fallback "gre0" device.
 * NOTE(review): error labels and the success return are elided here. */
1310 static int __net_init ipgre_init_net(struct net *net)
1312 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1315 ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1316 ipgre_tunnel_setup);
1317 if (!ign->fb_tunnel_dev) {
1321 dev_net_set(ign->fb_tunnel_dev, net);
1323 ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
1324 ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
1326 if ((err = register_netdev(ign->fb_tunnel_dev)))
1332 free_netdev(ign->fb_tunnel_dev);
/* Per-netns teardown: tear down all tunnels in one batch. */
1337 static void __net_exit ipgre_exit_net(struct net *net)
1339 struct ipgre_net *ign;
1342 ign = net_generic(net, ipgre_net_id);
1344 ipgre_destroy_tunnels(ign, &list);
1345 unregister_netdevice_many(&list);
1349 static struct pernet_operations ipgre_net_ops = {
1350 .init = ipgre_init_net,
1351 .exit = ipgre_exit_net,
1352 .id = &ipgre_net_id,
1353 .size = sizeof(struct ipgre_net),
/* Netlink validation for plain GRE links: reject GRE version/routing
 * flag bits in either direction.
 * NOTE(review): the flags declaration and returns are elided here. */
1356 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1364 if (data[IFLA_GRE_IFLAGS])
1365 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS])
1366 if (data[IFLA_GRE_OFLAGS])
1367 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS])
1368 if (flags & (GRE_VERSION|GRE_ROUTING))
/* Netlink validation for gretap links: additionally check the Ethernet
 * address attribute, then defer to the plain-GRE checks. */
1374 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1378 if (tb[IFLA_ADDRESS]) {
1379 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1381 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1382 return -EADDRNOTAVAIL;
1388 if (data[IFLA_GRE_REMOTE]) {
1389 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1395 return ipgre_tunnel_validate(tb, data);
/* Translate IFLA_GRE_* attributes into ip_tunnel_parm; PMTU discovery
 * (outer DF) defaults to on unless explicitly disabled. */
1398 static void ipgre_netlink_parms(struct nlattr *data[],
1399 struct ip_tunnel_parm *parms)
1401 memset(parms, 0, sizeof(*parms));
1403 parms->iph.protocol = IPPROTO_GRE;
1408 if (data[IFLA_GRE_LINK])
1409 parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1411 if (data[IFLA_GRE_IFLAGS])
1412 parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
1414 if (data[IFLA_GRE_OFLAGS])
1415 parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
1417 if (data[IFLA_GRE_IKEY])
1418 parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1420 if (data[IFLA_GRE_OKEY])
1421 parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1423 if (data[IFLA_GRE_LOCAL])
1424 parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
1426 if (data[IFLA_GRE_REMOTE])
1427 parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
1429 if (data[IFLA_GRE_TTL])
1430 parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1432 if (data[IFLA_GRE_TOS])
1433 parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1435 if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
1436 parms->iph.frag_off = htons(IP_DF);
/* ndo_init for gretap (Ethernet-over-GRE) devices. */
1439 static int ipgre_tap_init(struct net_device *dev)
1441 struct ip_tunnel *tunnel;
1443 tunnel = netdev_priv(dev);
1446 strcpy(tunnel->parms.name, dev->name);
1448 ipgre_tunnel_bind_dev(dev);
/* netdev ops for gretap devices: Ethernet MAC handling, no ioctl,
 * no multicast open/close. */
1453 static const struct net_device_ops ipgre_tap_netdev_ops = {
1454 .ndo_init = ipgre_tap_init,
1455 .ndo_uninit = ipgre_tunnel_uninit,
1456 .ndo_start_xmit = ipgre_tunnel_xmit,
1457 .ndo_set_mac_address = eth_mac_addr,
1458 .ndo_validate_addr = eth_validate_addr,
1459 .ndo_change_mtu = ipgre_tunnel_change_mtu,
/* alloc_netdev setup for gretap devices.
 * NOTE(review): the ether_setup() call appears elided in this excerpt. */
1462 static void ipgre_tap_setup(struct net_device *dev)
1467 dev->netdev_ops = &ipgre_tap_netdev_ops;
1468 dev->destructor = free_netdev;
1471 dev->features |= NETIF_F_NETNS_LOCAL;
/*
 * rtnl_link_ops.newlink: parse attributes, refuse duplicates, give
 * gretap devices a random MAC when none was supplied, then register
 * and hash the new tunnel.
 * NOTE(review): error returns and the final return are elided here.
 */
1474 static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
1475 struct nlattr *data[])
1477 struct ip_tunnel *nt;
1478 struct net *net = dev_net(dev);
1479 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1483 nt = netdev_priv(dev);
1484 ipgre_netlink_parms(data, &nt->parms);
1486 if (ipgre_tunnel_find(net, &nt->parms, dev->type))
1489 if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1490 random_ether_addr(dev->dev_addr);
1492 mtu = ipgre_tunnel_bind_dev(dev);
1496 err = register_netdevice(dev);
1501 ipgre_tunnel_link(ign, nt);
/*
 * rtnl_link_ops.changelink: like the SIOCCHGTUNNEL ioctl path, but for
 * netlink; the fallback device cannot be changed, and gretap devices
 * skip the broadcast/p2p flag and dev_addr updates.
 * NOTE(review): several branches and returns are elided here.
 */
1507 static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1508 struct nlattr *data[])
1510 struct ip_tunnel *t, *nt;
1511 struct net *net = dev_net(dev);
1512 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1513 struct ip_tunnel_parm p;
1516 if (dev == ign->fb_tunnel_dev)
1519 nt = netdev_priv(dev);
1520 ipgre_netlink_parms(data, &p);
1522 t = ipgre_tunnel_locate(net, &p, 0);
1530 if (dev->type != ARPHRD_ETHER) {
1531 unsigned nflags = 0;
1533 if (ipv4_is_multicast(p.iph.daddr))
1534 nflags = IFF_BROADCAST;
1535 else if (p.iph.daddr)
1536 nflags = IFF_POINTOPOINT;
/* Refuse changes that would flip broadcast/p2p device semantics. */
1538 if ((dev->flags ^ nflags) &
1539 (IFF_POINTOPOINT | IFF_BROADCAST))
/* Re-hash under the new endpoints/key. */
1543 ipgre_tunnel_unlink(ign, t);
1544 t->parms.iph.saddr = p.iph.saddr;
1545 t->parms.iph.daddr = p.iph.daddr;
1546 t->parms.i_key = p.i_key;
1547 if (dev->type != ARPHRD_ETHER) {
1548 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1549 memcpy(dev->broadcast, &p.iph.daddr, 4);
1551 ipgre_tunnel_link(ign, t);
1552 netdev_state_change(dev);
1555 t->parms.o_key = p.o_key;
1556 t->parms.iph.ttl = p.iph.ttl;
1557 t->parms.iph.tos = p.iph.tos;
1558 t->parms.iph.frag_off = p.iph.frag_off;
1560 if (t->parms.link != p.link) {
1561 t->parms.link = p.link;
1562 mtu = ipgre_tunnel_bind_dev(dev);
1565 netdev_state_change(dev);
/* rtnl_link_ops.get_size: upper bound on the netlink attribute payload
 * for one tunnel.  NOTE(review): the nla_total_size() terms themselves
 * are elided in this excerpt; only the attribute labels remain. */
1571 static size_t ipgre_get_size(const struct net_device *dev)
1576 /* IFLA_GRE_IFLAGS */
1578 /* IFLA_GRE_OFLAGS */
1584 /* IFLA_GRE_LOCAL */
1586 /* IFLA_GRE_REMOTE */
1592 /* IFLA_GRE_PMTUDISC */
/* rtnl_link_ops.fill_info: dump the tunnel's parameters; the NLA_PUT_*
 * macros jump to nla_put_failure (elided here) on overflow. */
1597 static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1599 struct ip_tunnel *t = netdev_priv(dev);
1600 struct ip_tunnel_parm *p = &t->parms;
1602 NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
1603 NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
1604 NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
1605 NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
1606 NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
1607 NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
1608 NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
1609 NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
1610 NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
1611 NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
/* Attribute policy for IFLA_GRE_* validation. */
1619 static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1620 [IFLA_GRE_LINK] = { .type = NLA_U32 },
1621 [IFLA_GRE_IFLAGS] = { .type = NLA_U16 },
1622 [IFLA_GRE_OFLAGS] = { .type = NLA_U16 },
1623 [IFLA_GRE_IKEY] = { .type = NLA_U32 },
1624 [IFLA_GRE_OKEY] = { .type = NLA_U32 },
1625 [IFLA_GRE_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1626 [IFLA_GRE_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1627 [IFLA_GRE_TTL] = { .type = NLA_U8 },
1628 [IFLA_GRE_TOS] = { .type = NLA_U8 },
1629 [IFLA_GRE_PMTUDISC] = { .type = NLA_U8 },
/* rtnl link ops for plain "gre" links.
 * NOTE(review): the .kind initializer appears elided in this excerpt. */
1632 static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1634 .maxtype = IFLA_GRE_MAX,
1635 .policy = ipgre_policy,
1636 .priv_size = sizeof(struct ip_tunnel),
1637 .setup = ipgre_tunnel_setup,
1638 .validate = ipgre_tunnel_validate,
1639 .newlink = ipgre_newlink,
1640 .changelink = ipgre_changelink,
1641 .get_size = ipgre_get_size,
1642 .fill_info = ipgre_fill_info,
/* rtnl link ops for "gretap" (Ethernet-over-GRE) links. */
1645 static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1647 .maxtype = IFLA_GRE_MAX,
1648 .policy = ipgre_policy,
1649 .priv_size = sizeof(struct ip_tunnel),
1650 .setup = ipgre_tap_setup,
1651 .validate = ipgre_tap_validate,
1652 .newlink = ipgre_newlink,
1653 .changelink = ipgre_changelink,
1654 .get_size = ipgre_get_size,
1655 .fill_info = ipgre_fill_info,
1659 * And now the modules code and kernel interface.
/*
 * Module init: register pernet state, the IPPROTO_GRE protocol handler
 * and both rtnl link ops; unwind in reverse order on failure.
 */
1662 static int __init ipgre_init(void)
1666 printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1668 err = register_pernet_device(&ipgre_net_ops);
1672 err = inet_add_protocol(&ipgre_protocol, IPPROTO_GRE);
1674 printk(KERN_INFO "ipgre init: can't add protocol\n");
1675 goto add_proto_failed;
1678 err = rtnl_link_register(&ipgre_link_ops);
1680 goto rtnl_link_failed;
1682 err = rtnl_link_register(&ipgre_tap_ops);
1684 goto tap_ops_failed;
/* Error unwind labels (reverse order of registration). */
1690 rtnl_link_unregister(&ipgre_link_ops);
1692 inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
1694 unregister_pernet_device(&ipgre_net_ops);
/* Module exit: tear down in the reverse order of ipgre_init(). */
1698 static void __exit ipgre_fini(void)
1700 rtnl_link_unregister(&ipgre_tap_ops);
1701 rtnl_link_unregister(&ipgre_link_ops);
1702 if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
1703 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1704 unregister_pernet_device(&ipgre_net_ops);
1707 module_init(ipgre_init);
1708 module_exit(ipgre_fini);
1709 MODULE_LICENSE("GPL");
1710 MODULE_ALIAS_RTNL_LINK("gre");
1711 MODULE_ALIAS_RTNL_LINK("gretap");