vfs: add lookup_open()
[platform/adaptation/renesas_rcar/renesas_kernel.git] / net / ipv4 / ip_gre.c
1 /*
2  *      Linux NET3:     GRE over IP protocol decoder.
3  *
4  *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *      This program is free software; you can redistribute it and/or
7  *      modify it under the terms of the GNU General Public License
8  *      as published by the Free Software Foundation; either version
9  *      2 of the License, or (at your option) any later version.
10  *
11  */
12
13 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
14
15 #include <linux/capability.h>
16 #include <linux/module.h>
17 #include <linux/types.h>
18 #include <linux/kernel.h>
19 #include <linux/slab.h>
20 #include <asm/uaccess.h>
21 #include <linux/skbuff.h>
22 #include <linux/netdevice.h>
23 #include <linux/in.h>
24 #include <linux/tcp.h>
25 #include <linux/udp.h>
26 #include <linux/if_arp.h>
27 #include <linux/mroute.h>
28 #include <linux/init.h>
29 #include <linux/in6.h>
30 #include <linux/inetdevice.h>
31 #include <linux/igmp.h>
32 #include <linux/netfilter_ipv4.h>
33 #include <linux/etherdevice.h>
34 #include <linux/if_ether.h>
35
36 #include <net/sock.h>
37 #include <net/ip.h>
38 #include <net/icmp.h>
39 #include <net/protocol.h>
40 #include <net/ipip.h>
41 #include <net/arp.h>
42 #include <net/checksum.h>
43 #include <net/dsfield.h>
44 #include <net/inet_ecn.h>
45 #include <net/xfrm.h>
46 #include <net/net_namespace.h>
47 #include <net/netns/generic.h>
48 #include <net/rtnetlink.h>
49 #include <net/gre.h>
50
51 #if IS_ENABLED(CONFIG_IPV6)
52 #include <net/ipv6.h>
53 #include <net/ip6_fib.h>
54 #include <net/ip6_route.h>
55 #endif
56
57 /*
58    Problems & solutions
59    --------------------
60
61    1. The most important issue is detecting local dead loops.
62    They would cause complete host lockup in transmit, which
63    would be "resolved" by stack overflow or, if queueing is enabled,
64    with infinite looping in net_bh.
65
66    We cannot track such dead loops during route installation,
67    it is infeasible task. The most general solutions would be
68    to keep skb->encapsulation counter (sort of local ttl),
69    and silently drop packet when it expires. It is a good
70    solution, but it supposes maintaining new variable in ALL
71    skb, even if no tunneling is used.
72
73    Current solution: xmit_recursion breaks dead loops. This is a percpu
74    counter, since when we enter the first ndo_xmit(), cpu migration is
75    forbidden. We force an exit if this counter reaches RECURSION_LIMIT
76
77    2. Networking dead loops would not kill routers, but would really
78    kill network. IP hop limit plays role of "t->recursion" in this case,
79    if we copy it from packet being encapsulated to upper header.
80    It is very good solution, but it introduces two problems:
81
82    - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
83      do not work over tunnels.
84    - traceroute does not work. I planned to relay ICMP from tunnel,
85      so that this problem would be solved and traceroute output
86      would even more informative. This idea appeared to be wrong:
87      only Linux complies to rfc1812 now (yes, guys, Linux is the only
88      true router now :-)), all routers (at least, in neighbourhood of mine)
89      return only 8 bytes of payload. It is the end.
90
91    Hence, if we want that OSPF worked or traceroute said something reasonable,
92    we should search for another solution.
93
94    One of them is to parse packet trying to detect inner encapsulation
95    made by our node. It is difficult or even impossible, especially,
96    taking into account fragmentation. TO be short, ttl is not solution at all.
97
98    Current solution: The solution was UNEXPECTEDLY SIMPLE.
99    We force DF flag on tunnels with preconfigured hop limit,
100    that is ALL. :-) Well, it does not remove the problem completely,
101    but exponential growth of network traffic is changed to linear
102    (branches, that exceed pmtu are pruned) and tunnel mtu
103    rapidly degrades to value <68, where looping stops.
104    Yes, it is not good if there exists a router in the loop,
105    which does not force DF, even when encapsulating packets have DF set.
106    But it is not our problem! Nobody could accuse us, we made
107    all that we could make. Even if it is your gated who injected
108    fatal route to network, even if it were you who configured
109    fatal static route: you are innocent. :-)
110
111
112
113    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
114    practically identical code. It would be good to glue them
115    together, but it is not very evident, how to make them modular.
116    sit is integral part of IPv6, ipip and gre are naturally modular.
117    We could extract common parts (hash table, ioctl etc)
118    to a separate module (ip_tunnel.c).
119
120    Alexey Kuznetsov.
121  */
122
123 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
124 static int ipgre_tunnel_init(struct net_device *dev);
125 static void ipgre_tunnel_setup(struct net_device *dev);
126 static int ipgre_tunnel_bind_dev(struct net_device *dev);
127
128 /* Fallback tunnel: no source, no destination, no key, no options */
129
130 #define HASH_SIZE  16
131
132 static int ipgre_net_id __read_mostly;
struct ipgre_net {
	/* Tunnels hashed four ways by address specificity; see the
	 * "4 hash tables" comment further down for the index meaning. */
	struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];

	/* Fallback device that catches otherwise unmatched packets. */
	struct net_device *fb_tunnel_dev;
};
138
139 /* Tunnel hash table */
140
141 /*
142    4 hash tables:
143
144    3: (remote,local)
145    2: (remote,*)
146    1: (*,local)
147    0: (*,*)
148
149    We require exact key match i.e. if a key is present in packet
150    it will match only tunnel with the same key; if it is not present,
151    it will match only keyless tunnel.
152
153    All keysless packets, if not matched configured keyless tunnels
154    will match fallback tunnel.
155  */
156
/* Fold a 32-bit address (or key) into a 4-bit hash bucket index. */
#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)

/* Aliases for the four tables in struct ipgre_net, most to least specific. */
#define tunnels_r_l     tunnels[3]	/* (remote, local) */
#define tunnels_r       tunnels[2]	/* (remote, *)     */
#define tunnels_l       tunnels[1]	/* (*, local)      */
#define tunnels_wc      tunnels[0]	/* (*, *) wildcard */
/*
 * Locking : hash tables are protected by RCU and RTNL
 */

/* Walk one hash chain under RCU; expects a local 'struct ip_tunnel *t'. */
#define for_each_ip_tunnel_rcu(start) \
	for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
169
170 /* often modified stats are per cpu, other are shared (netdev->stats) */
struct pcpu_tstats {
	u64     rx_packets;	/* packets received on this cpu */
	u64     rx_bytes;	/* bytes received on this cpu */
	u64     tx_packets;	/* packets transmitted from this cpu */
	u64     tx_bytes;	/* bytes transmitted from this cpu */
	/* Sequence counter so 32-bit hosts read the u64s untorn. */
	struct u64_stats_sync   syncp;
};
178
179 static struct rtnl_link_stats64 *ipgre_get_stats64(struct net_device *dev,
180                                                    struct rtnl_link_stats64 *tot)
181 {
182         int i;
183
184         for_each_possible_cpu(i) {
185                 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
186                 u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
187                 unsigned int start;
188
189                 do {
190                         start = u64_stats_fetch_begin_bh(&tstats->syncp);
191                         rx_packets = tstats->rx_packets;
192                         tx_packets = tstats->tx_packets;
193                         rx_bytes = tstats->rx_bytes;
194                         tx_bytes = tstats->tx_bytes;
195                 } while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
196
197                 tot->rx_packets += rx_packets;
198                 tot->tx_packets += tx_packets;
199                 tot->rx_bytes   += rx_bytes;
200                 tot->tx_bytes   += tx_bytes;
201         }
202
203         tot->multicast = dev->stats.multicast;
204         tot->rx_crc_errors = dev->stats.rx_crc_errors;
205         tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
206         tot->rx_length_errors = dev->stats.rx_length_errors;
207         tot->rx_errors = dev->stats.rx_errors;
208         tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
209         tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
210         tot->tx_dropped = dev->stats.tx_dropped;
211         tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
212         tot->tx_errors = dev->stats.tx_errors;
213
214         return tot;
215 }
216
217 /* Given src, dst and key, find appropriate for input tunnel. */
218
/* Given src, dst and key, find appropriate for input tunnel.
 *
 * Runs under rcu_read_lock().  The four hash tables are tried from
 * most to least specific; a candidate matching on link AND device
 * type (score 0) wins immediately, otherwise the best-scoring
 * candidate across all tables is kept.  If nothing matches, the
 * namespace fallback device is returned when it is up.
 */
static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
					     __be32 remote, __be32 local,
					     __be32 key, __be16 gre_proto)
{
	struct net *net = dev_net(dev);
	int link = dev->ifindex;
	unsigned int h0 = HASH(remote);
	unsigned int h1 = HASH(key);
	struct ip_tunnel *t, *cand = NULL;
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	/* ETH_P_TEB payloads belong to Ethernet (gretap) devices. */
	int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
		       ARPHRD_ETHER : ARPHRD_IPGRE;
	/* score bit 0 = link mismatch, bit 1 = device type mismatch;
	 * cand_score starts at 4, worse than any achievable score. */
	int score, cand_score = 4;

	/* Table 3: exact (remote, local) match. */
	for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		/* A plain ARPHRD_IPGRE device may receive either proto;
		 * any other type must equal the wanted dev_type. */
		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	/* Table 2: (remote, *) — remote address only. */
	for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
		if (remote != t->parms.iph.daddr ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	/* Table 1: (*, local) — also matches a multicast "local" that
	 * is configured as the tunnel's destination. */
	for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
		if ((local != t->parms.iph.saddr &&
		     (local != t->parms.iph.daddr ||
		      !ipv4_is_multicast(local))) ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	/* Table 0: (*, *) wildcard — key must still match exactly. */
	for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
		if (t->parms.i_key != key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	if (cand != NULL)
		return cand;

	/* Last resort: the per-namespace fallback tunnel. */
	dev = ign->fb_tunnel_dev;
	if (dev->flags & IFF_UP)
		return netdev_priv(dev);

	return NULL;
}
340
341 static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
342                 struct ip_tunnel_parm *parms)
343 {
344         __be32 remote = parms->iph.daddr;
345         __be32 local = parms->iph.saddr;
346         __be32 key = parms->i_key;
347         unsigned int h = HASH(key);
348         int prio = 0;
349
350         if (local)
351                 prio |= 1;
352         if (remote && !ipv4_is_multicast(remote)) {
353                 prio |= 2;
354                 h ^= HASH(remote);
355         }
356
357         return &ign->tunnels[prio][h];
358 }
359
360 static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
361                 struct ip_tunnel *t)
362 {
363         return __ipgre_bucket(ign, &t->parms);
364 }
365
/* Insert @t at the head of its hash chain.  Caller holds RTNL;
 * readers traverse under RCU, hence the publication order below. */
static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);

	/* Set t->next before making t visible so an RCU reader that
	 * finds the new node always sees a valid successor. */
	rcu_assign_pointer(t->next, rtnl_dereference(*tp));
	rcu_assign_pointer(*tp, t);
}
373
/* Remove @t from its hash chain.  Caller holds RTNL.  Only the
 * predecessor's next pointer is rewritten, so concurrent RCU readers
 * always see a consistent chain. */
static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp;
	struct ip_tunnel *iter;

	/* Walk via pointer-to-pointer so the head and interior cases
	 * need no special handling. */
	for (tp = ipgre_bucket(ign, t);
	     (iter = rtnl_dereference(*tp)) != NULL;
	     tp = &iter->next) {
		if (t == iter) {
			rcu_assign_pointer(*tp, t->next);
			break;
		}
	}
}
388
389 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
390                                            struct ip_tunnel_parm *parms,
391                                            int type)
392 {
393         __be32 remote = parms->iph.daddr;
394         __be32 local = parms->iph.saddr;
395         __be32 key = parms->i_key;
396         int link = parms->link;
397         struct ip_tunnel *t;
398         struct ip_tunnel __rcu **tp;
399         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
400
401         for (tp = __ipgre_bucket(ign, parms);
402              (t = rtnl_dereference(*tp)) != NULL;
403              tp = &t->next)
404                 if (local == t->parms.iph.saddr &&
405                     remote == t->parms.iph.daddr &&
406                     key == t->parms.i_key &&
407                     link == t->parms.link &&
408                     type == t->dev->type)
409                         break;
410
411         return t;
412 }
413
/* Find the tunnel matching @parms, or, when @create is set, allocate
 * and register a new ipgre device and link it into the hash table.
 * Caller holds RTNL.  Returns NULL on allocation/registration
 * failure, or when there is no match and !create. */
static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
		struct ip_tunnel_parm *parms, int create)
{
	struct ip_tunnel *t, *nt;
	struct net_device *dev;
	char name[IFNAMSIZ];
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
	if (t || !create)
		return t;

	/* "gre%d" lets register_netdevice() pick the next free unit. */
	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else
		strcpy(name, "gre%d");

	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
	if (!dev)
		return NULL;

	dev_net_set(dev, net);

	nt = netdev_priv(dev);
	nt->parms = *parms;
	dev->rtnl_link_ops = &ipgre_link_ops;

	dev->mtu = ipgre_tunnel_bind_dev(dev);

	if (register_netdevice(dev) < 0)
		goto failed_free;

	/* Can use a lockless transmit, unless we generate output sequences */
	if (!(nt->parms.o_flags & GRE_SEQ))
		dev->features |= NETIF_F_LLTX;

	/* Self-reference dropped again in ipgre_tunnel_uninit(). */
	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);
	return nt;

failed_free:
	free_netdev(dev);
	return NULL;
}
458
459 static void ipgre_tunnel_uninit(struct net_device *dev)
460 {
461         struct net *net = dev_net(dev);
462         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
463
464         ipgre_tunnel_unlink(ign, netdev_priv(dev));
465         dev_put(dev);
466 }
467
468
/* ICMP error handler for GRE: locate the originating tunnel from the
 * quoted headers and record a rate-limited error that the transmit
 * path consumes via dst_link_failure(). */
static void ipgre_err(struct sk_buff *skb, u32 info)
{

/* All the routers (except for Linux) return only
   8 bytes of packet payload. It means, that precise relaying of
   ICMP in the real Internet is absolutely infeasible.

   Moreover, Cisco "wise men" put GRE key to the third word
   in GRE header. It makes impossible maintaining even soft state for keyed
   GRE tunnels with enabled checksum. Tell them "thank you".

   Well, I wonder, rfc1812 was written by Cisco employee,
   what the hell these idiots break standards established
   by themselves???
 */

	/* skb->data points at the quoted (inner) IP header we sent. */
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	__be16       *p = (__be16 *)(skb->data+(iph->ihl<<2));
	int grehlen = (iph->ihl<<2) + 4;	/* quoted IP + base GRE hdr */
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct ip_tunnel *t;
	__be16 flags;

	flags = p[0];
	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
		if (flags&(GRE_VERSION|GRE_ROUTING))
			return;
		/* Grow grehlen so the key sits in its last 4 bytes:
		 * a checksum field, when present, precedes the key. */
		if (flags&GRE_KEY) {
			grehlen += 4;
			if (flags&GRE_CSUM)
				grehlen += 4;
		}
	}

	/* If only 8 bytes returned, keyed message will be dropped here */
	if (skb_headlen(skb) < grehlen)
		return;

	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return;
		case ICMP_FRAG_NEEDED:
			/* Soft state for pmtu is maintained by IP core. */
			return;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return;
		break;
	}

	rcu_read_lock();
	/* Addresses are from our own transmitted header, so daddr is
	 * the lookup's "remote" and saddr its "local". */
	t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
				flags & GRE_KEY ?
				*(((__be32 *)p) + (grehlen / 4) - 1) : 0,
				p[1]);
	if (t == NULL || t->parms.iph.daddr == 0 ||
	    ipv4_is_multicast(t->parms.iph.daddr))
		goto out;

	/* ttl==0 means "inherit"; TTL-exceeded is then expected noise. */
	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		goto out;

	/* Count errors inside the timeout window, else restart at 1. */
	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;
out:
	rcu_read_unlock();
}
556
557 static inline void ipgre_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb)
558 {
559         if (INET_ECN_is_ce(iph->tos)) {
560                 if (skb->protocol == htons(ETH_P_IP)) {
561                         IP_ECN_set_ce(ip_hdr(skb));
562                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
563                         IP6_ECN_set_ce(ipv6_hdr(skb));
564                 }
565         }
566 }
567
568 static inline u8
569 ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb)
570 {
571         u8 inner = 0;
572         if (skb->protocol == htons(ETH_P_IP))
573                 inner = old_iph->tos;
574         else if (skb->protocol == htons(ETH_P_IPV6))
575                 inner = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
576         return INET_ECN_encapsulate(tos, inner);
577 }
578
/* GRE receive handler.  Parses the GRE header (flags plus optional
 * checksum, key and sequence fields), finds the matching tunnel,
 * validates checksum/sequence state, strips the encapsulation and
 * re-injects the packet via netif_rx().  Unmatched packets get an
 * ICMP port-unreachable.  Always returns 0. */
static int ipgre_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;
	u8     *h;
	__be16    flags;
	__sum16   csum = 0;
	__be32 key = 0;
	u32    seqno = 0;
	struct ip_tunnel *tunnel;
	int    offset = 4;	/* base GRE header: flags + protocol */
	__be16 gre_proto;

	/* 16 = largest header we parse: 4 base + 4 csum + 4 key + 4 seq */
	if (!pskb_may_pull(skb, 16))
		goto drop_nolock;

	iph = ip_hdr(skb);
	h = skb->data;
	flags = *(__be16 *)h;

	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
		/* - Version must be 0.
		   - We do not support routing headers.
		 */
		if (flags&(GRE_VERSION|GRE_ROUTING))
			goto drop_nolock;

		if (flags&GRE_CSUM) {
			switch (skb->ip_summed) {
			case CHECKSUM_COMPLETE:
				/* csum == 0 means the packet verified. */
				csum = csum_fold(skb->csum);
				if (!csum)
					break;
				/* fall through */
			case CHECKSUM_NONE:
				skb->csum = 0;
				csum = __skb_checksum_complete(skb);
				skb->ip_summed = CHECKSUM_COMPLETE;
			}
			offset += 4;
		}
		if (flags&GRE_KEY) {
			key = *(__be32 *)(h + offset);
			offset += 4;
		}
		if (flags&GRE_SEQ) {
			seqno = ntohl(*(__be32 *)(h + offset));
			offset += 4;
		}
	}

	gre_proto = *(__be16 *)(h + 2);

	rcu_read_lock();
	if ((tunnel = ipgre_tunnel_lookup(skb->dev,
					  iph->saddr, iph->daddr, key,
					  gre_proto))) {
		struct pcpu_tstats *tstats;

		secpath_reset(skb);

		skb->protocol = gre_proto;
		/* WCCP version 1 and 2 protocol decoding.
		 * - Change protocol to IP
		 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
		 */
		if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
			skb->protocol = htons(ETH_P_IP);
			/* 0x4? = an IPv4 version nibble follows directly. */
			if ((*(h + offset) & 0xF0) != 0x40)
				offset += 4;
		}

		/* Strip outer IP + GRE header from the packet. */
		skb->mac_header = skb->network_header;
		__pskb_pull(skb, offset);
		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
		skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* Looped back packet, drop it! */
			if (rt_is_output_route(skb_rtable(skb)))
				goto drop;
			tunnel->dev->stats.multicast++;
			skb->pkt_type = PACKET_BROADCAST;
		}
#endif

		/* Checksum presence must agree with the tunnel config,
		 * and a present checksum must have verified (csum == 0). */
		if (((flags&GRE_CSUM) && csum) ||
		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
			tunnel->dev->stats.rx_crc_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		if (tunnel->parms.i_flags&GRE_SEQ) {
			/* Reject missing or backwards sequence numbers. */
			if (!(flags&GRE_SEQ) ||
			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
				tunnel->dev->stats.rx_fifo_errors++;
				tunnel->dev->stats.rx_errors++;
				goto drop;
			}
			tunnel->i_seqno = seqno + 1;
		}

		/* Warning: All skb pointers will be invalidated! */
		if (tunnel->dev->type == ARPHRD_ETHER) {
			if (!pskb_may_pull(skb, ETH_HLEN)) {
				tunnel->dev->stats.rx_length_errors++;
				tunnel->dev->stats.rx_errors++;
				goto drop;
			}

			/* Re-fetch iph: pskb_may_pull may move skb data. */
			iph = ip_hdr(skb);
			skb->protocol = eth_type_trans(skb, tunnel->dev);
			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
		}

		tstats = this_cpu_ptr(tunnel->dev->tstats);
		u64_stats_update_begin(&tstats->syncp);
		tstats->rx_packets++;
		tstats->rx_bytes += skb->len;
		u64_stats_update_end(&tstats->syncp);

		__skb_tunnel_rx(skb, tunnel->dev);

		skb_reset_network_header(skb);
		ipgre_ecn_decapsulate(iph, skb);

		netif_rx(skb);

		rcu_read_unlock();
		return 0;
	}
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

drop:
	rcu_read_unlock();
drop_nolock:
	kfree_skb(skb);
	return 0;
}
717
718 static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
719 {
720         struct ip_tunnel *tunnel = netdev_priv(dev);
721         struct pcpu_tstats *tstats;
722         const struct iphdr  *old_iph = ip_hdr(skb);
723         const struct iphdr  *tiph;
724         struct flowi4 fl4;
725         u8     tos;
726         __be16 df;
727         struct rtable *rt;                      /* Route to the other host */
728         struct net_device *tdev;                /* Device to other host */
729         struct iphdr  *iph;                     /* Our new IP header */
730         unsigned int max_headroom;              /* The extra header space needed */
731         int    gre_hlen;
732         __be32 dst;
733         int    mtu;
734
735         if (dev->type == ARPHRD_ETHER)
736                 IPCB(skb)->flags = 0;
737
738         if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
739                 gre_hlen = 0;
740                 tiph = (const struct iphdr *)skb->data;
741         } else {
742                 gre_hlen = tunnel->hlen;
743                 tiph = &tunnel->parms.iph;
744         }
745
746         if ((dst = tiph->daddr) == 0) {
747                 /* NBMA tunnel */
748
749                 if (skb_dst(skb) == NULL) {
750                         dev->stats.tx_fifo_errors++;
751                         goto tx_error;
752                 }
753
754                 if (skb->protocol == htons(ETH_P_IP)) {
755                         rt = skb_rtable(skb);
756                         dst = rt->rt_gateway;
757                 }
758 #if IS_ENABLED(CONFIG_IPV6)
759                 else if (skb->protocol == htons(ETH_P_IPV6)) {
760                         const struct in6_addr *addr6;
761                         struct neighbour *neigh;
762                         bool do_tx_error_icmp;
763                         int addr_type;
764
765                         neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr);
766                         if (neigh == NULL)
767                                 goto tx_error;
768
769                         addr6 = (const struct in6_addr *)&neigh->primary_key;
770                         addr_type = ipv6_addr_type(addr6);
771
772                         if (addr_type == IPV6_ADDR_ANY) {
773                                 addr6 = &ipv6_hdr(skb)->daddr;
774                                 addr_type = ipv6_addr_type(addr6);
775                         }
776
777                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
778                                 do_tx_error_icmp = true;
779                         else {
780                                 do_tx_error_icmp = false;
781                                 dst = addr6->s6_addr32[3];
782                         }
783                         neigh_release(neigh);
784                         if (do_tx_error_icmp)
785                                 goto tx_error_icmp;
786                 }
787 #endif
788                 else
789                         goto tx_error;
790         }
791
792         tos = tiph->tos;
793         if (tos == 1) {
794                 tos = 0;
795                 if (skb->protocol == htons(ETH_P_IP))
796                         tos = old_iph->tos;
797                 else if (skb->protocol == htons(ETH_P_IPV6))
798                         tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
799         }
800
801         rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr,
802                                  tunnel->parms.o_key, RT_TOS(tos),
803                                  tunnel->parms.link);
804         if (IS_ERR(rt)) {
805                 dev->stats.tx_carrier_errors++;
806                 goto tx_error;
807         }
808         tdev = rt->dst.dev;
809
810         if (tdev == dev) {
811                 ip_rt_put(rt);
812                 dev->stats.collisions++;
813                 goto tx_error;
814         }
815
816         df = tiph->frag_off;
817         if (df)
818                 mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
819         else
820                 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
821
822         if (skb_dst(skb))
823                 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
824
825         if (skb->protocol == htons(ETH_P_IP)) {
826                 df |= (old_iph->frag_off&htons(IP_DF));
827
828                 if ((old_iph->frag_off&htons(IP_DF)) &&
829                     mtu < ntohs(old_iph->tot_len)) {
830                         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
831                         ip_rt_put(rt);
832                         goto tx_error;
833                 }
834         }
835 #if IS_ENABLED(CONFIG_IPV6)
836         else if (skb->protocol == htons(ETH_P_IPV6)) {
837                 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
838
839                 if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
840                         if ((tunnel->parms.iph.daddr &&
841                              !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
842                             rt6->rt6i_dst.plen == 128) {
843                                 rt6->rt6i_flags |= RTF_MODIFIED;
844                                 dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
845                         }
846                 }
847
848                 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
849                         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
850                         ip_rt_put(rt);
851                         goto tx_error;
852                 }
853         }
854 #endif
855
856         if (tunnel->err_count > 0) {
857                 if (time_before(jiffies,
858                                 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
859                         tunnel->err_count--;
860
861                         dst_link_failure(skb);
862                 } else
863                         tunnel->err_count = 0;
864         }
865
866         max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;
867
868         if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
869             (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
870                 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
871                 if (max_headroom > dev->needed_headroom)
872                         dev->needed_headroom = max_headroom;
873                 if (!new_skb) {
874                         ip_rt_put(rt);
875                         dev->stats.tx_dropped++;
876                         dev_kfree_skb(skb);
877                         return NETDEV_TX_OK;
878                 }
879                 if (skb->sk)
880                         skb_set_owner_w(new_skb, skb->sk);
881                 dev_kfree_skb(skb);
882                 skb = new_skb;
883                 old_iph = ip_hdr(skb);
884         }
885
886         skb_reset_transport_header(skb);
887         skb_push(skb, gre_hlen);
888         skb_reset_network_header(skb);
889         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
890         IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
891                               IPSKB_REROUTED);
892         skb_dst_drop(skb);
893         skb_dst_set(skb, &rt->dst);
894
895         /*
896          *      Push down and install the IPIP header.
897          */
898
899         iph                     =       ip_hdr(skb);
900         iph->version            =       4;
901         iph->ihl                =       sizeof(struct iphdr) >> 2;
902         iph->frag_off           =       df;
903         iph->protocol           =       IPPROTO_GRE;
904         iph->tos                =       ipgre_ecn_encapsulate(tos, old_iph, skb);
905         iph->daddr              =       fl4.daddr;
906         iph->saddr              =       fl4.saddr;
907
908         if ((iph->ttl = tiph->ttl) == 0) {
909                 if (skb->protocol == htons(ETH_P_IP))
910                         iph->ttl = old_iph->ttl;
911 #if IS_ENABLED(CONFIG_IPV6)
912                 else if (skb->protocol == htons(ETH_P_IPV6))
913                         iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit;
914 #endif
915                 else
916                         iph->ttl = ip4_dst_hoplimit(&rt->dst);
917         }
918
919         ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
920         ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
921                                    htons(ETH_P_TEB) : skb->protocol;
922
923         if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
924                 __be32 *ptr = (__be32 *)(((u8 *)iph) + tunnel->hlen - 4);
925
926                 if (tunnel->parms.o_flags&GRE_SEQ) {
927                         ++tunnel->o_seqno;
928                         *ptr = htonl(tunnel->o_seqno);
929                         ptr--;
930                 }
931                 if (tunnel->parms.o_flags&GRE_KEY) {
932                         *ptr = tunnel->parms.o_key;
933                         ptr--;
934                 }
935                 if (tunnel->parms.o_flags&GRE_CSUM) {
936                         *ptr = 0;
937                         *(__sum16 *)ptr = ip_compute_csum((void *)(iph+1), skb->len - sizeof(struct iphdr));
938                 }
939         }
940
941         nf_reset(skb);
942         tstats = this_cpu_ptr(dev->tstats);
943         __IPTUNNEL_XMIT(tstats, &dev->stats);
944         return NETDEV_TX_OK;
945
946 #if IS_ENABLED(CONFIG_IPV6)
947 tx_error_icmp:
948         dst_link_failure(skb);
949 #endif
950 tx_error:
951         dev->stats.tx_errors++;
952         dev_kfree_skb(skb);
953         return NETDEV_TX_OK;
954 }
955
/*
 * ipgre_tunnel_bind_dev - bind the tunnel to an underlying device
 *
 * Routes towards the configured remote endpoint (if any) to guess the
 * output device, precomputes the GRE option length (tunnel->hlen) from
 * o_flags, sets dev->needed_headroom and dev->iflink, and returns the
 * MTU the tunnel device should use (never below 68).
 */
static int ipgre_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel;
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	/* outer IP header plus the 4-byte base GRE header */
	int addend = sizeof(struct iphdr) + 4;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */

	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		rt = ip_route_output_gre(dev_net(dev), &fl4,
					 iph->daddr, iph->saddr,
					 tunnel->parms.o_key,
					 RT_TOS(iph->tos),
					 tunnel->parms.link);
		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}

		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	/* No route found: fall back to the explicitly configured link */
	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	/* Precalculate GRE options length */
	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
		if (tunnel->parms.o_flags&GRE_CSUM)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_KEY)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_SEQ)
			addend += 4;
	}
	dev->needed_headroom = addend + hlen;
	mtu -= dev->hard_header_len + addend;

	/* 68 is the minimum IPv4 MTU (RFC 791) */
	if (mtu < 68)
		mtu = 68;

	tunnel->hlen = addend;

	return mtu;
}
1016
/*
 * ipgre_tunnel_ioctl - SIOC{GET,ADD,CHG,DEL}TUNNEL handler
 *
 * Tunnel parameters are exchanged with user space through
 * ifr->ifr_ifru.ifru_data as a struct ip_tunnel_parm.  ADD/CHG/DEL
 * require CAP_NET_ADMIN.  Requests addressed to the per-namespace
 * fallback device (ign->fb_tunnel_dev) operate on a tunnel looked up
 * by parameters rather than on the device itself.
 */
static int
ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;
	struct ip_tunnel *t;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		if (dev == ign->fb_tunnel_dev) {
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
				err = -EFAULT;
				break;
			}
			t = ipgre_tunnel_locate(net, &p, 0);
		}
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(&p, &t->parms, sizeof(p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
			err = -EFAULT;
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			goto done;

		/* Only plain IPv4/GRE without IP options, and none of the
		 * unsupported GRE version/routing bits. */
		err = -EINVAL;
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
			goto done;
		if (p.iph.ttl)
			p.iph.frag_off |= htons(IP_DF);

		/* Keys are only meaningful when the matching flag is set */
		if (!(p.i_flags&GRE_KEY))
			p.i_key = 0;
		if (!(p.o_flags&GRE_KEY))
			p.o_key = 0;

		t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);

		if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				t = netdev_priv(dev);

				if (ipv4_is_multicast(p.iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p.iph.daddr)
					nflags = IFF_POINTOPOINT;

				/* p-t-p vs broadcast mode cannot change on
				 * the fly */
				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}
				/* re-hash the tunnel under its new
				 * endpoints/keys */
				ipgre_tunnel_unlink(ign, t);
				synchronize_net();
				t->parms.iph.saddr = p.iph.saddr;
				t->parms.iph.daddr = p.iph.daddr;
				t->parms.i_key = p.i_key;
				t->parms.o_key = p.o_key;
				memcpy(dev->dev_addr, &p.iph.saddr, 4);
				memcpy(dev->broadcast, &p.iph.daddr, 4);
				ipgre_tunnel_link(ign, t);
				netdev_state_change(dev);
			}
		}

		if (t) {
			err = 0;
			if (cmd == SIOCCHGTUNNEL) {
				t->parms.iph.ttl = p.iph.ttl;
				t->parms.iph.tos = p.iph.tos;
				t->parms.iph.frag_off = p.iph.frag_off;
				if (t->parms.link != p.link) {
					t->parms.link = p.link;
					dev->mtu = ipgre_tunnel_bind_dev(dev);
					netdev_state_change(dev);
				}
			}
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
				err = -EFAULT;
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		if (dev == ign->fb_tunnel_dev) {
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
				goto done;
			err = -ENOENT;
			if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
				goto done;
			err = -EPERM;
			/* the fallback device itself may not be deleted */
			if (t == netdev_priv(ign->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
1147
1148 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1149 {
1150         struct ip_tunnel *tunnel = netdev_priv(dev);
1151         if (new_mtu < 68 ||
1152             new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1153                 return -EINVAL;
1154         dev->mtu = new_mtu;
1155         return 0;
1156 }
1157
1158 /* Nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.
1161
1162
   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have the impression that Cisco could do something similar,
   but this feature is apparently missing in IOS<=11.2(8).
1167
1168    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1169    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1170
1171    ping -t 255 224.66.66.66
1172
1173    If nobody answers, mbone does not work.
1174
1175    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1176    ip addr add 10.66.66.<somewhat>/24 dev Universe
1177    ifconfig Universe up
1178    ifconfig Universe add fe80::<Your_real_addr>/10
1179    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1180    ftp 10.66.66.66
1181    ...
1182    ftp fec0:6666:6666::193.233.7.65
1183    ...
1184
1185  */
1186
/*
 * ipgre_header - build the outer IP + GRE header for a tunnel device
 *
 * header_ops->create hook for NBMA/broadcast GRE devices.  Copies the
 * template header from the tunnel parameters and overrides the source/
 * destination with the caller-supplied 4-byte addresses.  Returns the
 * header length when the destination is known; returns its negative
 * when the destination still has to be resolved by the caller.
 */
static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
			unsigned short type,
			const void *daddr, const void *saddr, unsigned int len)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
	__be16 *p = (__be16 *)(iph+1);	/* GRE flags / protocol words */

	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
	p[0]		= t->parms.o_flags;
	p[1]		= htons(type);

	/*
	 *	Set the source hardware address.
	 */

	if (saddr)
		memcpy(&iph->saddr, saddr, 4);
	if (daddr)
		memcpy(&iph->daddr, daddr, 4);
	if (iph->daddr)
		return t->hlen;

	/* destination unknown: negative length signals "incomplete" */
	return -t->hlen;
}
1212
1213 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1214 {
1215         const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
1216         memcpy(haddr, &iph->saddr, 4);
1217         return 4;
1218 }
1219
/* Hard-header operations used on NBMA and broadcast GRE devices. */
static const struct header_ops ipgre_header_ops = {
	.create = ipgre_header,
	.parse  = ipgre_header_parse,
};
1224
1225 #ifdef CONFIG_NET_IPGRE_BROADCAST
/*
 * ndo_open for broadcast tunnels: if the remote endpoint is an IPv4
 * multicast group, route towards it to find the underlying device and
 * join the group there.  The joined ifindex is saved in t->mlink so
 * ipgre_close() can leave the group again.
 */
static int ipgre_open(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr)) {
		struct flowi4 fl4;
		struct rtable *rt;

		rt = ip_route_output_gre(dev_net(dev), &fl4,
					 t->parms.iph.daddr,
					 t->parms.iph.saddr,
					 t->parms.o_key,
					 RT_TOS(t->parms.iph.tos),
					 t->parms.link);
		if (IS_ERR(rt))
			return -EADDRNOTAVAIL;
		/* from here on "dev" is the underlying output device */
		dev = rt->dst.dev;
		ip_rt_put(rt);
		if (__in_dev_get_rtnl(dev) == NULL)
			return -EADDRNOTAVAIL;
		t->mlink = dev->ifindex;
		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
	}
	return 0;
}
1251
1252 static int ipgre_close(struct net_device *dev)
1253 {
1254         struct ip_tunnel *t = netdev_priv(dev);
1255
1256         if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1257                 struct in_device *in_dev;
1258                 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1259                 if (in_dev)
1260                         ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1261         }
1262         return 0;
1263 }
1264
1265 #endif
1266
/* Device operations for plain (layer-3) GRE tunnel devices. */
static const struct net_device_ops ipgre_netdev_ops = {
	.ndo_init		= ipgre_tunnel_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
	.ndo_open		= ipgre_open,
	.ndo_stop		= ipgre_close,
#endif
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
	.ndo_get_stats64	= ipgre_get_stats64,
};
1279
/* netdev destructor: release per-cpu stats, then the device itself. */
static void ipgre_dev_free(struct net_device *dev)
{
	free_percpu(dev->tstats);
	free_netdev(dev);
}
1285
/*
 * Basic setup of a layer-3 GRE netdev: ops, ARPHRD_IPGRE link type and
 * a default MTU/headroom sized for an outer IP header plus the 4-byte
 * base GRE header.
 */
static void ipgre_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops		= &ipgre_netdev_ops;
	dev->destructor 	= ipgre_dev_free;

	dev->type		= ARPHRD_IPGRE;
	dev->needed_headroom 	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;
	dev->flags		= IFF_NOARP;
	dev->iflink		= 0;
	dev->addr_len		= 4;	/* "hardware" address is an IPv4 address */
	dev->features		|= NETIF_F_NETNS_LOCAL;
	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
}
1300
/*
 * ndo_init for layer-3 GRE devices: copy the tunnel endpoints into the
 * device/broadcast addresses, select header_ops for NBMA or broadcast
 * operation and allocate the per-cpu statistics.
 */
static int ipgre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;
	struct iphdr *iph;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

	if (iph->daddr) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* a broadcast tunnel needs a local address to be
			 * able to join the multicast group */
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->header_ops = &ipgre_header_ops;
		}
#endif
	} else
		/* NBMA mode: destination supplied per packet */
		dev->header_ops = &ipgre_header_ops;

	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;

	return 0;
}
1333
/*
 * Initialize the per-namespace fallback device "gre0", which catches
 * GRE packets that match no explicitly configured tunnel.
 */
static void ipgre_fb_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	iph->version		= 4;
	iph->protocol		= IPPROTO_GRE;
	iph->ihl		= 5;
	tunnel->hlen		= sizeof(struct iphdr) + 4;	/* no GRE options */

	/* NOTE(review): extra reference taken here — presumably dropped
	 * when the fallback device is unregistered; confirm at call site */
	dev_hold(dev);
}
1349
1350
/* Receive/error hooks registered with the shared GRE demultiplexer. */
static const struct gre_protocol ipgre_protocol = {
	.handler     = ipgre_rcv,
	.err_handler = ipgre_err,
};
1355
1356 static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1357 {
1358         int prio;
1359
1360         for (prio = 0; prio < 4; prio++) {
1361                 int h;
1362                 for (h = 0; h < HASH_SIZE; h++) {
1363                         struct ip_tunnel *t;
1364
1365                         t = rtnl_dereference(ign->tunnels[prio][h]);
1366
1367                         while (t != NULL) {
1368                                 unregister_netdevice_queue(t->dev, head);
1369                                 t = rtnl_dereference(t->next);
1370                         }
1371                 }
1372         }
1373 }
1374
/*
 * Per-namespace setup: create and register the fallback "gre0" device
 * and publish it on the wildcard (keyless, addressless) hash chain.
 * On failure the partially constructed device is torn down again.
 */
static int __net_init ipgre_init_net(struct net *net)
{
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int err;

	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
					   ipgre_tunnel_setup);
	if (!ign->fb_tunnel_dev) {
		err = -ENOMEM;
		goto err_alloc_dev;
	}
	dev_net_set(ign->fb_tunnel_dev, net);

	ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
	ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;

	if ((err = register_netdev(ign->fb_tunnel_dev)))
		goto err_reg_dev;

	rcu_assign_pointer(ign->tunnels_wc[0],
			   netdev_priv(ign->fb_tunnel_dev));
	return 0;

err_reg_dev:
	/* registration failed: free stats + netdev directly */
	ipgre_dev_free(ign->fb_tunnel_dev);
err_alloc_dev:
	return err;
}
1403
1404 static void __net_exit ipgre_exit_net(struct net *net)
1405 {
1406         struct ipgre_net *ign;
1407         LIST_HEAD(list);
1408
1409         ign = net_generic(net, ipgre_net_id);
1410         rtnl_lock();
1411         ipgre_destroy_tunnels(ign, &list);
1412         unregister_netdevice_many(&list);
1413         rtnl_unlock();
1414 }
1415
/* Per-network-namespace lifecycle hooks and private state size. */
static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit = ipgre_exit_net,
	.id   = &ipgre_net_id,
	.size = sizeof(struct ipgre_net),
};
1422
1423 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1424 {
1425         __be16 flags;
1426
1427         if (!data)
1428                 return 0;
1429
1430         flags = 0;
1431         if (data[IFLA_GRE_IFLAGS])
1432                 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1433         if (data[IFLA_GRE_OFLAGS])
1434                 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1435         if (flags & (GRE_VERSION|GRE_ROUTING))
1436                 return -EINVAL;
1437
1438         return 0;
1439 }
1440
1441 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1442 {
1443         __be32 daddr;
1444
1445         if (tb[IFLA_ADDRESS]) {
1446                 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1447                         return -EINVAL;
1448                 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1449                         return -EADDRNOTAVAIL;
1450         }
1451
1452         if (!data)
1453                 goto out;
1454
1455         if (data[IFLA_GRE_REMOTE]) {
1456                 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1457                 if (!daddr)
1458                         return -EINVAL;
1459         }
1460
1461 out:
1462         return ipgre_tunnel_validate(tb, data);
1463 }
1464
/*
 * Translate IFLA_GRE_* netlink attributes into a struct ip_tunnel_parm.
 * Absent attributes keep their zeroed defaults; path MTU discovery
 * (the DF bit) defaults to on unless IFLA_GRE_PMTUDISC disables it.
 */
static void ipgre_netlink_parms(struct nlattr *data[],
				struct ip_tunnel_parm *parms)
{
	memset(parms, 0, sizeof(*parms));

	parms->iph.protocol = IPPROTO_GRE;

	if (!data)
		return;

	if (data[IFLA_GRE_LINK])
		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

	if (data[IFLA_GRE_IFLAGS])
		parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);

	if (data[IFLA_GRE_OFLAGS])
		parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);

	if (data[IFLA_GRE_IKEY])
		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

	if (data[IFLA_GRE_OKEY])
		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

	if (data[IFLA_GRE_LOCAL])
		parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);

	if (data[IFLA_GRE_REMOTE])
		parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);

	if (data[IFLA_GRE_TTL])
		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

	if (data[IFLA_GRE_TOS])
		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
		parms->iph.frag_off = htons(IP_DF);
}
1505
1506 static int ipgre_tap_init(struct net_device *dev)
1507 {
1508         struct ip_tunnel *tunnel;
1509
1510         tunnel = netdev_priv(dev);
1511
1512         tunnel->dev = dev;
1513         strcpy(tunnel->parms.name, dev->name);
1514
1515         ipgre_tunnel_bind_dev(dev);
1516
1517         dev->tstats = alloc_percpu(struct pcpu_tstats);
1518         if (!dev->tstats)
1519                 return -ENOMEM;
1520
1521         return 0;
1522 }
1523
/* Device operations for Ethernet-over-GRE (gretap) devices. */
static const struct net_device_ops ipgre_tap_netdev_ops = {
	.ndo_init		= ipgre_tap_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
	.ndo_get_stats64	= ipgre_get_stats64,
};
1533
/*
 * Setup for gretap devices: start from a standard Ethernet device and
 * override the tunnel-specific operations and flags.
 */
static void ipgre_tap_setup(struct net_device *dev)
{

	ether_setup(dev);

	dev->netdev_ops		= &ipgre_tap_netdev_ops;
	dev->destructor 	= ipgre_dev_free;

	dev->iflink		= 0;
	dev->features		|= NETIF_F_NETNS_LOCAL;
}
1545
/*
 * rtnl newlink handler shared by "gre" and "gretap": parse netlink
 * parameters, refuse duplicate tunnels, bind to the underlying device,
 * register the netdev and insert it into the tunnel hash.
 */
static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
			 struct nlattr *data[])
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int mtu;
	int err;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &nt->parms);

	if (ipgre_tunnel_find(net, &nt->parms, dev->type))
		return -EEXIST;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	/* derive the MTU from the underlying device unless user-specified */
	mtu = ipgre_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	/* Can use a lockless transmit, unless we generate output sequences */
	if (!(nt->parms.o_flags & GRE_SEQ))
		dev->features |= NETIF_F_LLTX;

	err = register_netdevice(dev);
	if (err)
		goto out;

	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);

out:
	return err;
}
1582
/*
 * rtnl changelink handler: apply new IFLA_GRE_* parameters to an
 * existing tunnel.  Changing the endpoints re-hashes the tunnel; the
 * per-namespace fallback device may not be reconfigured.
 */
static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
			    struct nlattr *data[])
{
	struct ip_tunnel *t, *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	struct ip_tunnel_parm p;
	int mtu;

	if (dev == ign->fb_tunnel_dev)
		return -EINVAL;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &p);

	t = ipgre_tunnel_locate(net, &p, 0);

	if (t) {
		/* the new parameters already belong to another tunnel */
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = nt;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p.iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p.iph.daddr)
				nflags = IFF_POINTOPOINT;

			/* p-t-p vs broadcast mode cannot change on the fly */
			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}

		/* re-hash the tunnel under its new endpoints/key */
		ipgre_tunnel_unlink(ign, t);
		t->parms.iph.saddr = p.iph.saddr;
		t->parms.iph.daddr = p.iph.daddr;
		t->parms.i_key = p.i_key;
		if (dev->type != ARPHRD_ETHER) {
			memcpy(dev->dev_addr, &p.iph.saddr, 4);
			memcpy(dev->broadcast, &p.iph.daddr, 4);
		}
		ipgre_tunnel_link(ign, t);
		netdev_state_change(dev);
	}

	t->parms.o_key = p.o_key;
	t->parms.iph.ttl = p.iph.ttl;
	t->parms.iph.tos = p.iph.tos;
	t->parms.iph.frag_off = p.iph.frag_off;

	if (t->parms.link != p.link) {
		t->parms.link = p.link;
		mtu = ipgre_tunnel_bind_dev(dev);
		if (!tb[IFLA_MTU])
			dev->mtu = mtu;
		netdev_state_change(dev);
	}

	return 0;
}
1646
/* Worst-case netlink message size for ipgre_fill_info(). */
static size_t ipgre_get_size(const struct net_device *dev)
{
	return
		/* IFLA_GRE_LINK */
		nla_total_size(4) +
		/* IFLA_GRE_IFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_OFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_IKEY */
		nla_total_size(4) +
		/* IFLA_GRE_OKEY */
		nla_total_size(4) +
		/* IFLA_GRE_LOCAL */
		nla_total_size(4) +
		/* IFLA_GRE_REMOTE */
		nla_total_size(4) +
		/* IFLA_GRE_TTL */
		nla_total_size(1) +
		/* IFLA_GRE_TOS */
		nla_total_size(1) +
		/* IFLA_GRE_PMTUDISC */
		nla_total_size(1) +
		0;
}
1672
/* Dump the tunnel's current configuration as IFLA_GRE_* attributes. */
static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm *p = &t->parms;

	if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
	    nla_put_be16(skb, IFLA_GRE_IFLAGS, p->i_flags) ||
	    nla_put_be16(skb, IFLA_GRE_OFLAGS, p->o_flags) ||
	    nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
	    nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
	    nla_put_be32(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
	    nla_put_be32(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
	    nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
	    nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
	    nla_put_u8(skb, IFLA_GRE_PMTUDISC,
		       !!(p->iph.frag_off & htons(IP_DF))))
		goto nla_put_failure;
	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
1695
/*
 * Netlink attribute validation policy for IFLA_GRE_* configuration.
 * LOCAL/REMOTE carry raw IPv4 addresses and are checked by length only;
 * the remaining attributes are validated by integer type.
 */
static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
};
1708
/* rtnl_link_ops for plain layer-3 GRE tunnels ("ip link add type gre"). */
static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.kind		= "gre",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tunnel_setup,
	.validate	= ipgre_tunnel_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
1721
/*
 * rtnl_link_ops for Ethernet-over-GRE ("gretap") devices.  Shares the
 * policy, size and dump helpers with plain GRE; only setup/validate
 * differ.
 */
static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
	.kind		= "gretap",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tap_setup,
	.validate	= ipgre_tap_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
1734
1735 /*
1736  *      And now the modules code and kernel interface.
1737  */
1738
1739 static int __init ipgre_init(void)
1740 {
1741         int err;
1742
1743         pr_info("GRE over IPv4 tunneling driver\n");
1744
1745         err = register_pernet_device(&ipgre_net_ops);
1746         if (err < 0)
1747                 return err;
1748
1749         err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
1750         if (err < 0) {
1751                 pr_info("%s: can't add protocol\n", __func__);
1752                 goto add_proto_failed;
1753         }
1754
1755         err = rtnl_link_register(&ipgre_link_ops);
1756         if (err < 0)
1757                 goto rtnl_link_failed;
1758
1759         err = rtnl_link_register(&ipgre_tap_ops);
1760         if (err < 0)
1761                 goto tap_ops_failed;
1762
1763 out:
1764         return err;
1765
1766 tap_ops_failed:
1767         rtnl_link_unregister(&ipgre_link_ops);
1768 rtnl_link_failed:
1769         gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1770 add_proto_failed:
1771         unregister_pernet_device(&ipgre_net_ops);
1772         goto out;
1773 }
1774
/*
 * Module exit: tear down in reverse order of ipgre_init() — link types
 * first (so no new tunnels can be created via rtnl), then the GRE
 * protocol handler, then the per-netns state.
 */
static void __exit ipgre_fini(void)
{
	rtnl_link_unregister(&ipgre_tap_ops);
	rtnl_link_unregister(&ipgre_link_ops);
	/* Removal can only fail if someone else replaced our handler. */
	if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
		pr_info("%s: can't remove protocol\n", __func__);
	unregister_pernet_device(&ipgre_net_ops);
}
1783
module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
/* Let "ip link add ... type gre|gretap" autoload this module. */
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
MODULE_ALIAS_NETDEV("gre0");