/*
 *      Linux NET3:     GRE over IP protocol decoder.
 *
 *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/gre.h>
#include <net/dst_metadata.h>
#include <net/erspan.h>

/*
   Problems & solutions
   --------------------

   1. The most important issue is detecting local dead loops.
   They would cause a complete host lockup in transmit, which
   would be "resolved" by stack overflow or, if queueing is enabled,
   with infinite looping in net_bh.

   We cannot track such dead loops during route installation;
   it is an infeasible task. The most general solution would be
   to keep an skb->encapsulation counter (a sort of local ttl)
   and silently drop the packet when it expires. It is a good
   solution, but it requires maintaining a new variable in ALL
   skbs, even if no tunneling is used.

   Current solution: xmit_recursion breaks dead loops. This is a percpu
   counter, since when we enter the first ndo_xmit(), cpu migration is
   forbidden. We force an exit if this counter reaches RECURSION_LIMIT.

   2. Networking dead loops would not kill routers, but would really
   kill the network. The IP hop limit plays the role of "t->recursion"
   in this case, if we copy it from the packet being encapsulated to
   the upper header. It is a very good solution, but it introduces two
   problems:

   - Routing protocols that use packets with ttl=1 (OSPF, RIP2)
     do not work over tunnels.
   - traceroute does not work. I planned to relay ICMP from the tunnel,
     so that this problem would be solved and traceroute output
     would be even more informative. This idea appeared to be wrong:
     only Linux complies with rfc1812 now (yes, guys, Linux is the only
     true router now :-)); all routers (at least, in my neighbourhood)
     return only 8 bytes of payload. It is the end.

   Hence, if we want OSPF to work, or traceroute to say something
   reasonable, we should search for another solution.

   One of them is to parse the packet, trying to detect inner
   encapsulation made by our node. It is difficult or even impossible,
   especially taking fragmentation into account. To be short, ttl is
   not a solution at all.

   Current solution: The solution was UNEXPECTEDLY SIMPLE.
   We force the DF flag on tunnels with a preconfigured hop limit,
   that is ALL. :-) Well, it does not remove the problem completely,
   but exponential growth of network traffic is changed to linear
   (branches that exceed the pmtu are pruned) and the tunnel mtu
   rapidly degrades to a value <68, where looping stops.
   Yes, it is not good if there exists a router in the loop
   which does not force DF, even when the packets being encapsulated
   have DF set. But it is not our problem! Nobody could accuse us;
   we did all that we could. Even if it is your gated that injected
   the fatal route into the network, even if it was you who configured
   the fatal static route: you are innocent. :-)

   Alexey Kuznetsov.
 */

static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");

static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);
static void erspan_build_header(struct sk_buff *skb,
                                __be32 id, u32 index, bool truncate);

static unsigned int ipgre_net_id __read_mostly;
static unsigned int gre_tap_net_id __read_mostly;
static unsigned int erspan_net_id __read_mostly;

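/* Handle an ICMP error that quotes a GRE packet we sent: find the
 * tunnel the original packet belonged to and record the error event.
 */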
static void ipgre_err(struct sk_buff *skb, u32 info,
                      const struct tnl_ptk_info *tpi)
{

        /* All the routers (except for Linux) return only
           8 bytes of packet payload. It means that precise relaying of
           ICMP in the real Internet is absolutely infeasible.

           Moreover, Cisco "wise men" put the GRE key in the third word
           of the GRE header. It makes it impossible to maintain even
           soft state for keyed GRE tunnels with checksums enabled.
           Tell them "thank you".

           Well, I wonder, rfc1812 was written by a Cisco employee;
           why the hell do these idiots break standards established
           by themselves???
           */
        struct net *net = dev_net(skb->dev);
        struct ip_tunnel_net *itn;
        const struct iphdr *iph;
        const int type = icmp_hdr(skb)->type;
        const int code = icmp_hdr(skb)->code;
        unsigned int data_len = 0;
        struct ip_tunnel *t;

        switch (type) {
        default:
        case ICMP_PARAMETERPROB:
                return;

        case ICMP_DEST_UNREACH:
                switch (code) {
                case ICMP_SR_FAILED:
                case ICMP_PORT_UNREACH:
                        /* Impossible event. */
                        return;
                default:
                        /* All others are translated to HOST_UNREACH.
                           rfc2003 contains "deep thoughts" about NET_UNREACH;
                           I believe they are just ether pollution. --ANK
                         */
                        break;
                }
                break;

        case ICMP_TIME_EXCEEDED:
                if (code != ICMP_EXC_TTL)
                        return;
                data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */
                break;

        case ICMP_REDIRECT:
                break;
        }

        if (tpi->proto == htons(ETH_P_TEB))
                itn = net_generic(net, gre_tap_net_id);
        else
                itn = net_generic(net, ipgre_net_id);

        iph = (const struct iphdr *)(icmp_hdr(skb) + 1);
        t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
                             iph->daddr, iph->saddr, tpi->key);

        if (!t)
                return;

#if IS_ENABLED(CONFIG_IPV6)
        if (tpi->proto == htons(ETH_P_IPV6) &&
            !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len,
                                        type, data_len))
                return;
#endif

        if (t->parms.iph.daddr == 0 ||
            ipv4_is_multicast(t->parms.iph.daddr))
                return;

        if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
                return;

        if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
                t->err_count++;
        else
                t->err_count = 1;
        t->err_time = jiffies;
}

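/* ICMP error handler for the GRE protocol: parse the GRE header of the
 * quoted packet, handle PMTU updates and redirects directly, and pass
 * everything else on to ipgre_err().
 */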
static void gre_err(struct sk_buff *skb, u32 info)
{
        /* All the routers (except for Linux) return only
         * 8 bytes of packet payload. It means that precise relaying of
         * ICMP in the real Internet is absolutely infeasible.
         *
         * Moreover, Cisco "wise men" put the GRE key in the third word
         * of the GRE header. It makes it impossible to maintain even
         * soft state for keyed GRE tunnels with checksums enabled.
         * Tell them "thank you".
         *
         * Well, I wonder, rfc1812 was written by a Cisco employee;
         * why the hell do these idiots break standards established
         * by themselves???
         */

        const struct iphdr *iph = (struct iphdr *)skb->data;
        const int type = icmp_hdr(skb)->type;
        const int code = icmp_hdr(skb)->code;
        struct tnl_ptk_info tpi;
        bool csum_err = false;

        if (gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP),
                             iph->ihl * 4) < 0) {
                if (!csum_err)          /* ignore csum errors. */
                        return;
        }

        if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
                ipv4_update_pmtu(skb, dev_net(skb->dev), info,
                                 skb->dev->ifindex, 0, IPPROTO_GRE, 0);
                return;
        }
        if (type == ICMP_REDIRECT) {
                ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0,
                              IPPROTO_GRE, 0);
                return;
        }

        ipgre_err(skb, info, &tpi);
}

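/* Receive path for ERSPAN frames: the 10-bit session ID doubles as the
 * tunnel key for the lookup; in collect_md mode the ERSPAN index is
 * handed up as tunnel metadata instead of being stored in the tunnel.
 */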
static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
                      int gre_hdr_len)
{
        struct net *net = dev_net(skb->dev);
        struct metadata_dst *tun_dst = NULL;
        struct ip_tunnel_net *itn;
        struct ip_tunnel *tunnel;
        struct erspanhdr *ershdr;
        const struct iphdr *iph;
        __be32 session_id;
        __be32 index;
        int len;

        itn = net_generic(net, erspan_net_id);
        len = gre_hdr_len + sizeof(*ershdr);

        if (unlikely(!pskb_may_pull(skb, len)))
                return -ENOMEM;

        iph = ip_hdr(skb);
        ershdr = (struct erspanhdr *)(skb->data + gre_hdr_len);

        /* The original GRE header does not have a key field, so
         * use the ERSPAN 10-bit session ID as the key.
         */
        session_id = cpu_to_be32(ntohs(ershdr->session_id));
        tpi->key = session_id;
        index = ershdr->md.index;
        tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex,
                                  tpi->flags | TUNNEL_KEY,
                                  iph->saddr, iph->daddr, tpi->key);

        if (tunnel) {
                if (__iptunnel_pull_header(skb,
                                           gre_hdr_len + sizeof(*ershdr),
                                           htons(ETH_P_TEB),
                                           false, false) < 0)
                        goto drop;

                if (tunnel->collect_md) {
                        struct ip_tunnel_info *info;
                        struct erspan_metadata *md;
                        __be64 tun_id;
                        __be16 flags;

                        tpi->flags |= TUNNEL_KEY;
                        flags = tpi->flags;
                        tun_id = key32_to_tunnel_id(tpi->key);

                        tun_dst = ip_tun_rx_dst(skb, flags,
                                                tun_id, sizeof(*md));
                        if (!tun_dst)
                                return PACKET_REJECT;

                        md = ip_tunnel_info_opts(&tun_dst->u.tun_info);
                        if (!md)
                                return PACKET_REJECT;

                        md->index = index;
                        info = &tun_dst->u.tun_info;
                        info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
                        info->options_len = sizeof(*md);
                } else {
                        tunnel->index = ntohl(index);
                }

                skb_reset_mac_header(skb);
                ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
                return PACKET_RCVD;
        }
drop:
        kfree_skb(skb);
        return PACKET_RCVD;
}

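/* Common receive helper: look up the tunnel by outer addresses, flags
 * and key, strip the encapsulation, and deliver the packet, attaching
 * collected metadata when the tunnel is in collect_md mode.
 */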
static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
                       struct ip_tunnel_net *itn, int hdr_len, bool raw_proto)
{
        struct metadata_dst *tun_dst = NULL;
        const struct iphdr *iph;
        struct ip_tunnel *tunnel;

        iph = ip_hdr(skb);
        tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
                                  iph->saddr, iph->daddr, tpi->key);

        if (tunnel) {
                if (__iptunnel_pull_header(skb, hdr_len, tpi->proto,
                                           raw_proto, false) < 0)
                        goto drop;

                if (tunnel->dev->type != ARPHRD_NONE)
                        skb_pop_mac_header(skb);
                else
                        skb_reset_mac_header(skb);
                if (tunnel->collect_md) {
                        __be16 flags;
                        __be64 tun_id;

                        flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY);
                        tun_id = key32_to_tunnel_id(tpi->key);
                        tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0);
                        if (!tun_dst)
                                return PACKET_REJECT;
                }

                ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
                return PACKET_RCVD;
        }
        return PACKET_NEXT;

drop:
        kfree_skb(skb);
        return PACKET_RCVD;
}

static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
                     int hdr_len)
{
        struct net *net = dev_net(skb->dev);
        struct ip_tunnel_net *itn;
        int res;

        if (tpi->proto == htons(ETH_P_TEB))
                itn = net_generic(net, gre_tap_net_id);
        else
                itn = net_generic(net, ipgre_net_id);

        res = __ipgre_rcv(skb, tpi, itn, hdr_len, false);
        if (res == PACKET_NEXT && tpi->proto == htons(ETH_P_TEB)) {
                /* ipgre tunnels in collect metadata mode should also
                 * receive ETH_P_TEB traffic.
                 */
                itn = net_generic(net, ipgre_net_id);
                res = __ipgre_rcv(skb, tpi, itn, hdr_len, true);
        }
        return res;
}

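/* Main receive handler, registered via gre_add_protocol(): parse the
 * GRE header, divert ERSPAN traffic, and otherwise demultiplex to the
 * gretap or gre per-netns tunnel tables.
 */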
static int gre_rcv(struct sk_buff *skb)
{
        struct tnl_ptk_info tpi;
        bool csum_err = false;
        int hdr_len;

#ifdef CONFIG_NET_IPGRE_BROADCAST
        if (ipv4_is_multicast(ip_hdr(skb)->daddr)) {
                /* Looped back packet, drop it! */
                if (rt_is_output_route(skb_rtable(skb)))
                        goto drop;
        }
#endif

        hdr_len = gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP), 0);
        if (hdr_len < 0)
                goto drop;

        if (unlikely(tpi.proto == htons(ETH_P_ERSPAN))) {
                if (erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
                        return 0;
        }

        if (ipgre_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
                return 0;

        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
drop:
        kfree_skb(skb);
        return 0;
}

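/* Common transmit tail: bump the output sequence number when TUNNEL_SEQ
 * is set, push the GRE header, and hand the packet to ip_tunnel_xmit().
 */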
static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
                       const struct iphdr *tnl_params,
                       __be16 proto)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        if (tunnel->parms.o_flags & TUNNEL_SEQ)
                tunnel->o_seqno++;

        /* Push GRE header. */
        gre_build_header(skb, tunnel->tun_hlen,
                         tunnel->parms.o_flags, proto, tunnel->parms.o_key,
                         htonl(tunnel->o_seqno));

        ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
}

static int gre_handle_offloads(struct sk_buff *skb, bool csum)
{
        return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
}

static struct rtable *gre_get_rt(struct sk_buff *skb,
                                 struct net_device *dev,
                                 struct flowi4 *fl,
                                 const struct ip_tunnel_key *key)
{
        struct net *net = dev_net(dev);

        memset(fl, 0, sizeof(*fl));
        fl->daddr = key->u.ipv4.dst;
        fl->saddr = key->u.ipv4.src;
        fl->flowi4_tos = RT_TOS(key->tos);
        fl->flowi4_mark = skb->mark;
        fl->flowi4_proto = IPPROTO_GRE;

        return ip_route_output_key(net, fl);
}

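/* Route a flow-based (collect_md) packet and make sure the skb has
 * enough headroom for the outer IP and tunnel headers, expanding the
 * head if needed. Returns NULL and consumes the skb on failure.
 */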
static struct rtable *prepare_fb_xmit(struct sk_buff *skb,
                                      struct net_device *dev,
                                      struct flowi4 *fl,
                                      int tunnel_hlen)
{
        struct ip_tunnel_info *tun_info;
        const struct ip_tunnel_key *key;
        struct rtable *rt = NULL;
        int min_headroom;
        bool use_cache;
        int err;

        tun_info = skb_tunnel_info(skb);
        key = &tun_info->key;
        use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);

        if (use_cache)
                rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl->saddr);
        if (!rt) {
                rt = gre_get_rt(skb, dev, fl, key);
                if (IS_ERR(rt))
                        goto err_free_skb;
                if (use_cache)
                        dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
                                          fl->saddr);
        }

        min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
                        + tunnel_hlen + sizeof(struct iphdr);
        if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
                int head_delta = SKB_DATA_ALIGN(min_headroom -
                                                skb_headroom(skb) +
                                                16);
                err = pskb_expand_head(skb, max_t(int, head_delta, 0),
                                       0, GFP_ATOMIC);
                if (unlikely(err))
                        goto err_free_rt;
        }
        return rt;

err_free_rt:
        ip_rt_put(rt);
err_free_skb:
        kfree_skb(skb);
        dev->stats.tx_dropped++;
        return NULL;
}

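/* Flow-based transmit: build the outer headers from the tunnel metadata
 * attached to the skb rather than from per-device parameters.
 */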
static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
                        __be16 proto)
{
        struct ip_tunnel_info *tun_info;
        const struct ip_tunnel_key *key;
        struct rtable *rt = NULL;
        struct flowi4 fl;
        int tunnel_hlen;
        __be16 df, flags;

        tun_info = skb_tunnel_info(skb);
        if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
                     ip_tunnel_info_af(tun_info) != AF_INET))
                goto err_free_skb;

        key = &tun_info->key;
        tunnel_hlen = gre_calc_hlen(key->tun_flags);

        rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen);
        if (!rt)
                return;

        /* Push Tunnel header. */
        if (gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM)))
                goto err_free_rt;

        flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY);
        gre_build_header(skb, tunnel_hlen, flags, proto,
                         tunnel_id_to_key32(tun_info->key.tun_id), 0);

        df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;

        iptunnel_xmit(skb->sk, rt, skb, fl.saddr, key->u.ipv4.dst, IPPROTO_GRE,
                      key->tos, key->ttl, df, false);
        return;

err_free_rt:
        ip_rt_put(rt);
err_free_skb:
        kfree_skb(skb);
        dev->stats.tx_dropped++;
}

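/* Flow-based ERSPAN transmit: like gre_fb_xmit(), but inserts the
 * ERSPAN header (truncating oversized frames) before pushing the GRE
 * header.
 */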
static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
                           __be16 proto)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct ip_tunnel_info *tun_info;
        const struct ip_tunnel_key *key;
        struct erspan_metadata *md;
        struct rtable *rt = NULL;
        bool truncate = false;
        struct flowi4 fl;
        int tunnel_hlen;
        __be16 df;

        tun_info = skb_tunnel_info(skb);
        if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
                     ip_tunnel_info_af(tun_info) != AF_INET))
                goto err_free_skb;

        key = &tun_info->key;

        /* ERSPAN has a fixed 8-byte GRE header */
        tunnel_hlen = 8 + sizeof(struct erspanhdr);

        rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen);
        if (!rt)
                return;

        if (gre_handle_offloads(skb, false))
                goto err_free_rt;

        if (skb->len > dev->mtu) {
                pskb_trim(skb, dev->mtu);
                truncate = true;
        }

        md = ip_tunnel_info_opts(tun_info);
        if (!md)
                goto err_free_rt;

        erspan_build_header(skb, tunnel_id_to_key32(key->tun_id),
                            ntohl(md->index), truncate);

        gre_build_header(skb, 8, TUNNEL_SEQ,
                         htons(ETH_P_ERSPAN), 0, htonl(tunnel->o_seqno++));

        df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;

        iptunnel_xmit(skb->sk, rt, skb, fl.saddr, key->u.ipv4.dst, IPPROTO_GRE,
                      key->tos, key->ttl, df, false);
        return;

err_free_rt:
        ip_rt_put(rt);
err_free_skb:
        kfree_skb(skb);
        dev->stats.tx_dropped++;
}

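/* ndo_fill_metadata_dst: resolve the route for the tunnel metadata so
 * the caller learns the source address that transmit would use.
 */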
static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
        struct ip_tunnel_info *info = skb_tunnel_info(skb);
        struct rtable *rt;
        struct flowi4 fl4;

        if (ip_tunnel_info_af(info) != AF_INET)
                return -EINVAL;

        rt = gre_get_rt(skb, dev, &fl4, &info->key);
        if (IS_ERR(rt))
                return PTR_ERR(rt);

        ip_rt_put(rt);
        info->key.u.ipv4.src = fl4.saddr;
        return 0;
}

static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
                              struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        const struct iphdr *tnl_params;

        if (tunnel->collect_md) {
                gre_fb_xmit(skb, dev, skb->protocol);
                return NETDEV_TX_OK;
        }

        if (dev->header_ops) {
                /* Need space for new headers */
                if (skb_cow_head(skb, dev->needed_headroom -
                                      (tunnel->hlen + sizeof(struct iphdr))))
                        goto free_skb;

                tnl_params = (const struct iphdr *)skb->data;

                /* Pull skb since ip_tunnel_xmit() needs skb->data pointing
                 * to gre header.
                 */
                skb_pull(skb, tunnel->hlen + sizeof(struct iphdr));
                skb_reset_mac_header(skb);
        } else {
                if (skb_cow_head(skb, dev->needed_headroom))
                        goto free_skb;

                tnl_params = &tunnel->parms.iph;
        }

        if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
                goto free_skb;

        __gre_xmit(skb, dev, tnl_params, skb->protocol);
        return NETDEV_TX_OK;

free_skb:
        kfree_skb(skb);
        dev->stats.tx_dropped++;
        return NETDEV_TX_OK;
}

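/* Map the IP TOS byte to a 3-bit CoS value: drop the two ECN bits to
 * get the DSCP, then keep its top three (class selector) bits.
 */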
static inline u8 tos_to_cos(u8 tos)
{
        u8 dscp, cos;

        dscp = tos >> 2;
        cos = dscp >> 3;
        return cos;
}

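/* Build the ERSPAN header in front of the mirrored frame, encoding the
 * version, vlan tci, CoS, encapsulation type, truncation flag, session
 * ID and index.
 */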
static void erspan_build_header(struct sk_buff *skb,
                                __be32 id, u32 index, bool truncate)
{
        struct iphdr *iphdr = ip_hdr(skb);
        struct ethhdr *eth = eth_hdr(skb);
        enum erspan_encap_type enc_type;
        struct erspanhdr *ershdr;
        struct qtag_prefix {
                __be16 eth_type;
                __be16 tci;
        } *qp;
        u16 vlan_tci = 0;

        enc_type = ERSPAN_ENCAP_NOVLAN;

        /* If the mirrored packet has a vlan tag, extract the tci and
         * preserve the vlan header in the mirrored frame.
         */
        if (eth->h_proto == htons(ETH_P_8021Q)) {
                qp = (struct qtag_prefix *)(skb->data + 2 * ETH_ALEN);
                vlan_tci = ntohs(qp->tci);
                enc_type = ERSPAN_ENCAP_INFRAME;
        }

        skb_push(skb, sizeof(*ershdr));
        ershdr = (struct erspanhdr *)skb->data;
        memset(ershdr, 0, sizeof(*ershdr));

        ershdr->ver_vlan = htons((vlan_tci & VLAN_MASK) |
                                 (ERSPAN_VERSION << VER_OFFSET));
        ershdr->session_id = htons((u16)(ntohl(id) & ID_MASK) |
                           ((tos_to_cos(iphdr->tos) << COS_OFFSET) & COS_MASK) |
                           (enc_type << EN_OFFSET & EN_MASK) |
                           ((truncate << T_OFFSET) & T_MASK));
        ershdr->md.index = htonl(index & INDEX_MASK);
}

static netdev_tx_t erspan_xmit(struct sk_buff *skb,
                               struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        bool truncate = false;

        if (tunnel->collect_md) {
                erspan_fb_xmit(skb, dev, skb->protocol);
                return NETDEV_TX_OK;
        }

        if (gre_handle_offloads(skb, false))
                goto free_skb;

        if (skb_cow_head(skb, dev->needed_headroom))
                goto free_skb;

        if (skb->len > dev->mtu) {
                pskb_trim(skb, dev->mtu);
                truncate = true;
        }

        /* Push ERSPAN header */
        erspan_build_header(skb, tunnel->parms.o_key, tunnel->index, truncate);
        tunnel->parms.o_flags &= ~TUNNEL_KEY;
        __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_ERSPAN));
        return NETDEV_TX_OK;

free_skb:
        kfree_skb(skb);
        dev->stats.tx_dropped++;
        return NETDEV_TX_OK;
}

static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
                                struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        if (tunnel->collect_md) {
                gre_fb_xmit(skb, dev, htons(ETH_P_TEB));
                return NETDEV_TX_OK;
        }

        if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
                goto free_skb;

        if (skb_cow_head(skb, dev->needed_headroom))
                goto free_skb;

        __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB));
        return NETDEV_TX_OK;

free_skb:
        kfree_skb(skb);
        dev->stats.tx_dropped++;
        return NETDEV_TX_OK;
}

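/* Legacy ioctl interface (SIOCADDTUNNEL and friends): validate the
 * user-supplied parameters, translate GRE wire flags to and from the
 * internal TUNNEL_* flags, and defer to ip_tunnel_ioctl().
 */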
static int ipgre_tunnel_ioctl(struct net_device *dev,
                              struct ifreq *ifr, int cmd)
{
        int err;
        struct ip_tunnel_parm p;

        if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
                return -EFAULT;
        if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
                if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
                    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
                    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
                        return -EINVAL;
        }
        p.i_flags = gre_flags_to_tnl_flags(p.i_flags);
        p.o_flags = gre_flags_to_tnl_flags(p.o_flags);

        err = ip_tunnel_ioctl(dev, &p, cmd);
        if (err)
                return err;

        p.i_flags = gre_tnl_flags_to_gre_flags(p.i_flags);
        p.o_flags = gre_tnl_flags_to_gre_flags(p.o_flags);

        if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
                return -EFAULT;
        return 0;
}

/* Nice toy. Unfortunately, useless in real life :-)
   It allows constructing a virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.

   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have the impression that Cisco could make something similar,
   but this feature is apparently missing in IOS<=11.2(8).

   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
   with broadcast 224.66.66.66. If you have access to mbone, play with me :-)

   ping -t 255 224.66.66.66

   If nobody answers, mbone does not work.

   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
   ip addr add 10.66.66.<somewhat>/24 dev Universe
   ifconfig Universe up
   ifconfig Universe add fe80::<Your_real_addr>/10
   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
   ftp 10.66.66.66
   ...
   ftp fec0:6666:6666::193.233.7.65
   ...
 */
static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
                        unsigned short type,
                        const void *daddr, const void *saddr, unsigned int len)
{
        struct ip_tunnel *t = netdev_priv(dev);
        struct iphdr *iph;
        struct gre_base_hdr *greh;

        iph = skb_push(skb, t->hlen + sizeof(*iph));
        greh = (struct gre_base_hdr *)(iph+1);
        greh->flags = gre_tnl_flags_to_gre_flags(t->parms.o_flags);
        greh->protocol = htons(type);

        memcpy(iph, &t->parms.iph, sizeof(struct iphdr));

        /* Set the source hardware address. */
        if (saddr)
                memcpy(&iph->saddr, saddr, 4);
        if (daddr)
                memcpy(&iph->daddr, daddr, 4);
        if (iph->daddr)
                return t->hlen + sizeof(*iph);

        return -(t->hlen + sizeof(*iph));
}

static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
{
        const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);

        memcpy(haddr, &iph->saddr, 4);
        return 4;
}

static const struct header_ops ipgre_header_ops = {
        .create = ipgre_header,
        .parse  = ipgre_header_parse,
};

#ifdef CONFIG_NET_IPGRE_BROADCAST
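/* For a multicast tunnel, resolve the route to the "remote" group
 * address and join the group on the resolved device; remember the
 * ifindex so ipgre_close() can leave the group again.
 */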
static int ipgre_open(struct net_device *dev)
{
        struct ip_tunnel *t = netdev_priv(dev);

        if (ipv4_is_multicast(t->parms.iph.daddr)) {
                struct flowi4 fl4;
                struct rtable *rt;

                rt = ip_route_output_gre(t->net, &fl4,
                                         t->parms.iph.daddr,
                                         t->parms.iph.saddr,
                                         t->parms.o_key,
                                         RT_TOS(t->parms.iph.tos),
                                         t->parms.link);
                if (IS_ERR(rt))
                        return -EADDRNOTAVAIL;
                dev = rt->dst.dev;
                ip_rt_put(rt);
                if (!__in_dev_get_rtnl(dev))
                        return -EADDRNOTAVAIL;
                t->mlink = dev->ifindex;
                ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
        }
        return 0;
}

static int ipgre_close(struct net_device *dev)
{
        struct ip_tunnel *t = netdev_priv(dev);

        if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
                struct in_device *in_dev;

                in_dev = inetdev_by_index(t->net, t->mlink);
                if (in_dev)
                        ip_mc_dec_group(in_dev, t->parms.iph.daddr);
        }
        return 0;
}
#endif

static const struct net_device_ops ipgre_netdev_ops = {
        .ndo_init               = ipgre_tunnel_init,
        .ndo_uninit             = ip_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
        .ndo_open               = ipgre_open,
        .ndo_stop               = ipgre_close,
#endif
        .ndo_start_xmit         = ipgre_xmit,
        .ndo_do_ioctl           = ipgre_tunnel_ioctl,
        .ndo_change_mtu         = ip_tunnel_change_mtu,
        .ndo_get_stats64        = ip_tunnel_get_stats64,
        .ndo_get_iflink         = ip_tunnel_get_iflink,
};

#define GRE_FEATURES (NETIF_F_SG |              \
                      NETIF_F_FRAGLIST |        \
                      NETIF_F_HIGHDMA |         \
                      NETIF_F_HW_CSUM)

static void ipgre_tunnel_setup(struct net_device *dev)
{
        dev->netdev_ops         = &ipgre_netdev_ops;
        dev->type               = ARPHRD_IPGRE;
        ip_tunnel_setup(dev, ipgre_net_id);
}

static void __gre_tunnel_init(struct net_device *dev)
{
        struct ip_tunnel *tunnel;
        int t_hlen;

        tunnel = netdev_priv(dev);
        tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
        tunnel->parms.iph.protocol = IPPROTO_GRE;

        tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;

        t_hlen = tunnel->hlen + sizeof(struct iphdr);

        dev->needed_headroom    = LL_MAX_HEADER + t_hlen + 4;
        dev->mtu                = ETH_DATA_LEN - t_hlen - 4;

        dev->features           |= GRE_FEATURES;
        dev->hw_features        |= GRE_FEATURES;

        if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
                /* TCP offload with GRE SEQ is not supported, nor can we
                 * support 2 levels of outer headers requiring an update.
                 */
                if (!(tunnel->parms.o_flags & TUNNEL_CSUM) ||
                    (tunnel->encap.type == TUNNEL_ENCAP_NONE)) {
                        dev->features    |= NETIF_F_GSO_SOFTWARE;
                        dev->hw_features |= NETIF_F_GSO_SOFTWARE;
                }

                /* Can use a lockless transmit, unless we generate
                 * output sequences.
                 */
                dev->features |= NETIF_F_LLTX;
        }
}

static int ipgre_tunnel_init(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct iphdr *iph = &tunnel->parms.iph;

        __gre_tunnel_init(dev);

        memcpy(dev->dev_addr, &iph->saddr, 4);
        memcpy(dev->broadcast, &iph->daddr, 4);

        dev->flags              = IFF_NOARP;
        netif_keep_dst(dev);
        dev->addr_len           = 4;

        if (iph->daddr && !tunnel->collect_md) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
                if (ipv4_is_multicast(iph->daddr)) {
                        if (!iph->saddr)
                                return -EINVAL;
                        dev->flags = IFF_BROADCAST;
                        dev->header_ops = &ipgre_header_ops;
                }
#endif
        } else if (!tunnel->collect_md) {
                dev->header_ops = &ipgre_header_ops;
        }

        return ip_tunnel_init(dev);
}

static const struct gre_protocol ipgre_protocol = {
        .handler     = gre_rcv,
        .err_handler = gre_err,
};

static int __net_init ipgre_init_net(struct net *net)
{
        return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL);
}

static void __net_exit ipgre_exit_net(struct net *net)
{
        struct ip_tunnel_net *itn = net_generic(net, ipgre_net_id);

        ip_tunnel_delete_net(itn, &ipgre_link_ops);
}

static struct pernet_operations ipgre_net_ops = {
        .init = ipgre_init_net,
        .exit = ipgre_exit_net,
        .id   = &ipgre_net_id,
        .size = sizeof(struct ip_tunnel_net),
};

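/* rtnl validation for plain GRE links: the GRE_VERSION and GRE_ROUTING
 * wire flags are unsupported, and collect_md can only be combined with
 * TUNNEL_ENCAP_NONE.
 */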
static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[],
                                 struct netlink_ext_ack *extack)
{
        __be16 flags;

        if (!data)
                return 0;

        flags = 0;
        if (data[IFLA_GRE_IFLAGS])
                flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
        if (data[IFLA_GRE_OFLAGS])
                flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
        if (flags & (GRE_VERSION|GRE_ROUTING))
                return -EINVAL;

        if (data[IFLA_GRE_COLLECT_METADATA] &&
            data[IFLA_GRE_ENCAP_TYPE] &&
            nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]) != TUNNEL_ENCAP_NONE)
                return -EINVAL;

        return 0;
}

static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[],
                              struct netlink_ext_ack *extack)
{
        __be32 daddr;

        if (tb[IFLA_ADDRESS]) {
                if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
                        return -EINVAL;
                if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
                        return -EADDRNOTAVAIL;
        }

        if (!data)
                goto out;

        if (data[IFLA_GRE_REMOTE]) {
                memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
                if (!daddr)
                        return -EINVAL;
        }

out:
        return ipgre_tunnel_validate(tb, data, extack);
}

static int erspan_validate(struct nlattr *tb[], struct nlattr *data[],
                           struct netlink_ext_ack *extack)
{
        __be16 flags = 0;
        int ret;

        if (!data)
                return 0;

        ret = ipgre_tap_validate(tb, data, extack);
        if (ret)
                return ret;

        /* ERSPAN should only have the GRE sequence and key flags */
        if (data[IFLA_GRE_OFLAGS])
                flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
        if (data[IFLA_GRE_IFLAGS])
                flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
        if (!data[IFLA_GRE_COLLECT_METADATA] &&
            flags != (GRE_SEQ | GRE_KEY))
                return -EINVAL;

        /* The ERSPAN session ID is only 10 bits. Since we reuse the
         * 32-bit key field as the ID, check its range.
         */
        if (data[IFLA_GRE_IKEY] &&
            (ntohl(nla_get_be32(data[IFLA_GRE_IKEY])) & ~ID_MASK))
                return -EINVAL;

        if (data[IFLA_GRE_OKEY] &&
            (ntohl(nla_get_be32(data[IFLA_GRE_OKEY])) & ~ID_MASK))
                return -EINVAL;

        return 0;
}

static int ipgre_netlink_parms(struct net_device *dev,
                                struct nlattr *data[],
                                struct nlattr *tb[],
                                struct ip_tunnel_parm *parms,
                                __u32 *fwmark)
{
        struct ip_tunnel *t = netdev_priv(dev);

        memset(parms, 0, sizeof(*parms));

        parms->iph.protocol = IPPROTO_GRE;

        if (!data)
                return 0;

        if (data[IFLA_GRE_LINK])
                parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

        if (data[IFLA_GRE_IFLAGS])
                parms->i_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_IFLAGS]));

        if (data[IFLA_GRE_OFLAGS])
                parms->o_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_OFLAGS]));

        if (data[IFLA_GRE_IKEY])
                parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

        if (data[IFLA_GRE_OKEY])
                parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

        if (data[IFLA_GRE_LOCAL])
                parms->iph.saddr = nla_get_in_addr(data[IFLA_GRE_LOCAL]);

        if (data[IFLA_GRE_REMOTE])
                parms->iph.daddr = nla_get_in_addr(data[IFLA_GRE_REMOTE]);

        if (data[IFLA_GRE_TTL])
                parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

        if (data[IFLA_GRE_TOS])
                parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

        if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC])) {
                if (t->ignore_df)
                        return -EINVAL;
                parms->iph.frag_off = htons(IP_DF);
        }

        if (data[IFLA_GRE_COLLECT_METADATA]) {
                t->collect_md = true;
                if (dev->type == ARPHRD_IPGRE)
                        dev->type = ARPHRD_NONE;
        }

        if (data[IFLA_GRE_IGNORE_DF]) {
                if (nla_get_u8(data[IFLA_GRE_IGNORE_DF])
                  && (parms->iph.frag_off & htons(IP_DF)))
                        return -EINVAL;
                t->ignore_df = !!nla_get_u8(data[IFLA_GRE_IGNORE_DF]);
        }

        if (data[IFLA_GRE_FWMARK])
                *fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]);

        if (data[IFLA_GRE_ERSPAN_INDEX]) {
                t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]);

                if (t->index & ~INDEX_MASK)
                        return -EINVAL;
        }

        return 0;
}

/* This function returns true when ENCAP attributes are present in the nl msg */
static bool ipgre_netlink_encap_parms(struct nlattr *data[],
                                      struct ip_tunnel_encap *ipencap)
{
        bool ret = false;

        memset(ipencap, 0, sizeof(*ipencap));

        if (!data)
                return ret;

        if (data[IFLA_GRE_ENCAP_TYPE]) {
                ret = true;
                ipencap->type = nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]);
        }

        if (data[IFLA_GRE_ENCAP_FLAGS]) {
                ret = true;
                ipencap->flags = nla_get_u16(data[IFLA_GRE_ENCAP_FLAGS]);
        }

        if (data[IFLA_GRE_ENCAP_SPORT]) {
                ret = true;
                ipencap->sport = nla_get_be16(data[IFLA_GRE_ENCAP_SPORT]);
        }

        if (data[IFLA_GRE_ENCAP_DPORT]) {
                ret = true;
                ipencap->dport = nla_get_be16(data[IFLA_GRE_ENCAP_DPORT]);
        }

        return ret;
}

static int gre_tap_init(struct net_device *dev)
{
        __gre_tunnel_init(dev);
        dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;

        return ip_tunnel_init(dev);
}

static const struct net_device_ops gre_tap_netdev_ops = {
        .ndo_init               = gre_tap_init,
        .ndo_uninit             = ip_tunnel_uninit,
        .ndo_start_xmit         = gre_tap_xmit,
        .ndo_set_mac_address    = eth_mac_addr,
        .ndo_validate_addr      = eth_validate_addr,
        .ndo_change_mtu         = ip_tunnel_change_mtu,
        .ndo_get_stats64        = ip_tunnel_get_stats64,
        .ndo_get_iflink         = ip_tunnel_get_iflink,
        .ndo_fill_metadata_dst  = gre_fill_metadata_dst,
};

static int erspan_tunnel_init(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        int t_hlen;

        tunnel->tun_hlen = 8;
        tunnel->parms.iph.protocol = IPPROTO_GRE;
        t_hlen = tunnel->hlen + sizeof(struct iphdr) + sizeof(struct erspanhdr);

        dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4;
        dev->mtu = ETH_DATA_LEN - t_hlen - 4;
        dev->features           |= GRE_FEATURES;
        dev->hw_features        |= GRE_FEATURES;
        dev->priv_flags         |= IFF_LIVE_ADDR_CHANGE;

        return ip_tunnel_init(dev);
}

static const struct net_device_ops erspan_netdev_ops = {
        .ndo_init               = erspan_tunnel_init,
        .ndo_uninit             = ip_tunnel_uninit,
        .ndo_start_xmit         = erspan_xmit,
        .ndo_set_mac_address    = eth_mac_addr,
        .ndo_validate_addr      = eth_validate_addr,
        .ndo_change_mtu         = ip_tunnel_change_mtu,
        .ndo_get_stats64        = ip_tunnel_get_stats64,
        .ndo_get_iflink         = ip_tunnel_get_iflink,
        .ndo_fill_metadata_dst  = gre_fill_metadata_dst,
};

static void ipgre_tap_setup(struct net_device *dev)
{
        ether_setup(dev);
        dev->netdev_ops = &gre_tap_netdev_ops;
        dev->priv_flags &= ~IFF_TX_SKB_SHARING;
        dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
        ip_tunnel_setup(dev, gre_tap_net_id);
}

static int ipgre_newlink(struct net *src_net, struct net_device *dev,
                         struct nlattr *tb[], struct nlattr *data[],
                         struct netlink_ext_ack *extack)
{
        struct ip_tunnel_parm p;
        struct ip_tunnel_encap ipencap;
        __u32 fwmark = 0;
        int err;

        if (ipgre_netlink_encap_parms(data, &ipencap)) {
                struct ip_tunnel *t = netdev_priv(dev);

                err = ip_tunnel_encap_setup(t, &ipencap);
                if (err < 0)
                        return err;
        }

        err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);
        if (err < 0)
                return err;
        return ip_tunnel_newlink(dev, tb, &p, fwmark);
}

static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
                            struct nlattr *data[],
                            struct netlink_ext_ack *extack)
{
        struct ip_tunnel *t = netdev_priv(dev);
        struct ip_tunnel_parm p;
        struct ip_tunnel_encap ipencap;
        __u32 fwmark = t->fwmark;
        int err;

        if (ipgre_netlink_encap_parms(data, &ipencap)) {
                err = ip_tunnel_encap_setup(t, &ipencap);
                if (err < 0)
                        return err;
        }

        err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);
        if (err < 0)
                return err;
        return ip_tunnel_changelink(dev, tb, &p, fwmark);
}

static size_t ipgre_get_size(const struct net_device *dev)
{
        return
                /* IFLA_GRE_LINK */
                nla_total_size(4) +
                /* IFLA_GRE_IFLAGS */
                nla_total_size(2) +
                /* IFLA_GRE_OFLAGS */
                nla_total_size(2) +
                /* IFLA_GRE_IKEY */
                nla_total_size(4) +
                /* IFLA_GRE_OKEY */
                nla_total_size(4) +
                /* IFLA_GRE_LOCAL */
                nla_total_size(4) +
                /* IFLA_GRE_REMOTE */
                nla_total_size(4) +
                /* IFLA_GRE_TTL */
                nla_total_size(1) +
                /* IFLA_GRE_TOS */
                nla_total_size(1) +
                /* IFLA_GRE_PMTUDISC */
                nla_total_size(1) +
                /* IFLA_GRE_ENCAP_TYPE */
                nla_total_size(2) +
                /* IFLA_GRE_ENCAP_FLAGS */
                nla_total_size(2) +
                /* IFLA_GRE_ENCAP_SPORT */
                nla_total_size(2) +
                /* IFLA_GRE_ENCAP_DPORT */
                nla_total_size(2) +
                /* IFLA_GRE_COLLECT_METADATA */
                nla_total_size(0) +
                /* IFLA_GRE_IGNORE_DF */
                nla_total_size(1) +
                /* IFLA_GRE_FWMARK */
                nla_total_size(4) +
                /* IFLA_GRE_ERSPAN_INDEX */
                nla_total_size(4) +
                0;
}

static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
        struct ip_tunnel *t = netdev_priv(dev);
        struct ip_tunnel_parm *p = &t->parms;

        if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
            nla_put_be16(skb, IFLA_GRE_IFLAGS,
                         gre_tnl_flags_to_gre_flags(p->i_flags)) ||
            nla_put_be16(skb, IFLA_GRE_OFLAGS,
                         gre_tnl_flags_to_gre_flags(p->o_flags)) ||
            nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
            nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
            nla_put_in_addr(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
            nla_put_in_addr(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
            nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
            nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
            nla_put_u8(skb, IFLA_GRE_PMTUDISC,
                       !!(p->iph.frag_off & htons(IP_DF))) ||
            nla_put_u32(skb, IFLA_GRE_FWMARK, t->fwmark))
                goto nla_put_failure;

        if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE,
                        t->encap.type) ||
            nla_put_be16(skb, IFLA_GRE_ENCAP_SPORT,
                         t->encap.sport) ||
            nla_put_be16(skb, IFLA_GRE_ENCAP_DPORT,
                         t->encap.dport) ||
            nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS,
                        t->encap.flags))
                goto nla_put_failure;

        if (nla_put_u8(skb, IFLA_GRE_IGNORE_DF, t->ignore_df))
                goto nla_put_failure;

        if (t->collect_md) {
                if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA))
                        goto nla_put_failure;
        }

        if (t->index)
                if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index))
                        goto nla_put_failure;

        return 0;

nla_put_failure:
        return -EMSGSIZE;
}

static void erspan_setup(struct net_device *dev)
{
        ether_setup(dev);
        dev->netdev_ops = &erspan_netdev_ops;
        dev->priv_flags &= ~IFF_TX_SKB_SHARING;
        dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
        ip_tunnel_setup(dev, erspan_net_id);
}

static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
        [IFLA_GRE_LINK]         = { .type = NLA_U32 },
        [IFLA_GRE_IFLAGS]       = { .type = NLA_U16 },
        [IFLA_GRE_OFLAGS]       = { .type = NLA_U16 },
        [IFLA_GRE_IKEY]         = { .type = NLA_U32 },
        [IFLA_GRE_OKEY]         = { .type = NLA_U32 },
        [IFLA_GRE_LOCAL]        = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
        [IFLA_GRE_REMOTE]       = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
        [IFLA_GRE_TTL]          = { .type = NLA_U8 },
        [IFLA_GRE_TOS]          = { .type = NLA_U8 },
        [IFLA_GRE_PMTUDISC]     = { .type = NLA_U8 },
        [IFLA_GRE_ENCAP_TYPE]   = { .type = NLA_U16 },
        [IFLA_GRE_ENCAP_FLAGS]  = { .type = NLA_U16 },
        [IFLA_GRE_ENCAP_SPORT]  = { .type = NLA_U16 },
        [IFLA_GRE_ENCAP_DPORT]  = { .type = NLA_U16 },
        [IFLA_GRE_COLLECT_METADATA]     = { .type = NLA_FLAG },
        [IFLA_GRE_IGNORE_DF]    = { .type = NLA_U8 },
        [IFLA_GRE_FWMARK]       = { .type = NLA_U32 },
        [IFLA_GRE_ERSPAN_INDEX] = { .type = NLA_U32 },
};

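/* Example iproute2 usage for the three link types defined below
 * (illustrative only; exact syntax depends on the iproute2 version):
 *
 *   ip link add gre1 type gre    local 10.0.0.1 remote 10.0.0.2 ttl 64
 *   ip link add tap1 type gretap local 10.0.0.1 remote 10.0.0.2 key 42
 *   ip link add ers1 type erspan local 10.0.0.1 remote 10.0.0.2 \
 *              seq key 10 erspan 123
 */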
static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
        .kind           = "gre",
        .maxtype        = IFLA_GRE_MAX,
        .policy         = ipgre_policy,
        .priv_size      = sizeof(struct ip_tunnel),
        .setup          = ipgre_tunnel_setup,
        .validate       = ipgre_tunnel_validate,
        .newlink        = ipgre_newlink,
        .changelink     = ipgre_changelink,
        .dellink        = ip_tunnel_dellink,
        .get_size       = ipgre_get_size,
        .fill_info      = ipgre_fill_info,
        .get_link_net   = ip_tunnel_get_link_net,
};

static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
        .kind           = "gretap",
        .maxtype        = IFLA_GRE_MAX,
        .policy         = ipgre_policy,
        .priv_size      = sizeof(struct ip_tunnel),
        .setup          = ipgre_tap_setup,
        .validate       = ipgre_tap_validate,
        .newlink        = ipgre_newlink,
        .changelink     = ipgre_changelink,
        .dellink        = ip_tunnel_dellink,
        .get_size       = ipgre_get_size,
        .fill_info      = ipgre_fill_info,
        .get_link_net   = ip_tunnel_get_link_net,
};

static struct rtnl_link_ops erspan_link_ops __read_mostly = {
        .kind           = "erspan",
        .maxtype        = IFLA_GRE_MAX,
        .policy         = ipgre_policy,
        .priv_size      = sizeof(struct ip_tunnel),
        .setup          = erspan_setup,
        .validate       = erspan_validate,
        .newlink        = ipgre_newlink,
        .changelink     = ipgre_changelink,
        .dellink        = ip_tunnel_dellink,
        .get_size       = ipgre_get_size,
        .fill_info      = ipgre_fill_info,
        .get_link_net   = ip_tunnel_get_link_net,
};

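/* Create a flow-based (collect_md) gretap device on behalf of callers
 * such as openvswitch; the device is configured with the largest
 * usable MTU.
 */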
struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
                                        u8 name_assign_type)
{
        struct nlattr *tb[IFLA_MAX + 1];
        struct net_device *dev;
        LIST_HEAD(list_kill);
        struct ip_tunnel *t;
        int err;

        memset(&tb, 0, sizeof(tb));

        dev = rtnl_create_link(net, name, name_assign_type,
                               &ipgre_tap_ops, tb);
        if (IS_ERR(dev))
                return dev;

        /* Configure flow based GRE device. */
        t = netdev_priv(dev);
        t->collect_md = true;

        err = ipgre_newlink(net, dev, tb, NULL, NULL);
        if (err < 0) {
                free_netdev(dev);
                return ERR_PTR(err);
        }

        /* openvswitch users expect packet sizes to be unrestricted,
         * so set the largest MTU we can.
         */
        err = __ip_tunnel_change_mtu(dev, IP_MAX_MTU, false);
        if (err)
                goto out;

        err = rtnl_configure_link(dev, NULL);
        if (err < 0)
                goto out;

        return dev;
out:
        ip_tunnel_dellink(dev, &list_kill);
        unregister_netdevice_many(&list_kill);
        return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(gretap_fb_dev_create);

static int __net_init ipgre_tap_init_net(struct net *net)
{
        return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0");
}

static void __net_exit ipgre_tap_exit_net(struct net *net)
{
        struct ip_tunnel_net *itn = net_generic(net, gre_tap_net_id);

        ip_tunnel_delete_net(itn, &ipgre_tap_ops);
}

static struct pernet_operations ipgre_tap_net_ops = {
        .init = ipgre_tap_init_net,
        .exit = ipgre_tap_exit_net,
        .id   = &gre_tap_net_id,
        .size = sizeof(struct ip_tunnel_net),
};

static int __net_init erspan_init_net(struct net *net)
{
        return ip_tunnel_init_net(net, erspan_net_id,
                                  &erspan_link_ops, "erspan0");
}

static void __net_exit erspan_exit_net(struct net *net)
{
        struct ip_tunnel_net *itn = net_generic(net, erspan_net_id);

        ip_tunnel_delete_net(itn, &erspan_link_ops);
}

static struct pernet_operations erspan_net_ops = {
        .init = erspan_init_net,
        .exit = erspan_exit_net,
        .id   = &erspan_net_id,
        .size = sizeof(struct ip_tunnel_net),
};

static int __init ipgre_init(void)
{
        int err;

        pr_info("GRE over IPv4 tunneling driver\n");

        err = register_pernet_device(&ipgre_net_ops);
        if (err < 0)
                return err;

        err = register_pernet_device(&ipgre_tap_net_ops);
        if (err < 0)
                goto pnet_tap_failed;

        err = register_pernet_device(&erspan_net_ops);
        if (err < 0)
                goto pnet_erspan_failed;

        err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
        if (err < 0) {
                pr_info("%s: can't add protocol\n", __func__);
                goto add_proto_failed;
        }

        err = rtnl_link_register(&ipgre_link_ops);
        if (err < 0)
                goto rtnl_link_failed;

        err = rtnl_link_register(&ipgre_tap_ops);
        if (err < 0)
                goto tap_ops_failed;

        err = rtnl_link_register(&erspan_link_ops);
        if (err < 0)
                goto erspan_link_failed;

        return 0;

erspan_link_failed:
        rtnl_link_unregister(&ipgre_tap_ops);
tap_ops_failed:
        rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
        gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
add_proto_failed:
        unregister_pernet_device(&erspan_net_ops);
pnet_erspan_failed:
        unregister_pernet_device(&ipgre_tap_net_ops);
pnet_tap_failed:
        unregister_pernet_device(&ipgre_net_ops);
        return err;
}

static void __exit ipgre_fini(void)
{
        rtnl_link_unregister(&ipgre_tap_ops);
        rtnl_link_unregister(&ipgre_link_ops);
        rtnl_link_unregister(&erspan_link_ops);
        gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
        unregister_pernet_device(&ipgre_tap_net_ops);
        unregister_pernet_device(&ipgre_net_ops);
        unregister_pernet_device(&erspan_net_ops);
}

module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
MODULE_ALIAS_RTNL_LINK("erspan");
MODULE_ALIAS_NETDEV("gre0");
MODULE_ALIAS_NETDEV("gretap0");
MODULE_ALIAS_NETDEV("erspan0");