Merge tag 'nfs-for-4.11-1' of git://git.linux-nfs.org/projects/anna/linux-nfs
[platform/kernel/linux-rpi.git] / net / ipv4 / ip_tunnel.c
1 /*
2  * Copyright (c) 2013 Nicira, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16  * 02110-1301, USA
17  */
18
19 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20
21 #include <linux/capability.h>
22 #include <linux/module.h>
23 #include <linux/types.h>
24 #include <linux/kernel.h>
25 #include <linux/slab.h>
26 #include <linux/uaccess.h>
27 #include <linux/skbuff.h>
28 #include <linux/netdevice.h>
29 #include <linux/in.h>
30 #include <linux/tcp.h>
31 #include <linux/udp.h>
32 #include <linux/if_arp.h>
33 #include <linux/init.h>
34 #include <linux/in6.h>
35 #include <linux/inetdevice.h>
36 #include <linux/igmp.h>
37 #include <linux/netfilter_ipv4.h>
38 #include <linux/etherdevice.h>
39 #include <linux/if_ether.h>
40 #include <linux/if_vlan.h>
41 #include <linux/rculist.h>
42 #include <linux/err.h>
43
44 #include <net/sock.h>
45 #include <net/ip.h>
46 #include <net/icmp.h>
47 #include <net/protocol.h>
48 #include <net/ip_tunnels.h>
49 #include <net/arp.h>
50 #include <net/checksum.h>
51 #include <net/dsfield.h>
52 #include <net/inet_ecn.h>
53 #include <net/xfrm.h>
54 #include <net/net_namespace.h>
55 #include <net/netns/generic.h>
56 #include <net/rtnetlink.h>
57 #include <net/udp.h>
58 #include <net/dst_metadata.h>
59
60 #if IS_ENABLED(CONFIG_IPV6)
61 #include <net/ipv6.h>
62 #include <net/ip6_fib.h>
63 #include <net/ip6_route.h>
64 #endif
65
66 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
67 {
68         return hash_32((__force u32)key ^ (__force u32)remote,
69                          IP_TNL_HASH_BITS);
70 }
71
72 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
73                                 __be16 flags, __be32 key)
74 {
75         if (p->i_flags & TUNNEL_KEY) {
76                 if (flags & TUNNEL_KEY)
77                         return key == p->i_key;
78                 else
79                         /* key expected, none present */
80                         return false;
81         } else
82                 return !(flags & TUNNEL_KEY);
83 }
84
85 /* Fallback tunnel: no source, no destination, no key, no options
86
87    Tunnel hash table:
88    We require exact key match i.e. if a key is present in packet
89    it will match only tunnel with the same key; if it is not present,
90    it will match only keyless tunnel.
91
   All keyless packets, if not matched against configured keyless tunnels,
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for an input packet.
*/
/* Look up the receiving tunnel for a decapsulated packet.
 *
 * Matching runs in decreasing order of specificity:
 *   1. exact (saddr, daddr) match in the (key, remote) bucket;
 *   2. daddr-only match (tunnel with wildcard source address);
 *   3. saddr-only match, or multicast destination, in the (key, 0) bucket;
 *   4. key-only match (both endpoints wildcarded), skipped for
 *      TUNNEL_NO_KEY lookups.
 * In each pass a tunnel on the same @link wins outright; otherwise the
 * first cross-link match is remembered as a candidate.  With no match at
 * all, fall back to the collect_md tunnel and then the fallback device
 * (if up).  Uses RCU list traversal/rcu_dereference, so the caller must
 * be in an RCU read-side section.  Returns NULL when nothing matches.
 */
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	unsigned int hash;
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	/* Pass 1: fully specified tunnels (both endpoints must match). */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	/* Pass 2: tunnels with a wildcard local address. */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	/* Passes 3 and 4 use the bucket for tunnels with no remote. */
	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	/* Pass 3: wildcard remote — match on our local address, or on a
	 * multicast destination address.
	 */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (flags & TUNNEL_NO_KEY)
		goto skip_key_lookup;

	/* Pass 4: key-only tunnels, both endpoints wildcarded. */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (t->parms.i_key != key ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

skip_key_lookup:
	if (cand)
		return cand;

	/* A collect_md (metadata-mode) tunnel accepts any packet. */
	t = rcu_dereference(itn->collect_md_tun);
	if (t)
		return t;

	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
		return netdev_priv(itn->fb_tunnel_dev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
188
189 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
190                                     struct ip_tunnel_parm *parms)
191 {
192         unsigned int h;
193         __be32 remote;
194         __be32 i_key = parms->i_key;
195
196         if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
197                 remote = parms->iph.daddr;
198         else
199                 remote = 0;
200
201         if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
202                 i_key = 0;
203
204         h = ip_tunnel_hash(i_key, remote);
205         return &itn->tunnels[h];
206 }
207
208 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
209 {
210         struct hlist_head *head = ip_bucket(itn, &t->parms);
211
212         if (t->collect_md)
213                 rcu_assign_pointer(itn->collect_md_tun, t);
214         hlist_add_head_rcu(&t->hash_node, head);
215 }
216
/* Unlink tunnel @t from the hash table (RCU-safe).  Clears the
 * netns-wide collect_md shortcut when @t was the metadata tunnel.
 */
static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, NULL);
	hlist_del_init_rcu(&t->hash_node);
}
223
224 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
225                                         struct ip_tunnel_parm *parms,
226                                         int type)
227 {
228         __be32 remote = parms->iph.daddr;
229         __be32 local = parms->iph.saddr;
230         __be32 key = parms->i_key;
231         __be16 flags = parms->i_flags;
232         int link = parms->link;
233         struct ip_tunnel *t = NULL;
234         struct hlist_head *head = ip_bucket(itn, parms);
235
236         hlist_for_each_entry_rcu(t, head, hash_node) {
237                 if (local == t->parms.iph.saddr &&
238                     remote == t->parms.iph.daddr &&
239                     link == t->parms.link &&
240                     type == t->dev->type &&
241                     ip_tunnel_key_match(&t->parms, flags, key))
242                         break;
243         }
244         return t;
245 }
246
/* Allocate and register a new tunnel net_device.
 *
 * If parms->name is empty, the name template "<kind>%d" is built from
 * the rtnl ops; the IFNAMSIZ - 3 check leaves room for "%d" plus the
 * NUL terminator.  Must be called under RTNL (asserted below).
 * Returns the registered device or an ERR_PTR() on failure.
 */
static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else {
		if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
			err = -E2BIG;
			goto failed;
		}
		strlcpy(name, ops->kind, IFNAMSIZ);
		strncat(name, "%d", 2);
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	/* Tunnel state lives in the netdev private area. */
	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}
292
293 static inline void init_tunnel_flow(struct flowi4 *fl4,
294                                     int proto,
295                                     __be32 daddr, __be32 saddr,
296                                     __be32 key, __u8 tos, int oif)
297 {
298         memset(fl4, 0, sizeof(*fl4));
299         fl4->flowi4_oif = oif;
300         fl4->daddr = daddr;
301         fl4->saddr = saddr;
302         fl4->flowi4_tos = tos;
303         fl4->flowi4_proto = proto;
304         fl4->fl4_gre_key = key;
305 }
306
/* Bind the tunnel to an underlying output device and compute its MTU.
 *
 * When a remote endpoint is configured, route towards it to discover
 * the egress device; otherwise fall back to the device identified by
 * parms.link.  needed_headroom is sized for the outer headers plus the
 * lower device's own requirements.  Returns the usable tunnel MTU,
 * clamped to a minimum of 68 (the IPv4 minimum MTU).
 */
static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
				 iph->saddr, tunnel->parms.o_key,
				 RT_TOS(iph->tos), tunnel->parms.link);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;

		/* Parameters may have changed; drop any cached route. */
		dst_cache_reset(&tunnel->dst_cache);
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}

	dev->needed_headroom = t_hlen + hlen;
	mtu -= (dev->hard_header_len + t_hlen);

	if (mtu < 68)
		mtu = 68;

	return mtu;
}
354
/* Create, register and hash a new tunnel from @parms, cloning the link
 * type of the per-netns fallback device.  Returns the new tunnel or an
 * ERR_PTR() propagated from device creation.
 */
static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm *parms)
{
	struct ip_tunnel *nt;
	struct net_device *dev;
	int t_hlen;

	BUG_ON(!itn->fb_tunnel_dev);
	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

	dev->mtu = ip_tunnel_bind_dev(dev);

	nt = netdev_priv(dev);
	t_hlen = nt->hlen + sizeof(struct iphdr);
	dev->min_mtu = ETH_MIN_MTU;
	/* 0xFFF8 keeps the outer datagram within the 16-bit IPv4 total
	 * length field, 8-byte aligned.
	 */
	dev->max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
	ip_tunnel_add(itn, nt);
	return nt;
}
377
/* Common receive path for IPv4 tunnels, called after the encapsulation
 * protocol has parsed the outer header into @tpi.
 *
 * Validates checksum/sequence expectations against the tunnel config,
 * decapsulates ECN, updates per-CPU stats and hands the packet to the
 * device's GRO cell.  Consumes @skb in all cases; always returns 0.
 */
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
		  bool log_ecn_error)
{
	struct pcpu_sw_netstats *tstats;
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	/* Packet and tunnel must agree on whether a checksum is in use. */
	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	/* With TUNNEL_SEQ, drop packets that arrive out of order (signed
	 * serial-number comparison handles wraparound).
	 */
	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	skb_reset_network_header(skb);

	/* err > 1 means the inner/outer ECN combination is invalid. */
	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	tstats = this_cpu_ptr(tunnel->dev->tstats);
	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += skb->len;
	u64_stats_update_end(&tstats->syncp);

	/* Scrub skb state when crossing a netns boundary. */
	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	/* Attach tunnel metadata for collect_md consumers, if provided. */
	if (tun_dst)
		skb_dst_set(skb, (struct dst_entry *)tun_dst);

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
450
451 int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
452                             unsigned int num)
453 {
454         if (num >= MAX_IPTUN_ENCAP_OPS)
455                 return -ERANGE;
456
457         return !cmpxchg((const struct ip_tunnel_encap_ops **)
458                         &iptun_encaps[num],
459                         NULL, ops) ? 0 : -1;
460 }
461 EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
462
463 int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
464                             unsigned int num)
465 {
466         int ret;
467
468         if (num >= MAX_IPTUN_ENCAP_OPS)
469                 return -ERANGE;
470
471         ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
472                        &iptun_encaps[num],
473                        ops, NULL) == ops) ? 0 : -1;
474
475         synchronize_net();
476
477         return ret;
478 }
479 EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
480
481 int ip_tunnel_encap_setup(struct ip_tunnel *t,
482                           struct ip_tunnel_encap *ipencap)
483 {
484         int hlen;
485
486         memset(&t->encap, 0, sizeof(t->encap));
487
488         hlen = ip_encap_hlen(ipencap);
489         if (hlen < 0)
490                 return hlen;
491
492         t->encap.type = ipencap->type;
493         t->encap.sport = ipencap->sport;
494         t->encap.dport = ipencap->dport;
495         t->encap.flags = ipencap->flags;
496
497         t->encap_hlen = hlen;
498         t->hlen = t->encap_hlen + t->tun_hlen;
499
500         return 0;
501 }
502 EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
503
/* Check the packet against the tunnel path MTU and propagate PMTU
 * information to the inner flow.
 *
 * Returns 0 if the packet may be sent, or -E2BIG after emitting an
 * ICMP(v6) "packet too big" error when a non-GSO packet exceeds the
 * MTU and must not be fragmented.
 */
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df,
			    const struct iphdr *inner_iph)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
	int mtu;

	if (df)
		/* DF set: outer path MTU minus our encapsulation overhead. */
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
					- sizeof(struct iphdr) - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		/* Record a reduced MTU on host routes (or for unicast
		 * tunnel destinations) so the IPv6 stack sees the
		 * tunnel path MTU.
		 */
		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}
553
554 void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
555 {
556         struct ip_tunnel *tunnel = netdev_priv(dev);
557         u32 headroom = sizeof(struct iphdr);
558         struct ip_tunnel_info *tun_info;
559         const struct ip_tunnel_key *key;
560         const struct iphdr *inner_iph;
561         struct rtable *rt;
562         struct flowi4 fl4;
563         __be16 df = 0;
564         u8 tos, ttl;
565
566         tun_info = skb_tunnel_info(skb);
567         if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
568                      ip_tunnel_info_af(tun_info) != AF_INET))
569                 goto tx_error;
570         key = &tun_info->key;
571         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
572         inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
573         tos = key->tos;
574         if (tos == 1) {
575                 if (skb->protocol == htons(ETH_P_IP))
576                         tos = inner_iph->tos;
577                 else if (skb->protocol == htons(ETH_P_IPV6))
578                         tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
579         }
580         init_tunnel_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0,
581                          RT_TOS(tos), tunnel->parms.link);
582         if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
583                 goto tx_error;
584         rt = ip_route_output_key(tunnel->net, &fl4);
585         if (IS_ERR(rt)) {
586                 dev->stats.tx_carrier_errors++;
587                 goto tx_error;
588         }
589         if (rt->dst.dev == dev) {
590                 ip_rt_put(rt);
591                 dev->stats.collisions++;
592                 goto tx_error;
593         }
594         tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
595         ttl = key->ttl;
596         if (ttl == 0) {
597                 if (skb->protocol == htons(ETH_P_IP))
598                         ttl = inner_iph->ttl;
599                 else if (skb->protocol == htons(ETH_P_IPV6))
600                         ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
601                 else
602                         ttl = ip4_dst_hoplimit(&rt->dst);
603         }
604         if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
605                 df = htons(IP_DF);
606         else if (skb->protocol == htons(ETH_P_IP))
607                 df = inner_iph->frag_off & htons(IP_DF);
608         headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
609         if (headroom > dev->needed_headroom)
610                 dev->needed_headroom = headroom;
611
612         if (skb_cow_head(skb, dev->needed_headroom)) {
613                 ip_rt_put(rt);
614                 goto tx_dropped;
615         }
616         iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, key->tos,
617                       key->ttl, df, !net_eq(tunnel->net, dev_net(dev)));
618         return;
619 tx_error:
620         dev->stats.tx_errors++;
621         goto kfree;
622 tx_dropped:
623         dev->stats.tx_dropped++;
624 kfree:
625         kfree_skb(skb);
626 }
627 EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
628
/* Main transmit path for configured (non-collect_md) IPv4 tunnels.
 *
 * Resolves the outer destination (including NBMA tunnels with no fixed
 * remote), routes the outer packet — using the per-tunnel dst cache
 * when the destination is fixed — enforces PMTU, derives outer TOS/TTL
 * per the inherit rules, and hands off to iptunnel_xmit().  Consumes
 * @skb on every path.
 */
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *inner_iph;
	struct flowi4 fl4;
	u8     tos, ttl;
	__be16 df;
	struct rtable *rt;		/* Route to the other host */
	unsigned int max_headroom;	/* The extra header space needed */
	__be32 dst;
	bool connected;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	/* "connected" tunnels (fixed remote) may use the dst cache. */
	connected = (tunnel->parms.iph.daddr != 0);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (!skb_dst(skb)) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		/* Derive the outer destination from the inner route. */
		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			/* Only v4-compatible IPv6 addresses embed an IPv4
			 * destination we can tunnel to.
			 */
			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		connected = false;
	}

	/* Low TOS bit requests inheritance from the inner header; an
	 * inherited TOS makes the route per-packet, so disable caching.
	 */
	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
			 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);

	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
		goto tx_error;

	rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) :
			 NULL;

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (connected)
			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
					  fl4.saddr);
	}

	if (rt->dst.dev == dev) {
		/* Route loops back through the tunnel device itself. */
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	/* Rate-limit link-failure notifications after ICMP errors. */
	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		/* ttl == 0: inherit from inner packet, else route default. */
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
		df |= (inner_iph->frag_off&htons(IP_DF));

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
793
/* Apply new parameters @p to an existing tunnel @t.
 *
 * Endpoints and key participate in hash bucket selection, so the
 * tunnel is removed from and re-added to the hash table around the
 * update.  Re-binds the underlying device when the link changes and
 * optionally refreshes the MTU.
 */
static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu)
{
	ip_tunnel_del(itn, t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		/* Non-Ethernet tunnels expose the endpoints as the
		 * device hardware/broadcast addresses.
		 */
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link) {
		int mtu;

		t->parms.link = p->link;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	/* Cached routes may be stale after a parameter change. */
	dst_cache_reset(&t->dst_cache);
	netdev_state_change(dev);
}
826
/* ip_tunnel_ioctl - shared SIOC{GET,ADD,CHG,DEL}TUNNEL handler for the IPv4
 * tunnel drivers built on this module.
 * @dev: device the ioctl was issued on; may be the per-netns fallback device
 * @p: tunnel parameters supplied by the caller (filled in for SIOCGETTUNNEL)
 * @cmd: SIOCGETTUNNEL, SIOCADDTUNNEL, SIOCCHGTUNNEL or SIOCDELTUNNEL
 *
 * Returns 0 on success or a negative errno.
 */
int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	BUG_ON(!itn->fb_tunnel_dev);
	switch (cmd) {
	case SIOCGETTUNNEL:
		/* On the fallback device, look up the tunnel matching @p and
		 * report its parameters; fall back to the fallback device's
		 * own parms when no match exists.
		 */
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		/* Force DF on the outer header whenever a fixed TTL is
		 * requested.
		 */
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		/* For non-VTI tunnels a key is only meaningful when the
		 * corresponding TUNNEL_KEY flag is set; clear stale values.
		 */
		if (!(p->i_flags & VTI_ISVTI)) {
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);

		if (cmd == SIOCADDTUNNEL) {
			/* ADD: create only when no tunnel matches @p. */
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			err = -EEXIST;
			break;
		}
		/* CHG on a regular (non-fallback) device: @p must either
		 * resolve to this very device, or match nothing — in which
		 * case the change may not flip the IFF_POINTOPOINT /
		 * IFF_BROADCAST flags implied by the new destination.
		 */
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		/* Deleting via the fallback device: resolve @p to the target
		 * tunnel; the fallback device itself may not be deleted.
		 */
		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
929
930 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
931 {
932         struct ip_tunnel *tunnel = netdev_priv(dev);
933         int t_hlen = tunnel->hlen + sizeof(struct iphdr);
934         int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
935
936         if (new_mtu < ETH_MIN_MTU)
937                 return -EINVAL;
938
939         if (new_mtu > max_mtu) {
940                 if (strict)
941                         return -EINVAL;
942
943                 new_mtu = max_mtu;
944         }
945
946         dev->mtu = new_mtu;
947         return 0;
948 }
949 EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
950
951 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
952 {
953         return __ip_tunnel_change_mtu(dev, new_mtu, true);
954 }
955 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
956
957 static void ip_tunnel_dev_free(struct net_device *dev)
958 {
959         struct ip_tunnel *tunnel = netdev_priv(dev);
960
961         gro_cells_destroy(&tunnel->gro_cells);
962         dst_cache_destroy(&tunnel->dst_cache);
963         free_percpu(dev->tstats);
964         free_netdev(dev);
965 }
966
967 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
968 {
969         struct ip_tunnel *tunnel = netdev_priv(dev);
970         struct ip_tunnel_net *itn;
971
972         itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
973
974         if (itn->fb_tunnel_dev != dev) {
975                 ip_tunnel_del(itn, netdev_priv(dev));
976                 unregister_netdevice_queue(dev, head);
977         }
978 }
979 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
980
981 struct net *ip_tunnel_get_link_net(const struct net_device *dev)
982 {
983         struct ip_tunnel *tunnel = netdev_priv(dev);
984
985         return tunnel->net;
986 }
987 EXPORT_SYMBOL(ip_tunnel_get_link_net);
988
989 int ip_tunnel_get_iflink(const struct net_device *dev)
990 {
991         struct ip_tunnel *tunnel = netdev_priv(dev);
992
993         return tunnel->parms.link;
994 }
995 EXPORT_SYMBOL(ip_tunnel_get_iflink);
996
997 int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
998                                   struct rtnl_link_ops *ops, char *devname)
999 {
1000         struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
1001         struct ip_tunnel_parm parms;
1002         unsigned int i;
1003
1004         for (i = 0; i < IP_TNL_HASH_SIZE; i++)
1005                 INIT_HLIST_HEAD(&itn->tunnels[i]);
1006
1007         if (!ops) {
1008                 itn->fb_tunnel_dev = NULL;
1009                 return 0;
1010         }
1011
1012         memset(&parms, 0, sizeof(parms));
1013         if (devname)
1014                 strlcpy(parms.name, devname, IFNAMSIZ);
1015
1016         rtnl_lock();
1017         itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
1018         /* FB netdevice is special: we have one, and only one per netns.
1019          * Allowing to move it to another netns is clearly unsafe.
1020          */
1021         if (!IS_ERR(itn->fb_tunnel_dev)) {
1022                 itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
1023                 itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
1024                 ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
1025         }
1026         rtnl_unlock();
1027
1028         return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
1029 }
1030 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
1031
/* Collect every device of tunnel type @ops onto @head for batched
 * unregistration: first all matching devices in the fallback device's
 * netns, then any hashed tunnels living in other netns.
 * Runs under RTNL (taken by ip_tunnel_delete_net()).
 *
 * NOTE(review): dev_net(itn->fb_tunnel_dev) assumes a fallback device
 * exists; ip_tunnel_init_net() leaves fb_tunnel_dev NULL when called with
 * ops == NULL — confirm callers never reach here in that configuration.
 */
static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net *net = dev_net(itn->fb_tunnel_dev);
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}
1056
1057 void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
1058 {
1059         LIST_HEAD(list);
1060
1061         rtnl_lock();
1062         ip_tunnel_destroy(itn, &list, ops);
1063         unregister_netdevice_many(&list);
1064         rtnl_unlock();
1065 }
1066 EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
1067
1068 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
1069                       struct ip_tunnel_parm *p)
1070 {
1071         struct ip_tunnel *nt;
1072         struct net *net = dev_net(dev);
1073         struct ip_tunnel_net *itn;
1074         int mtu;
1075         int err;
1076
1077         nt = netdev_priv(dev);
1078         itn = net_generic(net, nt->ip_tnl_net_id);
1079
1080         if (nt->collect_md) {
1081                 if (rtnl_dereference(itn->collect_md_tun))
1082                         return -EEXIST;
1083         } else {
1084                 if (ip_tunnel_find(itn, p, dev->type))
1085                         return -EEXIST;
1086         }
1087
1088         nt->net = net;
1089         nt->parms = *p;
1090         err = register_netdevice(dev);
1091         if (err)
1092                 goto out;
1093
1094         if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1095                 eth_hw_addr_random(dev);
1096
1097         mtu = ip_tunnel_bind_dev(dev);
1098         if (!tb[IFLA_MTU])
1099                 dev->mtu = mtu;
1100
1101         ip_tunnel_add(itn, nt);
1102 out:
1103         return err;
1104 }
1105 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1106
1107 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1108                          struct ip_tunnel_parm *p)
1109 {
1110         struct ip_tunnel *t;
1111         struct ip_tunnel *tunnel = netdev_priv(dev);
1112         struct net *net = tunnel->net;
1113         struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1114
1115         if (dev == itn->fb_tunnel_dev)
1116                 return -EINVAL;
1117
1118         t = ip_tunnel_find(itn, p, dev->type);
1119
1120         if (t) {
1121                 if (t->dev != dev)
1122                         return -EEXIST;
1123         } else {
1124                 t = tunnel;
1125
1126                 if (dev->type != ARPHRD_ETHER) {
1127                         unsigned int nflags = 0;
1128
1129                         if (ipv4_is_multicast(p->iph.daddr))
1130                                 nflags = IFF_BROADCAST;
1131                         else if (p->iph.daddr)
1132                                 nflags = IFF_POINTOPOINT;
1133
1134                         if ((dev->flags ^ nflags) &
1135                             (IFF_POINTOPOINT | IFF_BROADCAST))
1136                                 return -EINVAL;
1137                 }
1138         }
1139
1140         ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
1141         return 0;
1142 }
1143 EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1144
1145 int ip_tunnel_init(struct net_device *dev)
1146 {
1147         struct ip_tunnel *tunnel = netdev_priv(dev);
1148         struct iphdr *iph = &tunnel->parms.iph;
1149         int err;
1150
1151         dev->destructor = ip_tunnel_dev_free;
1152         dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1153         if (!dev->tstats)
1154                 return -ENOMEM;
1155
1156         err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
1157         if (err) {
1158                 free_percpu(dev->tstats);
1159                 return err;
1160         }
1161
1162         err = gro_cells_init(&tunnel->gro_cells, dev);
1163         if (err) {
1164                 dst_cache_destroy(&tunnel->dst_cache);
1165                 free_percpu(dev->tstats);
1166                 return err;
1167         }
1168
1169         tunnel->dev = dev;
1170         tunnel->net = dev_net(dev);
1171         strcpy(tunnel->parms.name, dev->name);
1172         iph->version            = 4;
1173         iph->ihl                = 5;
1174
1175         if (tunnel->collect_md) {
1176                 dev->features |= NETIF_F_NETNS_LOCAL;
1177                 netif_keep_dst(dev);
1178         }
1179         return 0;
1180 }
1181 EXPORT_SYMBOL_GPL(ip_tunnel_init);
1182
1183 void ip_tunnel_uninit(struct net_device *dev)
1184 {
1185         struct ip_tunnel *tunnel = netdev_priv(dev);
1186         struct net *net = tunnel->net;
1187         struct ip_tunnel_net *itn;
1188
1189         itn = net_generic(net, tunnel->ip_tnl_net_id);
1190         /* fb_tunnel_dev will be unregisted in net-exit call. */
1191         if (itn->fb_tunnel_dev != dev)
1192                 ip_tunnel_del(itn, netdev_priv(dev));
1193
1194         dst_cache_reset(&tunnel->dst_cache);
1195 }
1196 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1197
1198 /* Do least required initialization, rest of init is done in tunnel_init call */
1199 void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
1200 {
1201         struct ip_tunnel *tunnel = netdev_priv(dev);
1202         tunnel->ip_tnl_net_id = net_id;
1203 }
1204 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1205
1206 MODULE_LICENSE("GPL");