Merge branch 'per_signal_struct_coredumps-for-v5.16' of git://git.kernel.org/pub...
[platform/kernel/linux-rpi.git] / net / ipv4 / ip_tunnel.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (c) 2013 Nicira, Inc.
4  */
5
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7
8 #include <linux/capability.h>
9 #include <linux/module.h>
10 #include <linux/types.h>
11 #include <linux/kernel.h>
12 #include <linux/slab.h>
13 #include <linux/uaccess.h>
14 #include <linux/skbuff.h>
15 #include <linux/netdevice.h>
16 #include <linux/in.h>
17 #include <linux/tcp.h>
18 #include <linux/udp.h>
19 #include <linux/if_arp.h>
20 #include <linux/init.h>
21 #include <linux/in6.h>
22 #include <linux/inetdevice.h>
23 #include <linux/igmp.h>
24 #include <linux/netfilter_ipv4.h>
25 #include <linux/etherdevice.h>
26 #include <linux/if_ether.h>
27 #include <linux/if_vlan.h>
28 #include <linux/rculist.h>
29 #include <linux/err.h>
30
31 #include <net/sock.h>
32 #include <net/ip.h>
33 #include <net/icmp.h>
34 #include <net/protocol.h>
35 #include <net/ip_tunnels.h>
36 #include <net/arp.h>
37 #include <net/checksum.h>
38 #include <net/dsfield.h>
39 #include <net/inet_ecn.h>
40 #include <net/xfrm.h>
41 #include <net/net_namespace.h>
42 #include <net/netns/generic.h>
43 #include <net/rtnetlink.h>
44 #include <net/udp.h>
45 #include <net/dst_metadata.h>
46
47 #if IS_ENABLED(CONFIG_IPV6)
48 #include <net/ipv6.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #endif
52
53 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
54 {
55         return hash_32((__force u32)key ^ (__force u32)remote,
56                          IP_TNL_HASH_BITS);
57 }
58
59 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
60                                 __be16 flags, __be32 key)
61 {
62         if (p->i_flags & TUNNEL_KEY) {
63                 if (flags & TUNNEL_KEY)
64                         return key == p->i_key;
65                 else
66                         /* key expected, none present */
67                         return false;
68         } else
69                 return !(flags & TUNNEL_KEY);
70 }
71
72 /* Fallback tunnel: no source, no destination, no key, no options
73
74    Tunnel hash table:
75    We require exact key match i.e. if a key is present in packet
76    it will match only tunnel with the same key; if it is not present,
77    it will match only keyless tunnel.
78
79    All keyless packets, if they do not match a configured keyless tunnel,
80    will match the fallback tunnel.
81    Given src, dst and key, find the appropriate tunnel for input.
82 */
83 struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
84                                    int link, __be16 flags,
85                                    __be32 remote, __be32 local,
86                                    __be32 key)
87 {
88         struct ip_tunnel *t, *cand = NULL;
89         struct hlist_head *head;
90         struct net_device *ndev;
91         unsigned int hash;
92
93         hash = ip_tunnel_hash(key, remote);
94         head = &itn->tunnels[hash];
95
96         hlist_for_each_entry_rcu(t, head, hash_node) {
97                 if (local != t->parms.iph.saddr ||
98                     remote != t->parms.iph.daddr ||
99                     !(t->dev->flags & IFF_UP))
100                         continue;
101
102                 if (!ip_tunnel_key_match(&t->parms, flags, key))
103                         continue;
104
105                 if (t->parms.link == link)
106                         return t;
107                 else
108                         cand = t;
109         }
110
111         hlist_for_each_entry_rcu(t, head, hash_node) {
112                 if (remote != t->parms.iph.daddr ||
113                     t->parms.iph.saddr != 0 ||
114                     !(t->dev->flags & IFF_UP))
115                         continue;
116
117                 if (!ip_tunnel_key_match(&t->parms, flags, key))
118                         continue;
119
120                 if (t->parms.link == link)
121                         return t;
122                 else if (!cand)
123                         cand = t;
124         }
125
126         hash = ip_tunnel_hash(key, 0);
127         head = &itn->tunnels[hash];
128
129         hlist_for_each_entry_rcu(t, head, hash_node) {
130                 if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
131                     (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
132                         continue;
133
134                 if (!(t->dev->flags & IFF_UP))
135                         continue;
136
137                 if (!ip_tunnel_key_match(&t->parms, flags, key))
138                         continue;
139
140                 if (t->parms.link == link)
141                         return t;
142                 else if (!cand)
143                         cand = t;
144         }
145
146         hlist_for_each_entry_rcu(t, head, hash_node) {
147                 if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
148                     t->parms.iph.saddr != 0 ||
149                     t->parms.iph.daddr != 0 ||
150                     !(t->dev->flags & IFF_UP))
151                         continue;
152
153                 if (t->parms.link == link)
154                         return t;
155                 else if (!cand)
156                         cand = t;
157         }
158
159         if (cand)
160                 return cand;
161
162         t = rcu_dereference(itn->collect_md_tun);
163         if (t && t->dev->flags & IFF_UP)
164                 return t;
165
166         ndev = READ_ONCE(itn->fb_tunnel_dev);
167         if (ndev && ndev->flags & IFF_UP)
168                 return netdev_priv(ndev);
169
170         return NULL;
171 }
172 EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
173
174 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
175                                     struct ip_tunnel_parm *parms)
176 {
177         unsigned int h;
178         __be32 remote;
179         __be32 i_key = parms->i_key;
180
181         if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
182                 remote = parms->iph.daddr;
183         else
184                 remote = 0;
185
186         if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
187                 i_key = 0;
188
189         h = ip_tunnel_hash(i_key, remote);
190         return &itn->tunnels[h];
191 }
192
193 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
194 {
195         struct hlist_head *head = ip_bucket(itn, &t->parms);
196
197         if (t->collect_md)
198                 rcu_assign_pointer(itn->collect_md_tun, t);
199         hlist_add_head_rcu(&t->hash_node, head);
200 }
201
202 static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
203 {
204         if (t->collect_md)
205                 rcu_assign_pointer(itn->collect_md_tun, NULL);
206         hlist_del_init_rcu(&t->hash_node);
207 }
208
209 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
210                                         struct ip_tunnel_parm *parms,
211                                         int type)
212 {
213         __be32 remote = parms->iph.daddr;
214         __be32 local = parms->iph.saddr;
215         __be32 key = parms->i_key;
216         __be16 flags = parms->i_flags;
217         int link = parms->link;
218         struct ip_tunnel *t = NULL;
219         struct hlist_head *head = ip_bucket(itn, parms);
220
221         hlist_for_each_entry_rcu(t, head, hash_node) {
222                 if (local == t->parms.iph.saddr &&
223                     remote == t->parms.iph.daddr &&
224                     link == t->parms.link &&
225                     type == t->dev->type &&
226                     ip_tunnel_key_match(&t->parms, flags, key))
227                         break;
228         }
229         return t;
230 }
231
232 static struct net_device *__ip_tunnel_create(struct net *net,
233                                              const struct rtnl_link_ops *ops,
234                                              struct ip_tunnel_parm *parms)
235 {
236         int err;
237         struct ip_tunnel *tunnel;
238         struct net_device *dev;
239         char name[IFNAMSIZ];
240
241         err = -E2BIG;
242         if (parms->name[0]) {
243                 if (!dev_valid_name(parms->name))
244                         goto failed;
245                 strlcpy(name, parms->name, IFNAMSIZ);
246         } else {
247                 if (strlen(ops->kind) > (IFNAMSIZ - 3))
248                         goto failed;
249                 strcpy(name, ops->kind);
250                 strcat(name, "%d");
251         }
252
253         ASSERT_RTNL();
254         dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
255         if (!dev) {
256                 err = -ENOMEM;
257                 goto failed;
258         }
259         dev_net_set(dev, net);
260
261         dev->rtnl_link_ops = ops;
262
263         tunnel = netdev_priv(dev);
264         tunnel->parms = *parms;
265         tunnel->net = net;
266
267         err = register_netdevice(dev);
268         if (err)
269                 goto failed_free;
270
271         return dev;
272
273 failed_free:
274         free_netdev(dev);
275 failed:
276         return ERR_PTR(err);
277 }
278
279 static int ip_tunnel_bind_dev(struct net_device *dev)
280 {
281         struct net_device *tdev = NULL;
282         struct ip_tunnel *tunnel = netdev_priv(dev);
283         const struct iphdr *iph;
284         int hlen = LL_MAX_HEADER;
285         int mtu = ETH_DATA_LEN;
286         int t_hlen = tunnel->hlen + sizeof(struct iphdr);
287
288         iph = &tunnel->parms.iph;
289
290         /* Guess output device to choose reasonable mtu and needed_headroom */
291         if (iph->daddr) {
292                 struct flowi4 fl4;
293                 struct rtable *rt;
294
295                 ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
296                                     iph->saddr, tunnel->parms.o_key,
297                                     RT_TOS(iph->tos), tunnel->parms.link,
298                                     tunnel->fwmark, 0);
299                 rt = ip_route_output_key(tunnel->net, &fl4);
300
301                 if (!IS_ERR(rt)) {
302                         tdev = rt->dst.dev;
303                         ip_rt_put(rt);
304                 }
305                 if (dev->type != ARPHRD_ETHER)
306                         dev->flags |= IFF_POINTOPOINT;
307
308                 dst_cache_reset(&tunnel->dst_cache);
309         }
310
311         if (!tdev && tunnel->parms.link)
312                 tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
313
314         if (tdev) {
315                 hlen = tdev->hard_header_len + tdev->needed_headroom;
316                 mtu = min(tdev->mtu, IP_MAX_MTU);
317         }
318
319         dev->needed_headroom = t_hlen + hlen;
320         mtu -= t_hlen + (dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0);
321
322         if (mtu < IPV4_MIN_MTU)
323                 mtu = IPV4_MIN_MTU;
324
325         return mtu;
326 }
327
328 static struct ip_tunnel *ip_tunnel_create(struct net *net,
329                                           struct ip_tunnel_net *itn,
330                                           struct ip_tunnel_parm *parms)
331 {
332         struct ip_tunnel *nt;
333         struct net_device *dev;
334         int t_hlen;
335         int mtu;
336         int err;
337
338         dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
339         if (IS_ERR(dev))
340                 return ERR_CAST(dev);
341
342         mtu = ip_tunnel_bind_dev(dev);
343         err = dev_set_mtu(dev, mtu);
344         if (err)
345                 goto err_dev_set_mtu;
346
347         nt = netdev_priv(dev);
348         t_hlen = nt->hlen + sizeof(struct iphdr);
349         dev->min_mtu = ETH_MIN_MTU;
350         dev->max_mtu = IP_MAX_MTU - t_hlen;
351         if (dev->type == ARPHRD_ETHER)
352                 dev->max_mtu -= dev->hard_header_len;
353
354         ip_tunnel_add(itn, nt);
355         return nt;
356
357 err_dev_set_mtu:
358         unregister_netdevice(dev);
359         return ERR_PTR(err);
360 }
361
362 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
363                   const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
364                   bool log_ecn_error)
365 {
366         const struct iphdr *iph = ip_hdr(skb);
367         int err;
368
369 #ifdef CONFIG_NET_IPGRE_BROADCAST
370         if (ipv4_is_multicast(iph->daddr)) {
371                 tunnel->dev->stats.multicast++;
372                 skb->pkt_type = PACKET_BROADCAST;
373         }
374 #endif
375
376         if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
377              ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
378                 tunnel->dev->stats.rx_crc_errors++;
379                 tunnel->dev->stats.rx_errors++;
380                 goto drop;
381         }
382
383         if (tunnel->parms.i_flags&TUNNEL_SEQ) {
384                 if (!(tpi->flags&TUNNEL_SEQ) ||
385                     (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
386                         tunnel->dev->stats.rx_fifo_errors++;
387                         tunnel->dev->stats.rx_errors++;
388                         goto drop;
389                 }
390                 tunnel->i_seqno = ntohl(tpi->seq) + 1;
391         }
392
393         skb_set_network_header(skb, (tunnel->dev->type == ARPHRD_ETHER) ? ETH_HLEN : 0);
394
395         err = IP_ECN_decapsulate(iph, skb);
396         if (unlikely(err)) {
397                 if (log_ecn_error)
398                         net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
399                                         &iph->saddr, iph->tos);
400                 if (err > 1) {
401                         ++tunnel->dev->stats.rx_frame_errors;
402                         ++tunnel->dev->stats.rx_errors;
403                         goto drop;
404                 }
405         }
406
407         dev_sw_netstats_rx_add(tunnel->dev, skb->len);
408         skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
409
410         if (tunnel->dev->type == ARPHRD_ETHER) {
411                 skb->protocol = eth_type_trans(skb, tunnel->dev);
412                 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
413         } else {
414                 skb->dev = tunnel->dev;
415         }
416
417         if (tun_dst)
418                 skb_dst_set(skb, (struct dst_entry *)tun_dst);
419
420         gro_cells_receive(&tunnel->gro_cells, skb);
421         return 0;
422
423 drop:
424         if (tun_dst)
425                 dst_release((struct dst_entry *)tun_dst);
426         kfree_skb(skb);
427         return 0;
428 }
429 EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
430
431 int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
432                             unsigned int num)
433 {
434         if (num >= MAX_IPTUN_ENCAP_OPS)
435                 return -ERANGE;
436
437         return !cmpxchg((const struct ip_tunnel_encap_ops **)
438                         &iptun_encaps[num],
439                         NULL, ops) ? 0 : -1;
440 }
441 EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
442
443 int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
444                             unsigned int num)
445 {
446         int ret;
447
448         if (num >= MAX_IPTUN_ENCAP_OPS)
449                 return -ERANGE;
450
451         ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
452                        &iptun_encaps[num],
453                        ops, NULL) == ops) ? 0 : -1;
454
455         synchronize_net();
456
457         return ret;
458 }
459 EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
460
461 int ip_tunnel_encap_setup(struct ip_tunnel *t,
462                           struct ip_tunnel_encap *ipencap)
463 {
464         int hlen;
465
466         memset(&t->encap, 0, sizeof(t->encap));
467
468         hlen = ip_encap_hlen(ipencap);
469         if (hlen < 0)
470                 return hlen;
471
472         t->encap.type = ipencap->type;
473         t->encap.sport = ipencap->sport;
474         t->encap.dport = ipencap->dport;
475         t->encap.flags = ipencap->flags;
476
477         t->encap_hlen = hlen;
478         t->hlen = t->encap_hlen + t->tun_hlen;
479
480         return 0;
481 }
482 EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
483
484 static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
485                             struct rtable *rt, __be16 df,
486                             const struct iphdr *inner_iph,
487                             int tunnel_hlen, __be32 dst, bool md)
488 {
489         struct ip_tunnel *tunnel = netdev_priv(dev);
490         int pkt_size;
491         int mtu;
492
493         tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
494         pkt_size = skb->len - tunnel_hlen;
495         pkt_size -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;
496
497         if (df) {
498                 mtu = dst_mtu(&rt->dst) - (sizeof(struct iphdr) + tunnel_hlen);
499                 mtu -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;
500         } else {
501                 mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
502         }
503
504         if (skb_valid_dst(skb))
505                 skb_dst_update_pmtu_no_confirm(skb, mtu);
506
507         if (skb->protocol == htons(ETH_P_IP)) {
508                 if (!skb_is_gso(skb) &&
509                     (inner_iph->frag_off & htons(IP_DF)) &&
510                     mtu < pkt_size) {
511                         icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
512                         return -E2BIG;
513                 }
514         }
515 #if IS_ENABLED(CONFIG_IPV6)
516         else if (skb->protocol == htons(ETH_P_IPV6)) {
517                 struct rt6_info *rt6;
518                 __be32 daddr;
519
520                 rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) :
521                                            NULL;
522                 daddr = md ? dst : tunnel->parms.iph.daddr;
523
524                 if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
525                            mtu >= IPV6_MIN_MTU) {
526                         if ((daddr && !ipv4_is_multicast(daddr)) ||
527                             rt6->rt6i_dst.plen == 128) {
528                                 rt6->rt6i_flags |= RTF_MODIFIED;
529                                 dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
530                         }
531                 }
532
533                 if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
534                                         mtu < pkt_size) {
535                         icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
536                         return -E2BIG;
537                 }
538         }
539 #endif
540         return 0;
541 }
542
/*
 * ip_md_tunnel_xmit - transmit @skb using the tunnel metadata attached to it.
 *
 * The outer IPv4 header parameters (addresses, TOS, TTL, DF, tunnel id) come
 * from the ip_tunnel_info returned by skb_tunnel_info(), not from the tunnel
 * device configuration.  @proto is the outer IP protocol and @tunnel_hlen is
 * the tunnel header length passed on to tnl_update_pmtu().
 *
 * The skb is always consumed: it is either handed to iptunnel_xmit() or
 * freed, with tx_errors or tx_dropped incremented on @dev.
 */
543 void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
544                        u8 proto, int tunnel_hlen)
545 {
546         struct ip_tunnel *tunnel = netdev_priv(dev);
547         u32 headroom = sizeof(struct iphdr);
548         struct ip_tunnel_info *tun_info;
549         const struct ip_tunnel_key *key;
550         const struct iphdr *inner_iph;
551         struct rtable *rt = NULL;
552         struct flowi4 fl4;
553         __be16 df = 0;
554         u8 tos, ttl;
555         bool use_cache;
556
        /* Only TX-mode IPv4 tunnel metadata is accepted. */
557         tun_info = skb_tunnel_info(skb);
558         if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
559                      ip_tunnel_info_af(tun_info) != AF_INET))
560                 goto tx_error;
561         key = &tun_info->key;
562         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
563         inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
564         tos = key->tos;
        /* A metadata TOS of 1 means: take the TOS from the inner header. */
565         if (tos == 1) {
566                 if (skb->protocol == htons(ETH_P_IP))
567                         tos = inner_iph->tos;
568                 else if (skb->protocol == htons(ETH_P_IPV6))
569                         tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
570         }
571         ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
572                             tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
573                             0, skb->mark, skb_get_hash(skb));
        /* This path rejects tunnels that have an encapsulation type set. */
574         if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
575                 goto tx_error;
576
        /* Try the dst cache kept in the metadata before a route lookup. */
577         use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
578         if (use_cache)
579                 rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
580         if (!rt) {
581                 rt = ip_route_output_key(tunnel->net, &fl4);
582                 if (IS_ERR(rt)) {
583                         dev->stats.tx_carrier_errors++;
584                         goto tx_error;
585                 }
586                 if (use_cache)
587                         dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
588                                           fl4.saddr);
589         }
        /* The route leads back to this tunnel device: count a collision. */
590         if (rt->dst.dev == dev) {
591                 ip_rt_put(rt);
592                 dev->stats.collisions++;
593                 goto tx_error;
594         }
595
596         if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
597                 df = htons(IP_DF);
        /* Non-zero return: do not transmit; release the route and drop. */
598         if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
599                             key->u.ipv4.dst, true)) {
600                 ip_rt_put(rt);
601                 goto tx_error;
602         }
603
604         tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
605         ttl = key->ttl;
        /* TTL 0 in the metadata: inherit from the inner header, or use the
         * route's hop limit for other protocols.
         */
606         if (ttl == 0) {
607                 if (skb->protocol == htons(ETH_P_IP))
608                         ttl = inner_iph->ttl;
609                 else if (skb->protocol == htons(ETH_P_IPV6))
610                         ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
611                 else
612                         ttl = ip4_dst_hoplimit(&rt->dst);
613         }
614
        /* Grow (never shrink) the device's needed_headroom to fit this path. */
615         headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
616         if (headroom > dev->needed_headroom)
617                 dev->needed_headroom = headroom;
618
619         if (skb_cow_head(skb, dev->needed_headroom)) {
620                 ip_rt_put(rt);
621                 goto tx_dropped;
622         }
623         iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
624                       df, !net_eq(tunnel->net, dev_net(dev)));
625         return;
626 tx_error:
627         dev->stats.tx_errors++;
628         goto kfree;
629 tx_dropped:
630         dev->stats.tx_dropped++;
631 kfree:
632         kfree_skb(skb);
633 }
634 EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
635
/*
 * ip_tunnel_xmit - transmit @skb through tunnel device @dev.
 *
 * @tnl_params supplies the outer IPv4 header template (addresses, TOS, TTL,
 * frag_off) and @protocol the outer IP protocol.  When the template has no
 * destination address the destination is derived per packet (see the NBMA
 * branch below).
 *
 * The skb is always consumed: it is either handed to iptunnel_xmit() or
 * freed, with tx_errors or tx_dropped incremented on @dev.
 */
636 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
637                     const struct iphdr *tnl_params, u8 protocol)
638 {
639         struct ip_tunnel *tunnel = netdev_priv(dev);
640         struct ip_tunnel_info *tun_info = NULL;
641         const struct iphdr *inner_iph;
642         unsigned int max_headroom;      /* The extra header space needed */
643         struct rtable *rt = NULL;               /* Route to the other host */
644         bool use_cache = false;
645         struct flowi4 fl4;
646         bool md = false;
647         bool connected;
648         u8 tos, ttl;
649         __be32 dst;
650         __be16 df;
651
652         inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
        /* "connected" gates use of the per-tunnel dst cache further down. */
653         connected = (tunnel->parms.iph.daddr != 0);
654
655         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
656
657         dst = tnl_params->daddr;
658         if (dst == 0) {
659                 /* NBMA tunnel */
660
661                 if (!skb_dst(skb)) {
662                         dev->stats.tx_fifo_errors++;
663                         goto tx_error;
664                 }
665
                /* Destination source, in order of preference: TX tunnel
                 * metadata with an IPv4 destination, the next hop of the
                 * inner IPv4 route, or an IPv4-compatible address taken
                 * from the inner IPv6 neighbour entry or destination.
                 */
666                 tun_info = skb_tunnel_info(skb);
667                 if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
668                     ip_tunnel_info_af(tun_info) == AF_INET &&
669                     tun_info->key.u.ipv4.dst) {
670                         dst = tun_info->key.u.ipv4.dst;
671                         md = true;
672                         connected = true;
673                 }
674                 else if (skb->protocol == htons(ETH_P_IP)) {
675                         rt = skb_rtable(skb);
676                         dst = rt_nexthop(rt, inner_iph->daddr);
677                 }
678 #if IS_ENABLED(CONFIG_IPV6)
679                 else if (skb->protocol == htons(ETH_P_IPV6)) {
680                         const struct in6_addr *addr6;
681                         struct neighbour *neigh;
682                         bool do_tx_error_icmp;
683                         int addr_type;
684
685                         neigh = dst_neigh_lookup(skb_dst(skb),
686                                                  &ipv6_hdr(skb)->daddr);
687                         if (!neigh)
688                                 goto tx_error;
689
690                         addr6 = (const struct in6_addr *)&neigh->primary_key;
691                         addr_type = ipv6_addr_type(addr6);
692
                        /* Unspecified neighbour key: fall back to the
                         * packet's own IPv6 destination address.
                         */
693                         if (addr_type == IPV6_ADDR_ANY) {
694                                 addr6 = &ipv6_hdr(skb)->daddr;
695                                 addr_type = ipv6_addr_type(addr6);
696                         }
697
698                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
699                                 do_tx_error_icmp = true;
700                         else {
701                                 do_tx_error_icmp = false;
702                                 dst = addr6->s6_addr32[3];
703                         }
                        /* The neighbour reference is dropped before any
                         * jump to the error path.
                         */
704                         neigh_release(neigh);
705                         if (do_tx_error_icmp)
706                                 goto tx_error_icmp;
707                 }
708 #endif
709                 else
710                         goto tx_error;
711
712                 if (!md)
713                         connected = false;
714         }
715
716         tos = tnl_params->tos;
        /* Low bit set in the configured TOS: inherit the TOS from the inner
         * header.  "connected" is cleared in that case, so the per-tunnel
         * dst cache is not used for this packet.
         */
717         if (tos & 0x1) {
718                 tos &= ~0x1;
719                 if (skb->protocol == htons(ETH_P_IP)) {
720                         tos = inner_iph->tos;
721                         connected = false;
722                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
723                         tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
724                         connected = false;
725                 }
726         }
727
728         ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
729                             tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
730                             tunnel->fwmark, skb_get_hash(skb));
731
732         if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
733                 goto tx_error;
734
        /* Route cache selection: the metadata's cache for metadata-driven
         * packets, the tunnel's own cache for connected tunnels, and no
         * cache otherwise.  This also discards any rt value assigned in
         * the NBMA branch above when md is false.
         */
735         if (connected && md) {
736                 use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
737                 if (use_cache)
738                         rt = dst_cache_get_ip4(&tun_info->dst_cache,
739                                                &fl4.saddr);
740         } else {
741                 rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
742                                                 &fl4.saddr) : NULL;
743         }
744
745         if (!rt) {
746                 rt = ip_route_output_key(tunnel->net, &fl4);
747
748                 if (IS_ERR(rt)) {
749                         dev->stats.tx_carrier_errors++;
750                         goto tx_error;
751                 }
752                 if (use_cache)
753                         dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
754                                           fl4.saddr);
755                 else if (!md && connected)
756                         dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
757                                           fl4.saddr);
758         }
759
        /* The route leads back to this tunnel device: count a collision. */
760         if (rt->dst.dev == dev) {
761                 ip_rt_put(rt);
762                 dev->stats.collisions++;
763                 goto tx_error;
764         }
765
        /* DF comes from the template; the inner IPv4 DF bit is added unless
         * the tunnel has ignore_df set.
         */
766         df = tnl_params->frag_off;
767         if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
768                 df |= (inner_iph->frag_off & htons(IP_DF));
769
770         if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) {
771                 ip_rt_put(rt);
772                 goto tx_error;
773         }
774
        /* While errors are pending and still within IPTUNNEL_ERR_TIMEO of
         * err_time, report a link failure for this packet and consume one
         * pending error; after the timeout the counter is reset.
         */
775         if (tunnel->err_count > 0) {
776                 if (time_before(jiffies,
777                                 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
778                         tunnel->err_count--;
779
780                         dst_link_failure(skb);
781                 } else
782                         tunnel->err_count = 0;
783         }
784
785         tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
786         ttl = tnl_params->ttl;
        /* TTL 0 in the template: inherit from the inner header, or use the
         * route's hop limit for other protocols.
         */
787         if (ttl == 0) {
788                 if (skb->protocol == htons(ETH_P_IP))
789                         ttl = inner_iph->ttl;
790 #if IS_ENABLED(CONFIG_IPV6)
791                 else if (skb->protocol == htons(ETH_P_IPV6))
792                         ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
793 #endif
794                 else
795                         ttl = ip4_dst_hoplimit(&rt->dst);
796         }
797
        /* Grow (never shrink) the device's needed_headroom to fit this path. */
798         max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
799                         + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
800         if (max_headroom > dev->needed_headroom)
801                 dev->needed_headroom = max_headroom;
802
803         if (skb_cow_head(skb, dev->needed_headroom)) {
804                 ip_rt_put(rt);
805                 dev->stats.tx_dropped++;
806                 kfree_skb(skb);
807                 return;
808         }
809
810         iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
811                       df, !net_eq(tunnel->net, dev_net(dev)));
812         return;
813
814 #if IS_ENABLED(CONFIG_IPV6)
815 tx_error_icmp:
816         dst_link_failure(skb);
817 #endif
818 tx_error:
819         dev->stats.tx_errors++;
820         kfree_skb(skb);
821 }
822 EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
823
824 static void ip_tunnel_update(struct ip_tunnel_net *itn,
825                              struct ip_tunnel *t,
826                              struct net_device *dev,
827                              struct ip_tunnel_parm *p,
828                              bool set_mtu,
829                              __u32 fwmark)
830 {
831         ip_tunnel_del(itn, t);
832         t->parms.iph.saddr = p->iph.saddr;
833         t->parms.iph.daddr = p->iph.daddr;
834         t->parms.i_key = p->i_key;
835         t->parms.o_key = p->o_key;
836         if (dev->type != ARPHRD_ETHER) {
837                 __dev_addr_set(dev, &p->iph.saddr, 4);
838                 memcpy(dev->broadcast, &p->iph.daddr, 4);
839         }
840         ip_tunnel_add(itn, t);
841
842         t->parms.iph.ttl = p->iph.ttl;
843         t->parms.iph.tos = p->iph.tos;
844         t->parms.iph.frag_off = p->iph.frag_off;
845
846         if (t->parms.link != p->link || t->fwmark != fwmark) {
847                 int mtu;
848
849                 t->parms.link = p->link;
850                 t->fwmark = fwmark;
851                 mtu = ip_tunnel_bind_dev(dev);
852                 if (set_mtu)
853                         dev->mtu = mtu;
854         }
855         dst_cache_reset(&t->dst_cache);
856         netdev_state_change(dev);
857 }
858
859 int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
860 {
861         int err = 0;
862         struct ip_tunnel *t = netdev_priv(dev);
863         struct net *net = t->net;
864         struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
865
866         switch (cmd) {
867         case SIOCGETTUNNEL:
868                 if (dev == itn->fb_tunnel_dev) {
869                         t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
870                         if (!t)
871                                 t = netdev_priv(dev);
872                 }
873                 memcpy(p, &t->parms, sizeof(*p));
874                 break;
875
876         case SIOCADDTUNNEL:
877         case SIOCCHGTUNNEL:
878                 err = -EPERM;
879                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
880                         goto done;
881                 if (p->iph.ttl)
882                         p->iph.frag_off |= htons(IP_DF);
883                 if (!(p->i_flags & VTI_ISVTI)) {
884                         if (!(p->i_flags & TUNNEL_KEY))
885                                 p->i_key = 0;
886                         if (!(p->o_flags & TUNNEL_KEY))
887                                 p->o_key = 0;
888                 }
889
890                 t = ip_tunnel_find(itn, p, itn->type);
891
892                 if (cmd == SIOCADDTUNNEL) {
893                         if (!t) {
894                                 t = ip_tunnel_create(net, itn, p);
895                                 err = PTR_ERR_OR_ZERO(t);
896                                 break;
897                         }
898
899                         err = -EEXIST;
900                         break;
901                 }
902                 if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
903                         if (t) {
904                                 if (t->dev != dev) {
905                                         err = -EEXIST;
906                                         break;
907                                 }
908                         } else {
909                                 unsigned int nflags = 0;
910
911                                 if (ipv4_is_multicast(p->iph.daddr))
912                                         nflags = IFF_BROADCAST;
913                                 else if (p->iph.daddr)
914                                         nflags = IFF_POINTOPOINT;
915
916                                 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
917                                         err = -EINVAL;
918                                         break;
919                                 }
920
921                                 t = netdev_priv(dev);
922                         }
923                 }
924
925                 if (t) {
926                         err = 0;
927                         ip_tunnel_update(itn, t, dev, p, true, 0);
928                 } else {
929                         err = -ENOENT;
930                 }
931                 break;
932
933         case SIOCDELTUNNEL:
934                 err = -EPERM;
935                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
936                         goto done;
937
938                 if (dev == itn->fb_tunnel_dev) {
939                         err = -ENOENT;
940                         t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
941                         if (!t)
942                                 goto done;
943                         err = -EPERM;
944                         if (t == netdev_priv(itn->fb_tunnel_dev))
945                                 goto done;
946                         dev = t->dev;
947                 }
948                 unregister_netdevice(dev);
949                 err = 0;
950                 break;
951
952         default:
953                 err = -EINVAL;
954         }
955
956 done:
957         return err;
958 }
959 EXPORT_SYMBOL_GPL(ip_tunnel_ctl);
960
961 int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
962                              void __user *data, int cmd)
963 {
964         struct ip_tunnel_parm p;
965         int err;
966
967         if (copy_from_user(&p, data, sizeof(p)))
968                 return -EFAULT;
969         err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd);
970         if (!err && copy_to_user(data, &p, sizeof(p)))
971                 return -EFAULT;
972         return err;
973 }
974 EXPORT_SYMBOL_GPL(ip_tunnel_siocdevprivate);
975
976 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
977 {
978         struct ip_tunnel *tunnel = netdev_priv(dev);
979         int t_hlen = tunnel->hlen + sizeof(struct iphdr);
980         int max_mtu = IP_MAX_MTU - t_hlen;
981
982         if (dev->type == ARPHRD_ETHER)
983                 max_mtu -= dev->hard_header_len;
984
985         if (new_mtu < ETH_MIN_MTU)
986                 return -EINVAL;
987
988         if (new_mtu > max_mtu) {
989                 if (strict)
990                         return -EINVAL;
991
992                 new_mtu = max_mtu;
993         }
994
995         dev->mtu = new_mtu;
996         return 0;
997 }
998 EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
999
1000 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1001 {
1002         return __ip_tunnel_change_mtu(dev, new_mtu, true);
1003 }
1004 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
1005
1006 static void ip_tunnel_dev_free(struct net_device *dev)
1007 {
1008         struct ip_tunnel *tunnel = netdev_priv(dev);
1009
1010         gro_cells_destroy(&tunnel->gro_cells);
1011         dst_cache_destroy(&tunnel->dst_cache);
1012         free_percpu(dev->tstats);
1013 }
1014
1015 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
1016 {
1017         struct ip_tunnel *tunnel = netdev_priv(dev);
1018         struct ip_tunnel_net *itn;
1019
1020         itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
1021
1022         if (itn->fb_tunnel_dev != dev) {
1023                 ip_tunnel_del(itn, netdev_priv(dev));
1024                 unregister_netdevice_queue(dev, head);
1025         }
1026 }
1027 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
1028
1029 struct net *ip_tunnel_get_link_net(const struct net_device *dev)
1030 {
1031         struct ip_tunnel *tunnel = netdev_priv(dev);
1032
1033         return tunnel->net;
1034 }
1035 EXPORT_SYMBOL(ip_tunnel_get_link_net);
1036
1037 int ip_tunnel_get_iflink(const struct net_device *dev)
1038 {
1039         struct ip_tunnel *tunnel = netdev_priv(dev);
1040
1041         return tunnel->parms.link;
1042 }
1043 EXPORT_SYMBOL(ip_tunnel_get_iflink);
1044
1045 int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
1046                                   struct rtnl_link_ops *ops, char *devname)
1047 {
1048         struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
1049         struct ip_tunnel_parm parms;
1050         unsigned int i;
1051
1052         itn->rtnl_link_ops = ops;
1053         for (i = 0; i < IP_TNL_HASH_SIZE; i++)
1054                 INIT_HLIST_HEAD(&itn->tunnels[i]);
1055
1056         if (!ops || !net_has_fallback_tunnels(net)) {
1057                 struct ip_tunnel_net *it_init_net;
1058
1059                 it_init_net = net_generic(&init_net, ip_tnl_net_id);
1060                 itn->type = it_init_net->type;
1061                 itn->fb_tunnel_dev = NULL;
1062                 return 0;
1063         }
1064
1065         memset(&parms, 0, sizeof(parms));
1066         if (devname)
1067                 strlcpy(parms.name, devname, IFNAMSIZ);
1068
1069         rtnl_lock();
1070         itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
1071         /* FB netdevice is special: we have one, and only one per netns.
1072          * Allowing to move it to another netns is clearly unsafe.
1073          */
1074         if (!IS_ERR(itn->fb_tunnel_dev)) {
1075                 itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
1076                 itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
1077                 ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
1078                 itn->type = itn->fb_tunnel_dev->type;
1079         }
1080         rtnl_unlock();
1081
1082         return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
1083 }
1084 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
1085
1086 static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
1087                               struct list_head *head,
1088                               struct rtnl_link_ops *ops)
1089 {
1090         struct net_device *dev, *aux;
1091         int h;
1092
1093         for_each_netdev_safe(net, dev, aux)
1094                 if (dev->rtnl_link_ops == ops)
1095                         unregister_netdevice_queue(dev, head);
1096
1097         for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
1098                 struct ip_tunnel *t;
1099                 struct hlist_node *n;
1100                 struct hlist_head *thead = &itn->tunnels[h];
1101
1102                 hlist_for_each_entry_safe(t, n, thead, hash_node)
1103                         /* If dev is in the same netns, it has already
1104                          * been added to the list by the previous loop.
1105                          */
1106                         if (!net_eq(dev_net(t->dev), net))
1107                                 unregister_netdevice_queue(t->dev, head);
1108         }
1109 }
1110
1111 void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
1112                            struct rtnl_link_ops *ops)
1113 {
1114         struct ip_tunnel_net *itn;
1115         struct net *net;
1116         LIST_HEAD(list);
1117
1118         rtnl_lock();
1119         list_for_each_entry(net, net_list, exit_list) {
1120                 itn = net_generic(net, id);
1121                 ip_tunnel_destroy(net, itn, &list, ops);
1122         }
1123         unregister_netdevice_many(&list);
1124         rtnl_unlock();
1125 }
1126 EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);
1127
1128 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
1129                       struct ip_tunnel_parm *p, __u32 fwmark)
1130 {
1131         struct ip_tunnel *nt;
1132         struct net *net = dev_net(dev);
1133         struct ip_tunnel_net *itn;
1134         int mtu;
1135         int err;
1136
1137         nt = netdev_priv(dev);
1138         itn = net_generic(net, nt->ip_tnl_net_id);
1139
1140         if (nt->collect_md) {
1141                 if (rtnl_dereference(itn->collect_md_tun))
1142                         return -EEXIST;
1143         } else {
1144                 if (ip_tunnel_find(itn, p, dev->type))
1145                         return -EEXIST;
1146         }
1147
1148         nt->net = net;
1149         nt->parms = *p;
1150         nt->fwmark = fwmark;
1151         err = register_netdevice(dev);
1152         if (err)
1153                 goto err_register_netdevice;
1154
1155         if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1156                 eth_hw_addr_random(dev);
1157
1158         mtu = ip_tunnel_bind_dev(dev);
1159         if (tb[IFLA_MTU]) {
1160                 unsigned int max = IP_MAX_MTU - (nt->hlen + sizeof(struct iphdr));
1161
1162                 if (dev->type == ARPHRD_ETHER)
1163                         max -= dev->hard_header_len;
1164
1165                 mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, max);
1166         }
1167
1168         err = dev_set_mtu(dev, mtu);
1169         if (err)
1170                 goto err_dev_set_mtu;
1171
1172         ip_tunnel_add(itn, nt);
1173         return 0;
1174
1175 err_dev_set_mtu:
1176         unregister_netdevice(dev);
1177 err_register_netdevice:
1178         return err;
1179 }
1180 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1181
1182 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1183                          struct ip_tunnel_parm *p, __u32 fwmark)
1184 {
1185         struct ip_tunnel *t;
1186         struct ip_tunnel *tunnel = netdev_priv(dev);
1187         struct net *net = tunnel->net;
1188         struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1189
1190         if (dev == itn->fb_tunnel_dev)
1191                 return -EINVAL;
1192
1193         t = ip_tunnel_find(itn, p, dev->type);
1194
1195         if (t) {
1196                 if (t->dev != dev)
1197                         return -EEXIST;
1198         } else {
1199                 t = tunnel;
1200
1201                 if (dev->type != ARPHRD_ETHER) {
1202                         unsigned int nflags = 0;
1203
1204                         if (ipv4_is_multicast(p->iph.daddr))
1205                                 nflags = IFF_BROADCAST;
1206                         else if (p->iph.daddr)
1207                                 nflags = IFF_POINTOPOINT;
1208
1209                         if ((dev->flags ^ nflags) &
1210                             (IFF_POINTOPOINT | IFF_BROADCAST))
1211                                 return -EINVAL;
1212                 }
1213         }
1214
1215         ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
1216         return 0;
1217 }
1218 EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1219
1220 int ip_tunnel_init(struct net_device *dev)
1221 {
1222         struct ip_tunnel *tunnel = netdev_priv(dev);
1223         struct iphdr *iph = &tunnel->parms.iph;
1224         int err;
1225
1226         dev->needs_free_netdev = true;
1227         dev->priv_destructor = ip_tunnel_dev_free;
1228         dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1229         if (!dev->tstats)
1230                 return -ENOMEM;
1231
1232         err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
1233         if (err) {
1234                 free_percpu(dev->tstats);
1235                 return err;
1236         }
1237
1238         err = gro_cells_init(&tunnel->gro_cells, dev);
1239         if (err) {
1240                 dst_cache_destroy(&tunnel->dst_cache);
1241                 free_percpu(dev->tstats);
1242                 return err;
1243         }
1244
1245         tunnel->dev = dev;
1246         tunnel->net = dev_net(dev);
1247         strcpy(tunnel->parms.name, dev->name);
1248         iph->version            = 4;
1249         iph->ihl                = 5;
1250
1251         if (tunnel->collect_md)
1252                 netif_keep_dst(dev);
1253         return 0;
1254 }
1255 EXPORT_SYMBOL_GPL(ip_tunnel_init);
1256
1257 void ip_tunnel_uninit(struct net_device *dev)
1258 {
1259         struct ip_tunnel *tunnel = netdev_priv(dev);
1260         struct net *net = tunnel->net;
1261         struct ip_tunnel_net *itn;
1262
1263         itn = net_generic(net, tunnel->ip_tnl_net_id);
1264         ip_tunnel_del(itn, netdev_priv(dev));
1265         if (itn->fb_tunnel_dev == dev)
1266                 WRITE_ONCE(itn->fb_tunnel_dev, NULL);
1267
1268         dst_cache_reset(&tunnel->dst_cache);
1269 }
1270 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1271
1272 /* Do least required initialization, rest of init is done in tunnel_init call */
1273 void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
1274 {
1275         struct ip_tunnel *tunnel = netdev_priv(dev);
1276         tunnel->ip_tnl_net_id = net_id;
1277 }
1278 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1279
1280 MODULE_LICENSE("GPL");