Merge remote-tracking branch 'regulator/for-5.20' into regulator-6.0
[platform/kernel/linux-starfive.git] / net / ipv4 / ip_tunnel.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (c) 2013 Nicira, Inc.
4  */
5
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7
8 #include <linux/capability.h>
9 #include <linux/module.h>
10 #include <linux/types.h>
11 #include <linux/kernel.h>
12 #include <linux/slab.h>
13 #include <linux/uaccess.h>
14 #include <linux/skbuff.h>
15 #include <linux/netdevice.h>
16 #include <linux/in.h>
17 #include <linux/tcp.h>
18 #include <linux/udp.h>
19 #include <linux/if_arp.h>
20 #include <linux/init.h>
21 #include <linux/in6.h>
22 #include <linux/inetdevice.h>
23 #include <linux/igmp.h>
24 #include <linux/netfilter_ipv4.h>
25 #include <linux/etherdevice.h>
26 #include <linux/if_ether.h>
27 #include <linux/if_vlan.h>
28 #include <linux/rculist.h>
29 #include <linux/err.h>
30
31 #include <net/sock.h>
32 #include <net/ip.h>
33 #include <net/icmp.h>
34 #include <net/protocol.h>
35 #include <net/ip_tunnels.h>
36 #include <net/arp.h>
37 #include <net/checksum.h>
38 #include <net/dsfield.h>
39 #include <net/inet_ecn.h>
40 #include <net/xfrm.h>
41 #include <net/net_namespace.h>
42 #include <net/netns/generic.h>
43 #include <net/rtnetlink.h>
44 #include <net/udp.h>
45 #include <net/dst_metadata.h>
46
47 #if IS_ENABLED(CONFIG_IPV6)
48 #include <net/ipv6.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #endif
52
53 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
54 {
55         return hash_32((__force u32)key ^ (__force u32)remote,
56                          IP_TNL_HASH_BITS);
57 }
58
59 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
60                                 __be16 flags, __be32 key)
61 {
62         if (p->i_flags & TUNNEL_KEY) {
63                 if (flags & TUNNEL_KEY)
64                         return key == p->i_key;
65                 else
66                         /* key expected, none present */
67                         return false;
68         } else
69                 return !(flags & TUNNEL_KEY);
70 }
71
72 /* Fallback tunnel: no source, no destination, no key, no options
73
74    Tunnel hash table:
75    We require exact key match i.e. if a key is present in packet
76    it will match only tunnel with the same key; if it is not present,
77    it will match only keyless tunnel.
78
79    All keysless packets, if not matched configured keyless tunnels
80    will match fallback tunnel.
81    Given src, dst and key, find appropriate for input tunnel.
82 */
83 struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
84                                    int link, __be16 flags,
85                                    __be32 remote, __be32 local,
86                                    __be32 key)
87 {
88         struct ip_tunnel *t, *cand = NULL;
89         struct hlist_head *head;
90         struct net_device *ndev;
91         unsigned int hash;
92
93         hash = ip_tunnel_hash(key, remote);
94         head = &itn->tunnels[hash];
95
96         hlist_for_each_entry_rcu(t, head, hash_node) {
97                 if (local != t->parms.iph.saddr ||
98                     remote != t->parms.iph.daddr ||
99                     !(t->dev->flags & IFF_UP))
100                         continue;
101
102                 if (!ip_tunnel_key_match(&t->parms, flags, key))
103                         continue;
104
105                 if (t->parms.link == link)
106                         return t;
107                 else
108                         cand = t;
109         }
110
111         hlist_for_each_entry_rcu(t, head, hash_node) {
112                 if (remote != t->parms.iph.daddr ||
113                     t->parms.iph.saddr != 0 ||
114                     !(t->dev->flags & IFF_UP))
115                         continue;
116
117                 if (!ip_tunnel_key_match(&t->parms, flags, key))
118                         continue;
119
120                 if (t->parms.link == link)
121                         return t;
122                 else if (!cand)
123                         cand = t;
124         }
125
126         hash = ip_tunnel_hash(key, 0);
127         head = &itn->tunnels[hash];
128
129         hlist_for_each_entry_rcu(t, head, hash_node) {
130                 if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
131                     (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
132                         continue;
133
134                 if (!(t->dev->flags & IFF_UP))
135                         continue;
136
137                 if (!ip_tunnel_key_match(&t->parms, flags, key))
138                         continue;
139
140                 if (t->parms.link == link)
141                         return t;
142                 else if (!cand)
143                         cand = t;
144         }
145
146         hlist_for_each_entry_rcu(t, head, hash_node) {
147                 if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
148                     t->parms.iph.saddr != 0 ||
149                     t->parms.iph.daddr != 0 ||
150                     !(t->dev->flags & IFF_UP))
151                         continue;
152
153                 if (t->parms.link == link)
154                         return t;
155                 else if (!cand)
156                         cand = t;
157         }
158
159         if (cand)
160                 return cand;
161
162         t = rcu_dereference(itn->collect_md_tun);
163         if (t && t->dev->flags & IFF_UP)
164                 return t;
165
166         ndev = READ_ONCE(itn->fb_tunnel_dev);
167         if (ndev && ndev->flags & IFF_UP)
168                 return netdev_priv(ndev);
169
170         return NULL;
171 }
172 EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
173
174 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
175                                     struct ip_tunnel_parm *parms)
176 {
177         unsigned int h;
178         __be32 remote;
179         __be32 i_key = parms->i_key;
180
181         if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
182                 remote = parms->iph.daddr;
183         else
184                 remote = 0;
185
186         if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
187                 i_key = 0;
188
189         h = ip_tunnel_hash(i_key, remote);
190         return &itn->tunnels[h];
191 }
192
193 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
194 {
195         struct hlist_head *head = ip_bucket(itn, &t->parms);
196
197         if (t->collect_md)
198                 rcu_assign_pointer(itn->collect_md_tun, t);
199         hlist_add_head_rcu(&t->hash_node, head);
200 }
201
202 static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
203 {
204         if (t->collect_md)
205                 rcu_assign_pointer(itn->collect_md_tun, NULL);
206         hlist_del_init_rcu(&t->hash_node);
207 }
208
209 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
210                                         struct ip_tunnel_parm *parms,
211                                         int type)
212 {
213         __be32 remote = parms->iph.daddr;
214         __be32 local = parms->iph.saddr;
215         __be32 key = parms->i_key;
216         __be16 flags = parms->i_flags;
217         int link = parms->link;
218         struct ip_tunnel *t = NULL;
219         struct hlist_head *head = ip_bucket(itn, parms);
220
221         hlist_for_each_entry_rcu(t, head, hash_node) {
222                 if (local == t->parms.iph.saddr &&
223                     remote == t->parms.iph.daddr &&
224                     link == t->parms.link &&
225                     type == t->dev->type &&
226                     ip_tunnel_key_match(&t->parms, flags, key))
227                         break;
228         }
229         return t;
230 }
231
232 static struct net_device *__ip_tunnel_create(struct net *net,
233                                              const struct rtnl_link_ops *ops,
234                                              struct ip_tunnel_parm *parms)
235 {
236         int err;
237         struct ip_tunnel *tunnel;
238         struct net_device *dev;
239         char name[IFNAMSIZ];
240
241         err = -E2BIG;
242         if (parms->name[0]) {
243                 if (!dev_valid_name(parms->name))
244                         goto failed;
245                 strscpy(name, parms->name, IFNAMSIZ);
246         } else {
247                 if (strlen(ops->kind) > (IFNAMSIZ - 3))
248                         goto failed;
249                 strcpy(name, ops->kind);
250                 strcat(name, "%d");
251         }
252
253         ASSERT_RTNL();
254         dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
255         if (!dev) {
256                 err = -ENOMEM;
257                 goto failed;
258         }
259         dev_net_set(dev, net);
260
261         dev->rtnl_link_ops = ops;
262
263         tunnel = netdev_priv(dev);
264         tunnel->parms = *parms;
265         tunnel->net = net;
266
267         err = register_netdevice(dev);
268         if (err)
269                 goto failed_free;
270
271         return dev;
272
273 failed_free:
274         free_netdev(dev);
275 failed:
276         return ERR_PTR(err);
277 }
278
279 static int ip_tunnel_bind_dev(struct net_device *dev)
280 {
281         struct net_device *tdev = NULL;
282         struct ip_tunnel *tunnel = netdev_priv(dev);
283         const struct iphdr *iph;
284         int hlen = LL_MAX_HEADER;
285         int mtu = ETH_DATA_LEN;
286         int t_hlen = tunnel->hlen + sizeof(struct iphdr);
287
288         iph = &tunnel->parms.iph;
289
290         /* Guess output device to choose reasonable mtu and needed_headroom */
291         if (iph->daddr) {
292                 struct flowi4 fl4;
293                 struct rtable *rt;
294
295                 ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
296                                     iph->saddr, tunnel->parms.o_key,
297                                     RT_TOS(iph->tos), dev_net(dev),
298                                     tunnel->parms.link, tunnel->fwmark, 0);
299                 rt = ip_route_output_key(tunnel->net, &fl4);
300
301                 if (!IS_ERR(rt)) {
302                         tdev = rt->dst.dev;
303                         ip_rt_put(rt);
304                 }
305                 if (dev->type != ARPHRD_ETHER)
306                         dev->flags |= IFF_POINTOPOINT;
307
308                 dst_cache_reset(&tunnel->dst_cache);
309         }
310
311         if (!tdev && tunnel->parms.link)
312                 tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
313
314         if (tdev) {
315                 hlen = tdev->hard_header_len + tdev->needed_headroom;
316                 mtu = min(tdev->mtu, IP_MAX_MTU);
317         }
318
319         dev->needed_headroom = t_hlen + hlen;
320         mtu -= t_hlen + (dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0);
321
322         if (mtu < IPV4_MIN_MTU)
323                 mtu = IPV4_MIN_MTU;
324
325         return mtu;
326 }
327
328 static struct ip_tunnel *ip_tunnel_create(struct net *net,
329                                           struct ip_tunnel_net *itn,
330                                           struct ip_tunnel_parm *parms)
331 {
332         struct ip_tunnel *nt;
333         struct net_device *dev;
334         int t_hlen;
335         int mtu;
336         int err;
337
338         dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
339         if (IS_ERR(dev))
340                 return ERR_CAST(dev);
341
342         mtu = ip_tunnel_bind_dev(dev);
343         err = dev_set_mtu(dev, mtu);
344         if (err)
345                 goto err_dev_set_mtu;
346
347         nt = netdev_priv(dev);
348         t_hlen = nt->hlen + sizeof(struct iphdr);
349         dev->min_mtu = ETH_MIN_MTU;
350         dev->max_mtu = IP_MAX_MTU - t_hlen;
351         if (dev->type == ARPHRD_ETHER)
352                 dev->max_mtu -= dev->hard_header_len;
353
354         ip_tunnel_add(itn, nt);
355         return nt;
356
357 err_dev_set_mtu:
358         unregister_netdevice(dev);
359         return ERR_PTR(err);
360 }
361
362 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
363                   const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
364                   bool log_ecn_error)
365 {
366         const struct iphdr *iph = ip_hdr(skb);
367         int err;
368
369 #ifdef CONFIG_NET_IPGRE_BROADCAST
370         if (ipv4_is_multicast(iph->daddr)) {
371                 tunnel->dev->stats.multicast++;
372                 skb->pkt_type = PACKET_BROADCAST;
373         }
374 #endif
375
376         if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
377              ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
378                 tunnel->dev->stats.rx_crc_errors++;
379                 tunnel->dev->stats.rx_errors++;
380                 goto drop;
381         }
382
383         if (tunnel->parms.i_flags&TUNNEL_SEQ) {
384                 if (!(tpi->flags&TUNNEL_SEQ) ||
385                     (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
386                         tunnel->dev->stats.rx_fifo_errors++;
387                         tunnel->dev->stats.rx_errors++;
388                         goto drop;
389                 }
390                 tunnel->i_seqno = ntohl(tpi->seq) + 1;
391         }
392
393         skb_set_network_header(skb, (tunnel->dev->type == ARPHRD_ETHER) ? ETH_HLEN : 0);
394
395         err = IP_ECN_decapsulate(iph, skb);
396         if (unlikely(err)) {
397                 if (log_ecn_error)
398                         net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
399                                         &iph->saddr, iph->tos);
400                 if (err > 1) {
401                         ++tunnel->dev->stats.rx_frame_errors;
402                         ++tunnel->dev->stats.rx_errors;
403                         goto drop;
404                 }
405         }
406
407         dev_sw_netstats_rx_add(tunnel->dev, skb->len);
408         skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
409
410         if (tunnel->dev->type == ARPHRD_ETHER) {
411                 skb->protocol = eth_type_trans(skb, tunnel->dev);
412                 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
413         } else {
414                 skb->dev = tunnel->dev;
415         }
416
417         if (tun_dst)
418                 skb_dst_set(skb, (struct dst_entry *)tun_dst);
419
420         gro_cells_receive(&tunnel->gro_cells, skb);
421         return 0;
422
423 drop:
424         if (tun_dst)
425                 dst_release((struct dst_entry *)tun_dst);
426         kfree_skb(skb);
427         return 0;
428 }
429 EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
430
431 int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
432                             unsigned int num)
433 {
434         if (num >= MAX_IPTUN_ENCAP_OPS)
435                 return -ERANGE;
436
437         return !cmpxchg((const struct ip_tunnel_encap_ops **)
438                         &iptun_encaps[num],
439                         NULL, ops) ? 0 : -1;
440 }
441 EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
442
443 int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
444                             unsigned int num)
445 {
446         int ret;
447
448         if (num >= MAX_IPTUN_ENCAP_OPS)
449                 return -ERANGE;
450
451         ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
452                        &iptun_encaps[num],
453                        ops, NULL) == ops) ? 0 : -1;
454
455         synchronize_net();
456
457         return ret;
458 }
459 EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
460
461 int ip_tunnel_encap_setup(struct ip_tunnel *t,
462                           struct ip_tunnel_encap *ipencap)
463 {
464         int hlen;
465
466         memset(&t->encap, 0, sizeof(t->encap));
467
468         hlen = ip_encap_hlen(ipencap);
469         if (hlen < 0)
470                 return hlen;
471
472         t->encap.type = ipencap->type;
473         t->encap.sport = ipencap->sport;
474         t->encap.dport = ipencap->dport;
475         t->encap.flags = ipencap->flags;
476
477         t->encap_hlen = hlen;
478         t->hlen = t->encap_hlen + t->tun_hlen;
479
480         return 0;
481 }
482 EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
483
484 static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
485                             struct rtable *rt, __be16 df,
486                             const struct iphdr *inner_iph,
487                             int tunnel_hlen, __be32 dst, bool md)
488 {
489         struct ip_tunnel *tunnel = netdev_priv(dev);
490         int pkt_size;
491         int mtu;
492
493         tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
494         pkt_size = skb->len - tunnel_hlen;
495         pkt_size -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;
496
497         if (df) {
498                 mtu = dst_mtu(&rt->dst) - (sizeof(struct iphdr) + tunnel_hlen);
499                 mtu -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;
500         } else {
501                 mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
502         }
503
504         if (skb_valid_dst(skb))
505                 skb_dst_update_pmtu_no_confirm(skb, mtu);
506
507         if (skb->protocol == htons(ETH_P_IP)) {
508                 if (!skb_is_gso(skb) &&
509                     (inner_iph->frag_off & htons(IP_DF)) &&
510                     mtu < pkt_size) {
511                         icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
512                         return -E2BIG;
513                 }
514         }
515 #if IS_ENABLED(CONFIG_IPV6)
516         else if (skb->protocol == htons(ETH_P_IPV6)) {
517                 struct rt6_info *rt6;
518                 __be32 daddr;
519
520                 rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) :
521                                            NULL;
522                 daddr = md ? dst : tunnel->parms.iph.daddr;
523
524                 if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
525                            mtu >= IPV6_MIN_MTU) {
526                         if ((daddr && !ipv4_is_multicast(daddr)) ||
527                             rt6->rt6i_dst.plen == 128) {
528                                 rt6->rt6i_flags |= RTF_MODIFIED;
529                                 dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
530                         }
531                 }
532
533                 if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
534                                         mtu < pkt_size) {
535                         icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
536                         return -E2BIG;
537                 }
538         }
539 #endif
540         return 0;
541 }
542
543 void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
544                        u8 proto, int tunnel_hlen)
545 {
546         struct ip_tunnel *tunnel = netdev_priv(dev);
547         u32 headroom = sizeof(struct iphdr);
548         struct ip_tunnel_info *tun_info;
549         const struct ip_tunnel_key *key;
550         const struct iphdr *inner_iph;
551         struct rtable *rt = NULL;
552         struct flowi4 fl4;
553         __be16 df = 0;
554         u8 tos, ttl;
555         bool use_cache;
556
557         tun_info = skb_tunnel_info(skb);
558         if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
559                      ip_tunnel_info_af(tun_info) != AF_INET))
560                 goto tx_error;
561         key = &tun_info->key;
562         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
563         inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
564         tos = key->tos;
565         if (tos == 1) {
566                 if (skb->protocol == htons(ETH_P_IP))
567                         tos = inner_iph->tos;
568                 else if (skb->protocol == htons(ETH_P_IPV6))
569                         tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
570         }
571         ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
572                             tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
573                             dev_net(dev), 0, skb->mark, skb_get_hash(skb));
574         if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
575                 goto tx_error;
576
577         use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
578         if (use_cache)
579                 rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
580         if (!rt) {
581                 rt = ip_route_output_key(tunnel->net, &fl4);
582                 if (IS_ERR(rt)) {
583                         dev->stats.tx_carrier_errors++;
584                         goto tx_error;
585                 }
586                 if (use_cache)
587                         dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
588                                           fl4.saddr);
589         }
590         if (rt->dst.dev == dev) {
591                 ip_rt_put(rt);
592                 dev->stats.collisions++;
593                 goto tx_error;
594         }
595
596         if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
597                 df = htons(IP_DF);
598         if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
599                             key->u.ipv4.dst, true)) {
600                 ip_rt_put(rt);
601                 goto tx_error;
602         }
603
604         tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
605         ttl = key->ttl;
606         if (ttl == 0) {
607                 if (skb->protocol == htons(ETH_P_IP))
608                         ttl = inner_iph->ttl;
609                 else if (skb->protocol == htons(ETH_P_IPV6))
610                         ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
611                 else
612                         ttl = ip4_dst_hoplimit(&rt->dst);
613         }
614
615         headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
616         if (headroom > dev->needed_headroom)
617                 dev->needed_headroom = headroom;
618
619         if (skb_cow_head(skb, dev->needed_headroom)) {
620                 ip_rt_put(rt);
621                 goto tx_dropped;
622         }
623         iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
624                       df, !net_eq(tunnel->net, dev_net(dev)));
625         return;
626 tx_error:
627         dev->stats.tx_errors++;
628         goto kfree;
629 tx_dropped:
630         dev->stats.tx_dropped++;
631 kfree:
632         kfree_skb(skb);
633 }
634 EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
635
636 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
637                     const struct iphdr *tnl_params, u8 protocol)
638 {
639         struct ip_tunnel *tunnel = netdev_priv(dev);
640         struct ip_tunnel_info *tun_info = NULL;
641         const struct iphdr *inner_iph;
642         unsigned int max_headroom;      /* The extra header space needed */
643         struct rtable *rt = NULL;               /* Route to the other host */
644         __be16 payload_protocol;
645         bool use_cache = false;
646         struct flowi4 fl4;
647         bool md = false;
648         bool connected;
649         u8 tos, ttl;
650         __be32 dst;
651         __be16 df;
652
653         inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
654         connected = (tunnel->parms.iph.daddr != 0);
655         payload_protocol = skb_protocol(skb, true);
656
657         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
658
659         dst = tnl_params->daddr;
660         if (dst == 0) {
661                 /* NBMA tunnel */
662
663                 if (!skb_dst(skb)) {
664                         dev->stats.tx_fifo_errors++;
665                         goto tx_error;
666                 }
667
668                 tun_info = skb_tunnel_info(skb);
669                 if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
670                     ip_tunnel_info_af(tun_info) == AF_INET &&
671                     tun_info->key.u.ipv4.dst) {
672                         dst = tun_info->key.u.ipv4.dst;
673                         md = true;
674                         connected = true;
675                 } else if (payload_protocol == htons(ETH_P_IP)) {
676                         rt = skb_rtable(skb);
677                         dst = rt_nexthop(rt, inner_iph->daddr);
678                 }
679 #if IS_ENABLED(CONFIG_IPV6)
680                 else if (payload_protocol == htons(ETH_P_IPV6)) {
681                         const struct in6_addr *addr6;
682                         struct neighbour *neigh;
683                         bool do_tx_error_icmp;
684                         int addr_type;
685
686                         neigh = dst_neigh_lookup(skb_dst(skb),
687                                                  &ipv6_hdr(skb)->daddr);
688                         if (!neigh)
689                                 goto tx_error;
690
691                         addr6 = (const struct in6_addr *)&neigh->primary_key;
692                         addr_type = ipv6_addr_type(addr6);
693
694                         if (addr_type == IPV6_ADDR_ANY) {
695                                 addr6 = &ipv6_hdr(skb)->daddr;
696                                 addr_type = ipv6_addr_type(addr6);
697                         }
698
699                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
700                                 do_tx_error_icmp = true;
701                         else {
702                                 do_tx_error_icmp = false;
703                                 dst = addr6->s6_addr32[3];
704                         }
705                         neigh_release(neigh);
706                         if (do_tx_error_icmp)
707                                 goto tx_error_icmp;
708                 }
709 #endif
710                 else
711                         goto tx_error;
712
713                 if (!md)
714                         connected = false;
715         }
716
717         tos = tnl_params->tos;
718         if (tos & 0x1) {
719                 tos &= ~0x1;
720                 if (payload_protocol == htons(ETH_P_IP)) {
721                         tos = inner_iph->tos;
722                         connected = false;
723                 } else if (payload_protocol == htons(ETH_P_IPV6)) {
724                         tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
725                         connected = false;
726                 }
727         }
728
729         ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
730                             tunnel->parms.o_key, RT_TOS(tos),
731                             dev_net(dev), tunnel->parms.link,
732                             tunnel->fwmark, skb_get_hash(skb));
733
734         if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
735                 goto tx_error;
736
737         if (connected && md) {
738                 use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
739                 if (use_cache)
740                         rt = dst_cache_get_ip4(&tun_info->dst_cache,
741                                                &fl4.saddr);
742         } else {
743                 rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
744                                                 &fl4.saddr) : NULL;
745         }
746
747         if (!rt) {
748                 rt = ip_route_output_key(tunnel->net, &fl4);
749
750                 if (IS_ERR(rt)) {
751                         dev->stats.tx_carrier_errors++;
752                         goto tx_error;
753                 }
754                 if (use_cache)
755                         dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
756                                           fl4.saddr);
757                 else if (!md && connected)
758                         dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
759                                           fl4.saddr);
760         }
761
762         if (rt->dst.dev == dev) {
763                 ip_rt_put(rt);
764                 dev->stats.collisions++;
765                 goto tx_error;
766         }
767
768         df = tnl_params->frag_off;
769         if (payload_protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
770                 df |= (inner_iph->frag_off & htons(IP_DF));
771
772         if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) {
773                 ip_rt_put(rt);
774                 goto tx_error;
775         }
776
777         if (tunnel->err_count > 0) {
778                 if (time_before(jiffies,
779                                 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
780                         tunnel->err_count--;
781
782                         dst_link_failure(skb);
783                 } else
784                         tunnel->err_count = 0;
785         }
786
787         tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
788         ttl = tnl_params->ttl;
789         if (ttl == 0) {
790                 if (payload_protocol == htons(ETH_P_IP))
791                         ttl = inner_iph->ttl;
792 #if IS_ENABLED(CONFIG_IPV6)
793                 else if (payload_protocol == htons(ETH_P_IPV6))
794                         ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
795 #endif
796                 else
797                         ttl = ip4_dst_hoplimit(&rt->dst);
798         }
799
800         max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
801                         + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
802         if (max_headroom > dev->needed_headroom)
803                 dev->needed_headroom = max_headroom;
804
805         if (skb_cow_head(skb, dev->needed_headroom)) {
806                 ip_rt_put(rt);
807                 dev->stats.tx_dropped++;
808                 kfree_skb(skb);
809                 return;
810         }
811
812         iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
813                       df, !net_eq(tunnel->net, dev_net(dev)));
814         return;
815
816 #if IS_ENABLED(CONFIG_IPV6)
817 tx_error_icmp:
818         dst_link_failure(skb);
819 #endif
820 tx_error:
821         dev->stats.tx_errors++;
822         kfree_skb(skb);
823 }
824 EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
825
826 static void ip_tunnel_update(struct ip_tunnel_net *itn,
827                              struct ip_tunnel *t,
828                              struct net_device *dev,
829                              struct ip_tunnel_parm *p,
830                              bool set_mtu,
831                              __u32 fwmark)
832 {
833         ip_tunnel_del(itn, t);
834         t->parms.iph.saddr = p->iph.saddr;
835         t->parms.iph.daddr = p->iph.daddr;
836         t->parms.i_key = p->i_key;
837         t->parms.o_key = p->o_key;
838         if (dev->type != ARPHRD_ETHER) {
839                 __dev_addr_set(dev, &p->iph.saddr, 4);
840                 memcpy(dev->broadcast, &p->iph.daddr, 4);
841         }
842         ip_tunnel_add(itn, t);
843
844         t->parms.iph.ttl = p->iph.ttl;
845         t->parms.iph.tos = p->iph.tos;
846         t->parms.iph.frag_off = p->iph.frag_off;
847
848         if (t->parms.link != p->link || t->fwmark != fwmark) {
849                 int mtu;
850
851                 t->parms.link = p->link;
852                 t->fwmark = fwmark;
853                 mtu = ip_tunnel_bind_dev(dev);
854                 if (set_mtu)
855                         dev->mtu = mtu;
856         }
857         dst_cache_reset(&t->dst_cache);
858         netdev_state_change(dev);
859 }
860
861 int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
862 {
863         int err = 0;
864         struct ip_tunnel *t = netdev_priv(dev);
865         struct net *net = t->net;
866         struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
867
868         switch (cmd) {
869         case SIOCGETTUNNEL:
870                 if (dev == itn->fb_tunnel_dev) {
871                         t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
872                         if (!t)
873                                 t = netdev_priv(dev);
874                 }
875                 memcpy(p, &t->parms, sizeof(*p));
876                 break;
877
878         case SIOCADDTUNNEL:
879         case SIOCCHGTUNNEL:
880                 err = -EPERM;
881                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
882                         goto done;
883                 if (p->iph.ttl)
884                         p->iph.frag_off |= htons(IP_DF);
885                 if (!(p->i_flags & VTI_ISVTI)) {
886                         if (!(p->i_flags & TUNNEL_KEY))
887                                 p->i_key = 0;
888                         if (!(p->o_flags & TUNNEL_KEY))
889                                 p->o_key = 0;
890                 }
891
892                 t = ip_tunnel_find(itn, p, itn->type);
893
894                 if (cmd == SIOCADDTUNNEL) {
895                         if (!t) {
896                                 t = ip_tunnel_create(net, itn, p);
897                                 err = PTR_ERR_OR_ZERO(t);
898                                 break;
899                         }
900
901                         err = -EEXIST;
902                         break;
903                 }
904                 if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
905                         if (t) {
906                                 if (t->dev != dev) {
907                                         err = -EEXIST;
908                                         break;
909                                 }
910                         } else {
911                                 unsigned int nflags = 0;
912
913                                 if (ipv4_is_multicast(p->iph.daddr))
914                                         nflags = IFF_BROADCAST;
915                                 else if (p->iph.daddr)
916                                         nflags = IFF_POINTOPOINT;
917
918                                 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
919                                         err = -EINVAL;
920                                         break;
921                                 }
922
923                                 t = netdev_priv(dev);
924                         }
925                 }
926
927                 if (t) {
928                         err = 0;
929                         ip_tunnel_update(itn, t, dev, p, true, 0);
930                 } else {
931                         err = -ENOENT;
932                 }
933                 break;
934
935         case SIOCDELTUNNEL:
936                 err = -EPERM;
937                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
938                         goto done;
939
940                 if (dev == itn->fb_tunnel_dev) {
941                         err = -ENOENT;
942                         t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
943                         if (!t)
944                                 goto done;
945                         err = -EPERM;
946                         if (t == netdev_priv(itn->fb_tunnel_dev))
947                                 goto done;
948                         dev = t->dev;
949                 }
950                 unregister_netdevice(dev);
951                 err = 0;
952                 break;
953
954         default:
955                 err = -EINVAL;
956         }
957
958 done:
959         return err;
960 }
961 EXPORT_SYMBOL_GPL(ip_tunnel_ctl);
962
963 int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
964                              void __user *data, int cmd)
965 {
966         struct ip_tunnel_parm p;
967         int err;
968
969         if (copy_from_user(&p, data, sizeof(p)))
970                 return -EFAULT;
971         err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd);
972         if (!err && copy_to_user(data, &p, sizeof(p)))
973                 return -EFAULT;
974         return err;
975 }
976 EXPORT_SYMBOL_GPL(ip_tunnel_siocdevprivate);
977
978 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
979 {
980         struct ip_tunnel *tunnel = netdev_priv(dev);
981         int t_hlen = tunnel->hlen + sizeof(struct iphdr);
982         int max_mtu = IP_MAX_MTU - t_hlen;
983
984         if (dev->type == ARPHRD_ETHER)
985                 max_mtu -= dev->hard_header_len;
986
987         if (new_mtu < ETH_MIN_MTU)
988                 return -EINVAL;
989
990         if (new_mtu > max_mtu) {
991                 if (strict)
992                         return -EINVAL;
993
994                 new_mtu = max_mtu;
995         }
996
997         dev->mtu = new_mtu;
998         return 0;
999 }
1000 EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
1001
1002 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1003 {
1004         return __ip_tunnel_change_mtu(dev, new_mtu, true);
1005 }
1006 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
1007
1008 static void ip_tunnel_dev_free(struct net_device *dev)
1009 {
1010         struct ip_tunnel *tunnel = netdev_priv(dev);
1011
1012         gro_cells_destroy(&tunnel->gro_cells);
1013         dst_cache_destroy(&tunnel->dst_cache);
1014         free_percpu(dev->tstats);
1015 }
1016
1017 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
1018 {
1019         struct ip_tunnel *tunnel = netdev_priv(dev);
1020         struct ip_tunnel_net *itn;
1021
1022         itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
1023
1024         if (itn->fb_tunnel_dev != dev) {
1025                 ip_tunnel_del(itn, netdev_priv(dev));
1026                 unregister_netdevice_queue(dev, head);
1027         }
1028 }
1029 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
1030
1031 struct net *ip_tunnel_get_link_net(const struct net_device *dev)
1032 {
1033         struct ip_tunnel *tunnel = netdev_priv(dev);
1034
1035         return tunnel->net;
1036 }
1037 EXPORT_SYMBOL(ip_tunnel_get_link_net);
1038
1039 int ip_tunnel_get_iflink(const struct net_device *dev)
1040 {
1041         struct ip_tunnel *tunnel = netdev_priv(dev);
1042
1043         return tunnel->parms.link;
1044 }
1045 EXPORT_SYMBOL(ip_tunnel_get_iflink);
1046
1047 int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
1048                                   struct rtnl_link_ops *ops, char *devname)
1049 {
1050         struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
1051         struct ip_tunnel_parm parms;
1052         unsigned int i;
1053
1054         itn->rtnl_link_ops = ops;
1055         for (i = 0; i < IP_TNL_HASH_SIZE; i++)
1056                 INIT_HLIST_HEAD(&itn->tunnels[i]);
1057
1058         if (!ops || !net_has_fallback_tunnels(net)) {
1059                 struct ip_tunnel_net *it_init_net;
1060
1061                 it_init_net = net_generic(&init_net, ip_tnl_net_id);
1062                 itn->type = it_init_net->type;
1063                 itn->fb_tunnel_dev = NULL;
1064                 return 0;
1065         }
1066
1067         memset(&parms, 0, sizeof(parms));
1068         if (devname)
1069                 strscpy(parms.name, devname, IFNAMSIZ);
1070
1071         rtnl_lock();
1072         itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
1073         /* FB netdevice is special: we have one, and only one per netns.
1074          * Allowing to move it to another netns is clearly unsafe.
1075          */
1076         if (!IS_ERR(itn->fb_tunnel_dev)) {
1077                 itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
1078                 itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
1079                 ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
1080                 itn->type = itn->fb_tunnel_dev->type;
1081         }
1082         rtnl_unlock();
1083
1084         return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
1085 }
1086 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
1087
1088 static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
1089                               struct list_head *head,
1090                               struct rtnl_link_ops *ops)
1091 {
1092         struct net_device *dev, *aux;
1093         int h;
1094
1095         for_each_netdev_safe(net, dev, aux)
1096                 if (dev->rtnl_link_ops == ops)
1097                         unregister_netdevice_queue(dev, head);
1098
1099         for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
1100                 struct ip_tunnel *t;
1101                 struct hlist_node *n;
1102                 struct hlist_head *thead = &itn->tunnels[h];
1103
1104                 hlist_for_each_entry_safe(t, n, thead, hash_node)
1105                         /* If dev is in the same netns, it has already
1106                          * been added to the list by the previous loop.
1107                          */
1108                         if (!net_eq(dev_net(t->dev), net))
1109                                 unregister_netdevice_queue(t->dev, head);
1110         }
1111 }
1112
1113 void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
1114                            struct rtnl_link_ops *ops)
1115 {
1116         struct ip_tunnel_net *itn;
1117         struct net *net;
1118         LIST_HEAD(list);
1119
1120         rtnl_lock();
1121         list_for_each_entry(net, net_list, exit_list) {
1122                 itn = net_generic(net, id);
1123                 ip_tunnel_destroy(net, itn, &list, ops);
1124         }
1125         unregister_netdevice_many(&list);
1126         rtnl_unlock();
1127 }
1128 EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);
1129
1130 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
1131                       struct ip_tunnel_parm *p, __u32 fwmark)
1132 {
1133         struct ip_tunnel *nt;
1134         struct net *net = dev_net(dev);
1135         struct ip_tunnel_net *itn;
1136         int mtu;
1137         int err;
1138
1139         nt = netdev_priv(dev);
1140         itn = net_generic(net, nt->ip_tnl_net_id);
1141
1142         if (nt->collect_md) {
1143                 if (rtnl_dereference(itn->collect_md_tun))
1144                         return -EEXIST;
1145         } else {
1146                 if (ip_tunnel_find(itn, p, dev->type))
1147                         return -EEXIST;
1148         }
1149
1150         nt->net = net;
1151         nt->parms = *p;
1152         nt->fwmark = fwmark;
1153         err = register_netdevice(dev);
1154         if (err)
1155                 goto err_register_netdevice;
1156
1157         if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1158                 eth_hw_addr_random(dev);
1159
1160         mtu = ip_tunnel_bind_dev(dev);
1161         if (tb[IFLA_MTU]) {
1162                 unsigned int max = IP_MAX_MTU - (nt->hlen + sizeof(struct iphdr));
1163
1164                 if (dev->type == ARPHRD_ETHER)
1165                         max -= dev->hard_header_len;
1166
1167                 mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, max);
1168         }
1169
1170         err = dev_set_mtu(dev, mtu);
1171         if (err)
1172                 goto err_dev_set_mtu;
1173
1174         ip_tunnel_add(itn, nt);
1175         return 0;
1176
1177 err_dev_set_mtu:
1178         unregister_netdevice(dev);
1179 err_register_netdevice:
1180         return err;
1181 }
1182 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1183
1184 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1185                          struct ip_tunnel_parm *p, __u32 fwmark)
1186 {
1187         struct ip_tunnel *t;
1188         struct ip_tunnel *tunnel = netdev_priv(dev);
1189         struct net *net = tunnel->net;
1190         struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1191
1192         if (dev == itn->fb_tunnel_dev)
1193                 return -EINVAL;
1194
1195         t = ip_tunnel_find(itn, p, dev->type);
1196
1197         if (t) {
1198                 if (t->dev != dev)
1199                         return -EEXIST;
1200         } else {
1201                 t = tunnel;
1202
1203                 if (dev->type != ARPHRD_ETHER) {
1204                         unsigned int nflags = 0;
1205
1206                         if (ipv4_is_multicast(p->iph.daddr))
1207                                 nflags = IFF_BROADCAST;
1208                         else if (p->iph.daddr)
1209                                 nflags = IFF_POINTOPOINT;
1210
1211                         if ((dev->flags ^ nflags) &
1212                             (IFF_POINTOPOINT | IFF_BROADCAST))
1213                                 return -EINVAL;
1214                 }
1215         }
1216
1217         ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
1218         return 0;
1219 }
1220 EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1221
1222 int ip_tunnel_init(struct net_device *dev)
1223 {
1224         struct ip_tunnel *tunnel = netdev_priv(dev);
1225         struct iphdr *iph = &tunnel->parms.iph;
1226         int err;
1227
1228         dev->needs_free_netdev = true;
1229         dev->priv_destructor = ip_tunnel_dev_free;
1230         dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1231         if (!dev->tstats)
1232                 return -ENOMEM;
1233
1234         err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
1235         if (err) {
1236                 free_percpu(dev->tstats);
1237                 return err;
1238         }
1239
1240         err = gro_cells_init(&tunnel->gro_cells, dev);
1241         if (err) {
1242                 dst_cache_destroy(&tunnel->dst_cache);
1243                 free_percpu(dev->tstats);
1244                 return err;
1245         }
1246
1247         tunnel->dev = dev;
1248         tunnel->net = dev_net(dev);
1249         strcpy(tunnel->parms.name, dev->name);
1250         iph->version            = 4;
1251         iph->ihl                = 5;
1252
1253         if (tunnel->collect_md)
1254                 netif_keep_dst(dev);
1255         return 0;
1256 }
1257 EXPORT_SYMBOL_GPL(ip_tunnel_init);
1258
1259 void ip_tunnel_uninit(struct net_device *dev)
1260 {
1261         struct ip_tunnel *tunnel = netdev_priv(dev);
1262         struct net *net = tunnel->net;
1263         struct ip_tunnel_net *itn;
1264
1265         itn = net_generic(net, tunnel->ip_tnl_net_id);
1266         ip_tunnel_del(itn, netdev_priv(dev));
1267         if (itn->fb_tunnel_dev == dev)
1268                 WRITE_ONCE(itn->fb_tunnel_dev, NULL);
1269
1270         dst_cache_reset(&tunnel->dst_cache);
1271 }
1272 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1273
1274 /* Do least required initialization, rest of init is done in tunnel_init call */
1275 void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
1276 {
1277         struct ip_tunnel *tunnel = netdev_priv(dev);
1278         tunnel->ip_tnl_net_id = net_id;
1279 }
1280 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1281
1282 MODULE_LICENSE("GPL");