/*
 *	Linux IPv6 multicast routing support for BSD pim6sd
 *	Based on net/ipv4/ipmr.c.
 *
 *	(c) 2004 Mickael Hoerdt, <hoerdt@clarinet.u-strasbg.fr>
 *		LSIIT Laboratory, Strasbourg, France
 *	(c) 2004 Jean-Philippe Andriot, <jean-philippe.andriot@6WIND.com>
 *	Copyright (C)2007,2008 USAGI/WIDE Project
 *		YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */
#include <asm/system.h>
#include <asm/uaccess.h>
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/init.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/raw.h>
#include <linux/notifier.h>
#include <linux/if_arp.h>
#include <net/checksum.h>
#include <net/netlink.h>

#include <net/ipv6.h>
#include <net/ip6_route.h>
#include <linux/mroute6.h>
#include <linux/pim.h>
#include <net/addrconf.h>
#include <linux/netfilter_ipv6.h>
/* Big lock, protecting vif table, mrt cache and mroute socket state.
   Note that the changes are semaphored via rtnl_lock.
 */

static DEFINE_RWLOCK(mrt_lock);
/*
 *	Multicast router control variables
 */
#define MIF_EXISTS(_net, _idx) ((_net)->ipv6.vif6_table[_idx].dev != NULL)

static struct mfc6_cache *mfc_unres_queue;	/* Queue of unresolved entries */

/* Special spinlock for queue of unresolved entries */
static DEFINE_SPINLOCK(mfc_unres_lock);
/* We return to original Alan's scheme. Hash table of resolved
   entries is changed only in process context and protected
   with weak lock mrt_lock. Queue of unresolved entries is protected
   with strong spinlock mfc_unres_lock.

   In this case data path is free of exclusive locks at all.
 */
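
/*
 * Lock usage at a glance (a summary of the code below, not a formal
 * contract): the packet path only ever takes read_lock(&mrt_lock);
 * configuration paths (mif6_add/mif6_delete, ip6mr_mfc_add/_delete)
 * take write_lock_bh(&mrt_lock) just for the brief table update, and
 * the unresolved queue is always walked under mfc_unres_lock.
 */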
static struct kmem_cache *mrt_cachep __read_mostly;

static int ip6_mr_forward(struct sk_buff *skb, struct mfc6_cache *cache);
static int ip6mr_cache_report(struct net *net, struct sk_buff *pkt,
			      mifi_t mifi, int assert);
static int ip6mr_fill_mroute(struct sk_buff *skb, struct mfc6_cache *c, struct rtmsg *rtm);
static void mroute_clean_tables(struct net *net);
#ifdef CONFIG_IPV6_PIMSM_V2
static struct inet6_protocol pim6_protocol;
#endif

static struct timer_list ipmr_expire_timer;
struct ipmr_mfc_iter {
	struct seq_net_private p;
	struct mfc6_cache **cache;
	int ct;
};
static struct mfc6_cache *ipmr_mfc_seq_idx(struct net *net,
					   struct ipmr_mfc_iter *it, loff_t pos)
{
	struct mfc6_cache *mfc;

	it->cache = net->ipv6.mfc6_cache_array;
	read_lock(&mrt_lock);
	for (it->ct = 0; it->ct < MFC6_LINES; it->ct++)
		for (mfc = net->ipv6.mfc6_cache_array[it->ct];
		     mfc; mfc = mfc->next)
			if (pos-- == 0)
				return mfc;
	read_unlock(&mrt_lock);

	it->cache = &mfc_unres_queue;
	spin_lock_bh(&mfc_unres_lock);
	for (mfc = mfc_unres_queue; mfc; mfc = mfc->next)
		if (net_eq(mfc6_net(mfc), net) &&
		    pos-- == 0)
			return mfc;
	spin_unlock_bh(&mfc_unres_lock);

	it->cache = NULL;
	return NULL;
}
/*
 *	The /proc interfaces to multicast routing /proc/ip6_mr_cache /proc/ip6_mr_vif
 */
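
/*
 * Example: reading these files yields tables like the following (the
 * values are illustrative only; the columns match the seq_show
 * handlers below):
 *
 *	# cat /proc/net/ip6_mr_vif
 *	Interface      BytesIn  PktsIn  BytesOut PktsOut Flags
 *	 0 eth0           9540      53     12080      67 00000
 *	 1 pim6reg           0       0      1220      10 00001
 */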
struct ipmr_vif_iter {
	struct seq_net_private p;
	int ct;
};
static struct mif_device *ip6mr_vif_seq_idx(struct net *net,
					    struct ipmr_vif_iter *iter,
					    loff_t pos)
{
	for (iter->ct = 0; iter->ct < net->ipv6.maxvif; ++iter->ct) {
		if (!MIF_EXISTS(net, iter->ct))
			continue;
		if (pos-- == 0)
			return &net->ipv6.vif6_table[iter->ct];
	}
	return NULL;
}
static void *ip6mr_vif_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(mrt_lock)
{
	struct net *net = seq_file_net(seq);

	read_lock(&mrt_lock);
	return *pos ? ip6mr_vif_seq_idx(net, seq->private, *pos - 1)
		: SEQ_START_TOKEN;
}
static void *ip6mr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct ipmr_vif_iter *iter = seq->private;
	struct net *net = seq_file_net(seq);

	++*pos;
	if (v == SEQ_START_TOKEN)
		return ip6mr_vif_seq_idx(net, iter, 0);

	while (++iter->ct < net->ipv6.maxvif) {
		if (!MIF_EXISTS(net, iter->ct))
			continue;
		return &net->ipv6.vif6_table[iter->ct];
	}
	return NULL;
}
static void ip6mr_vif_seq_stop(struct seq_file *seq, void *v)
	__releases(mrt_lock)
{
	read_unlock(&mrt_lock);
}
static int ip6mr_vif_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = seq_file_net(seq);

	if (v == SEQ_START_TOKEN) {
		seq_puts(seq,
			 "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags\n");
	} else {
		const struct mif_device *vif = v;
		const char *name = vif->dev ? vif->dev->name : "none";

		seq_printf(seq,
			   "%2td %-10s %8ld %7ld  %8ld %7ld %05X\n",
			   vif - net->ipv6.vif6_table,
			   name, vif->bytes_in, vif->pkt_in,
			   vif->bytes_out, vif->pkt_out,
			   vif->flags);
	}
	return 0;
}
static struct seq_operations ip6mr_vif_seq_ops = {
	.start = ip6mr_vif_seq_start,
	.next  = ip6mr_vif_seq_next,
	.stop  = ip6mr_vif_seq_stop,
	.show  = ip6mr_vif_seq_show,
};
static int ip6mr_vif_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &ip6mr_vif_seq_ops,
			    sizeof(struct ipmr_vif_iter));
}
static struct file_operations ip6mr_vif_fops = {
	.owner	 = THIS_MODULE,
	.open	 = ip6mr_vif_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};
static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct net *net = seq_file_net(seq);

	return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1)
		: SEQ_START_TOKEN;
}
static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct mfc6_cache *mfc = v;
	struct ipmr_mfc_iter *it = seq->private;
	struct net *net = seq_file_net(seq);

	++*pos;

	if (v == SEQ_START_TOKEN)
		return ipmr_mfc_seq_idx(net, seq->private, 0);

	if (mfc->next)
		return mfc->next;

	if (it->cache == &mfc_unres_queue)
		goto end_of_list;

	BUG_ON(it->cache != net->ipv6.mfc6_cache_array);

	while (++it->ct < MFC6_LINES) {
		mfc = net->ipv6.mfc6_cache_array[it->ct];
		if (mfc)
			return mfc;
	}

	/* exhausted cache_array, show unresolved */
	read_unlock(&mrt_lock);
	it->cache = &mfc_unres_queue;
	it->ct = 0;

	spin_lock_bh(&mfc_unres_lock);
	mfc = mfc_unres_queue;
	if (mfc)
		return mfc;

 end_of_list:
	spin_unlock_bh(&mfc_unres_lock);
	it->cache = NULL;

	return NULL;
}
static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
{
	struct ipmr_mfc_iter *it = seq->private;
	struct net *net = seq_file_net(seq);

	if (it->cache == &mfc_unres_queue)
		spin_unlock_bh(&mfc_unres_lock);
	else if (it->cache == net->ipv6.mfc6_cache_array)
		read_unlock(&mrt_lock);
}
static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
{
	int n;
	struct net *net = seq_file_net(seq);

	if (v == SEQ_START_TOKEN) {
		seq_puts(seq,
			 "Group                            "
			 "Origin                           "
			 "Iif      Pkts  Bytes     Wrong  Oifs\n");
	} else {
		const struct mfc6_cache *mfc = v;
		const struct ipmr_mfc_iter *it = seq->private;

		seq_printf(seq, "%pI6 %pI6 %-3hd",
			   &mfc->mf6c_mcastgrp, &mfc->mf6c_origin,
			   mfc->mf6c_parent);

		if (it->cache != &mfc_unres_queue) {
			seq_printf(seq, " %8lu %8lu %8lu",
				   mfc->mfc_un.res.pkt,
				   mfc->mfc_un.res.bytes,
				   mfc->mfc_un.res.wrong_if);
			for (n = mfc->mfc_un.res.minvif;
			     n < mfc->mfc_un.res.maxvif; n++) {
				if (MIF_EXISTS(net, n) &&
				    mfc->mfc_un.res.ttls[n] < 255)
					seq_printf(seq,
						   " %2d:%-3d",
						   n, mfc->mfc_un.res.ttls[n]);
			}
		} else {
			/* unresolved mfc_caches don't contain
			 * pkt, bytes and wrong_if values
			 */
			seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul);
		}
		seq_putc(seq, '\n');
	}
	return 0;
}
static struct seq_operations ipmr_mfc_seq_ops = {
	.start = ipmr_mfc_seq_start,
	.next  = ipmr_mfc_seq_next,
	.stop  = ipmr_mfc_seq_stop,
	.show  = ipmr_mfc_seq_show,
};
static int ipmr_mfc_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &ipmr_mfc_seq_ops,
			    sizeof(struct ipmr_mfc_iter));
}
static struct file_operations ip6mr_mfc_fops = {
	.owner	 = THIS_MODULE,
	.open	 = ipmr_mfc_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};
#ifdef CONFIG_IPV6_PIMSM_V2
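
/*
 * A PIM-SM register packet, as handled here, looks like this once the
 * IPv6 input path has stripped the outer header:
 *
 *	[ struct pimreghdr | inner IPv6 header | inner payload ... ]
 *
 * pim6_rcv() validates the register header and checksum, then pulls
 * everything up to the encapsulated header so the inner multicast
 * packet can be re-injected through the pim6reg device.
 */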
static int pim6_rcv(struct sk_buff *skb)
{
	struct pimreghdr *pim;
	struct ipv6hdr *encap;
	struct net_device *reg_dev = NULL;
	struct net *net = dev_net(skb->dev);
	int reg_vif_num = net->ipv6.mroute_reg_vif_num;

	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
		goto drop;

	pim = (struct pimreghdr *)skb_transport_header(skb);
	if (pim->type != ((PIM_VERSION << 4) | PIM_REGISTER) ||
	    (pim->flags & PIM_NULL_REGISTER) ||
	    (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
	     csum_fold(skb_checksum(skb, 0, skb->len, 0))))
		goto drop;

	/* check if the inner packet is destined to mcast group */
	encap = (struct ipv6hdr *)(skb_transport_header(skb) +
				   sizeof(*pim));

	if (!ipv6_addr_is_multicast(&encap->daddr) ||
	    encap->payload_len == 0 ||
	    ntohs(encap->payload_len) + sizeof(*pim) > skb->len)
		goto drop;

	read_lock(&mrt_lock);
	if (reg_vif_num >= 0)
		reg_dev = net->ipv6.vif6_table[reg_vif_num].dev;
	if (reg_dev)
		dev_hold(reg_dev);
	read_unlock(&mrt_lock);

	if (reg_dev == NULL)
		goto drop;

	skb->mac_header = skb->network_header;
	skb_pull(skb, (u8 *)encap - skb->data);
	skb_reset_network_header(skb);
	skb->dev = reg_dev;
	skb->protocol = htons(ETH_P_IPV6);
	skb->ip_summed = 0;
	skb->pkt_type = PACKET_HOST;
	dst_release(skb->dst);
	reg_dev->stats.rx_bytes += skb->len;
	reg_dev->stats.rx_packets++;
	skb->dst = NULL;
	nf_reset(skb);
	netif_rx(skb);
	dev_put(reg_dev);
	return 0;
 drop:
	kfree_skb(skb);
	return 0;
}
static struct inet6_protocol pim6_protocol = {
	.handler	=	pim6_rcv,
};
/* Service routines creating virtual interfaces: PIMREG */
static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct net *net = dev_net(dev);

	read_lock(&mrt_lock);
	dev->stats.tx_bytes += skb->len;
	dev->stats.tx_packets++;
	ip6mr_cache_report(net, skb, net->ipv6.mroute_reg_vif_num,
			   MRT6MSG_WHOLEPKT);
	read_unlock(&mrt_lock);
	kfree_skb(skb);
	return 0;
}
static const struct net_device_ops reg_vif_netdev_ops = {
	.ndo_start_xmit	= reg_vif_xmit,
};
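
/*
 * About the MTU below: an Ethernet-ish 1500 bytes minus the outer IPv6
 * header and the 8-byte PIM register header leaves room for the
 * encapsulated packet. The pim6reg pseudo-device never ARPs and frees
 * itself on unregister via free_netdev.
 */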
static void reg_vif_setup(struct net_device *dev)
{
	dev->type		= ARPHRD_PIMREG;
	dev->mtu		= 1500 - sizeof(struct ipv6hdr) - 8;
	dev->flags		= IFF_NOARP;
	dev->netdev_ops		= &reg_vif_netdev_ops;
	dev->destructor		= free_netdev;
}
static struct net_device *ip6mr_reg_vif(struct net *net)
{
	struct net_device *dev;

	dev = alloc_netdev(0, "pim6reg", reg_vif_setup);
	if (dev == NULL)
		return NULL;

	dev_net_set(dev, net);

	if (register_netdevice(dev)) {
		free_netdev(dev);
		return NULL;
	}
	dev->iflink = 0;

	if (dev_open(dev))
		goto failure;

	dev_hold(dev);
	return dev;

failure:
	/* allow the register to be completed before unregistering. */
	rtnl_unlock();
	rtnl_lock();

	unregister_netdevice(dev);
	return NULL;
}
#endif
static int mif6_delete(struct net *net, int vifi)
{
	struct mif_device *v;
	struct net_device *dev;
	if (vifi < 0 || vifi >= net->ipv6.maxvif)
		return -EADDRNOTAVAIL;

	v = &net->ipv6.vif6_table[vifi];

	write_lock_bh(&mrt_lock);
	dev = v->dev;
	v->dev = NULL;

	if (!dev) {
		write_unlock_bh(&mrt_lock);
		return -EADDRNOTAVAIL;
	}

#ifdef CONFIG_IPV6_PIMSM_V2
	if (vifi == net->ipv6.mroute_reg_vif_num)
		net->ipv6.mroute_reg_vif_num = -1;
#endif

	if (vifi + 1 == net->ipv6.maxvif) {
		int tmp;
		for (tmp = vifi - 1; tmp >= 0; tmp--) {
			if (MIF_EXISTS(net, tmp))
				break;
		}
		net->ipv6.maxvif = tmp + 1;
	}

	write_unlock_bh(&mrt_lock);

	dev_set_allmulti(dev, -1);

	if (v->flags & MIFF_REGISTER)
		unregister_netdevice(dev);

	dev_put(dev);
	return 0;
}
static inline void ip6mr_cache_free(struct mfc6_cache *c)
{
	release_net(mfc6_net(c));
	kmem_cache_free(mrt_cachep, c);
}
/* Destroy an unresolved cache entry, killing queued skbs
   and reporting error to netlink readers.
 */

static void ip6mr_destroy_unres(struct mfc6_cache *c)
{
	struct sk_buff *skb;
	struct net *net = mfc6_net(c);

	atomic_dec(&net->ipv6.cache_resolve_queue_len);

	while((skb = skb_dequeue(&c->mfc_un.unres.unresolved)) != NULL) {
		if (ipv6_hdr(skb)->version == 0) {
			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct ipv6hdr));
			nlh->nlmsg_type = NLMSG_ERROR;
			nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
			skb_trim(skb, nlh->nlmsg_len);
			((struct nlmsgerr *)NLMSG_DATA(nlh))->error = -ETIMEDOUT;
			rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
		} else
			kfree_skb(skb);
	}

	ip6mr_cache_free(c);
}
/* Single timer process for all the unresolved queue. */

static void ipmr_do_expire_process(unsigned long dummy)
{
	unsigned long now = jiffies;
	unsigned long expires = 10 * HZ;
	struct mfc6_cache *c, **cp;

	cp = &mfc_unres_queue;

	while ((c = *cp) != NULL) {
		if (time_after(c->mfc_un.unres.expires, now)) {
			/* not yet... */
			unsigned long interval = c->mfc_un.unres.expires - now;
			if (interval < expires)
				expires = interval;
			cp = &c->next;
			continue;
		}

		*cp = c->next;
		ip6mr_destroy_unres(c);
	}

	if (mfc_unres_queue != NULL)
		mod_timer(&ipmr_expire_timer, jiffies + expires);
}
static void ipmr_expire_process(unsigned long dummy)
{
	if (!spin_trylock(&mfc_unres_lock)) {
		mod_timer(&ipmr_expire_timer, jiffies + 1);
		return;
	}

	if (mfc_unres_queue != NULL)
		ipmr_do_expire_process(dummy);

	spin_unlock(&mfc_unres_lock);
}
/* Fill oifs list. It is called under write locked mrt_lock. */

static void ip6mr_update_thresholds(struct mfc6_cache *cache, unsigned char *ttls)
{
	int vifi;
	struct net *net = mfc6_net(cache);

	cache->mfc_un.res.minvif = MAXMIFS;
	cache->mfc_un.res.maxvif = 0;
	memset(cache->mfc_un.res.ttls, 255, MAXMIFS);

	for (vifi = 0; vifi < net->ipv6.maxvif; vifi++) {
		if (MIF_EXISTS(net, vifi) &&
		    ttls[vifi] && ttls[vifi] < 255) {
			cache->mfc_un.res.ttls[vifi] = ttls[vifi];
			if (cache->mfc_un.res.minvif > vifi)
				cache->mfc_un.res.minvif = vifi;
			if (cache->mfc_un.res.maxvif <= vifi)
				cache->mfc_un.res.maxvif = vifi + 1;
		}
	}
}
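
/*
 * Worked example: assuming MIFs 1 and 3 exist and ttls holds
 * { [1] = 1, [3] = 64 } with all other slots 0 or 255, the result is
 * minvif = 1 and maxvif = 4, so the forwarding loops scan slots 1..3
 * only.
 */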
static int mif6_add(struct net *net, struct mif6ctl *vifc, int mrtsock)
{
	int vifi = vifc->mif6c_mifi;
	struct mif_device *v = &net->ipv6.vif6_table[vifi];
	struct net_device *dev;
	int err;

	/* Is vif busy ? */
	if (MIF_EXISTS(net, vifi))
		return -EADDRINUSE;

	switch (vifc->mif6c_flags) {
#ifdef CONFIG_IPV6_PIMSM_V2
	case MIFF_REGISTER:
		/*
		 * Special Purpose VIF in PIM
		 * All the packets will be sent to the daemon
		 */
		if (net->ipv6.mroute_reg_vif_num >= 0)
			return -EADDRINUSE;
		dev = ip6mr_reg_vif(net);
		if (!dev)
			return -ENOBUFS;
		err = dev_set_allmulti(dev, 1);
		if (err) {
			unregister_netdevice(dev);
			dev_put(dev);
			return err;
		}
		break;
#endif
	case 0:
		dev = dev_get_by_index(net, vifc->mif6c_pifi);
		if (!dev)
			return -EADDRNOTAVAIL;
		err = dev_set_allmulti(dev, 1);
		if (err) {
			dev_put(dev);
			return err;
		}
		break;
	default:
		return -EINVAL;
	}

	/*
	 *	Fill in the VIF structures
	 */
	v->rate_limit = vifc->vifc_rate_limit;
	v->flags = vifc->mif6c_flags;
	if (!mrtsock)
		v->flags |= VIFF_STATIC;
	v->threshold = vifc->vifc_threshold;
	v->bytes_in = 0;
	v->bytes_out = 0;
	v->pkt_in = 0;
	v->pkt_out = 0;
	v->link = dev->ifindex;
	if (v->flags & MIFF_REGISTER)
		v->link = dev->iflink;

	/* And finish update writing critical data */
	write_lock_bh(&mrt_lock);
	v->dev = dev;
#ifdef CONFIG_IPV6_PIMSM_V2
	if (v->flags & MIFF_REGISTER)
		net->ipv6.mroute_reg_vif_num = vifi;
#endif
	if (vifi + 1 > net->ipv6.maxvif)
		net->ipv6.maxvif = vifi + 1;
	write_unlock_bh(&mrt_lock);
	return 0;
}
static struct mfc6_cache *ip6mr_cache_find(struct net *net,
					   struct in6_addr *origin,
					   struct in6_addr *mcastgrp)
{
	int line = MFC6_HASH(mcastgrp, origin);
	struct mfc6_cache *c;

	for (c = net->ipv6.mfc6_cache_array[line]; c; c = c->next) {
		if (ipv6_addr_equal(&c->mf6c_origin, origin) &&
		    ipv6_addr_equal(&c->mf6c_mcastgrp, mcastgrp))
			break;
	}
	return c;
}
/*
 *	Allocate a multicast cache entry
 */
static struct mfc6_cache *ip6mr_cache_alloc(struct net *net)
{
	struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
	if (c == NULL)
		return NULL;
	c->mfc_un.res.minvif = MAXMIFS;
	mfc6_net_set(c, net);
	return c;
}
static struct mfc6_cache *ip6mr_cache_alloc_unres(struct net *net)
{
	struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
	if (c == NULL)
		return NULL;
	skb_queue_head_init(&c->mfc_un.unres.unresolved);
	c->mfc_un.unres.expires = jiffies + 10 * HZ;
	mfc6_net_set(c, net);
	return c;
}
/*
 *	A cache entry has gone into a resolved state from queued
 */

static void ip6mr_cache_resolve(struct mfc6_cache *uc, struct mfc6_cache *c)
{
	struct sk_buff *skb;

	/*
	 *	Play the pending entries through our router
	 */

	while((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) {
		if (ipv6_hdr(skb)->version == 0) {
			int err;
			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct ipv6hdr));

			if (ip6mr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
				nlh->nlmsg_len = skb_tail_pointer(skb) - (u8 *)nlh;
			} else {
				nlh->nlmsg_type = NLMSG_ERROR;
				nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
				skb_trim(skb, nlh->nlmsg_len);
				((struct nlmsgerr *)NLMSG_DATA(nlh))->error = -EMSGSIZE;
			}
			err = rtnl_unicast(skb, mfc6_net(uc), NETLINK_CB(skb).pid);
		} else
			ip6_mr_forward(skb, c);
	}
}
/*
 *	Bounce a cache query up to pim6sd. We could use netlink for this but pim6sd
 *	expects the following bizarre scheme.
 *
 *	Called under mrt_lock.
 */
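
/*
 * What the daemon actually reads from the socket is a struct mrt6msg:
 * im6_msgtype (MRT6MSG_NOCACHE, MRT6MSG_WRONGMIF or MRT6MSG_WHOLEPKT),
 * im6_mif, and the im6_src/im6_dst addresses copied from the offending
 * packet. For WHOLEPKT the original packet body follows the header.
 */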
static int ip6mr_cache_report(struct net *net, struct sk_buff *pkt, mifi_t mifi,
			      int assert)
{
	struct sk_buff *skb;
	struct mrt6msg *msg;
	int ret;

#ifdef CONFIG_IPV6_PIMSM_V2
	if (assert == MRT6MSG_WHOLEPKT)
		skb = skb_realloc_headroom(pkt, -skb_network_offset(pkt)
						+sizeof(*msg));
	else
#endif
		skb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(*msg), GFP_ATOMIC);

	if (!skb)
		return -ENOBUFS;

	/* I suppose that internal messages
	 * do not require checksums */

	skb->ip_summed = CHECKSUM_UNNECESSARY;

#ifdef CONFIG_IPV6_PIMSM_V2
	if (assert == MRT6MSG_WHOLEPKT) {
		/* Ugly, but we have no choice with this interface.
		   Duplicate old header, fix length etc.
		   And all this only to mangle msg->im6_msgtype and
		   to set msg->im6_mbz to "mbz" :-)
		 */
		skb_push(skb, -skb_network_offset(pkt));

		skb_push(skb, sizeof(*msg));
		skb_reset_transport_header(skb);
		msg = (struct mrt6msg *)skb_transport_header(skb);
		msg->im6_mbz = 0;
		msg->im6_msgtype = MRT6MSG_WHOLEPKT;
		msg->im6_mif = net->ipv6.mroute_reg_vif_num;
		msg->im6_pad = 0;
		ipv6_addr_copy(&msg->im6_src, &ipv6_hdr(pkt)->saddr);
		ipv6_addr_copy(&msg->im6_dst, &ipv6_hdr(pkt)->daddr);

		skb->ip_summed = CHECKSUM_UNNECESSARY;
	} else
#endif
	{
	/*
	 *	Copy the IP header
	 */
	skb_put(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	skb_copy_to_linear_data(skb, ipv6_hdr(pkt), sizeof(struct ipv6hdr));

	/*
	 *	Add our header
	 */
	skb_put(skb, sizeof(*msg));
	skb_reset_transport_header(skb);
	msg = (struct mrt6msg *)skb_transport_header(skb);

	msg->im6_mbz = 0;
	msg->im6_msgtype = assert;
	msg->im6_mif = mifi;
	msg->im6_pad = 0;
	ipv6_addr_copy(&msg->im6_src, &ipv6_hdr(pkt)->saddr);
	ipv6_addr_copy(&msg->im6_dst, &ipv6_hdr(pkt)->daddr);

	skb->dst = dst_clone(pkt->dst);
	skb->ip_summed = CHECKSUM_UNNECESSARY;

	skb_pull(skb, sizeof(struct ipv6hdr));
	}

	if (net->ipv6.mroute6_sk == NULL) {
		kfree_skb(skb);
		return -EINVAL;
	}

	/*
	 *	Deliver to user space multicast routing algorithms
	 */
	ret = sock_queue_rcv_skb(net->ipv6.mroute6_sk, skb);
	if (ret < 0) {
		if (net_ratelimit())
			printk(KERN_WARNING "mroute6: pending queue full, dropping entries.\n");
		kfree_skb(skb);
	}

	return ret;
}
/*
 *	Queue a packet for resolution. It gets locked cache entry!
 */

static int
ip6mr_cache_unresolved(struct net *net, mifi_t mifi, struct sk_buff *skb)
{
	int err;
	struct mfc6_cache *c;

	spin_lock_bh(&mfc_unres_lock);
	for (c = mfc_unres_queue; c; c = c->next) {
		if (net_eq(mfc6_net(c), net) &&
		    ipv6_addr_equal(&c->mf6c_mcastgrp, &ipv6_hdr(skb)->daddr) &&
		    ipv6_addr_equal(&c->mf6c_origin, &ipv6_hdr(skb)->saddr))
			break;
	}

	if (c == NULL) {
		/*
		 *	Create a new entry if allowable
		 */
		if (atomic_read(&net->ipv6.cache_resolve_queue_len) >= 10 ||
		    (c = ip6mr_cache_alloc_unres(net)) == NULL) {
			spin_unlock_bh(&mfc_unres_lock);

			kfree_skb(skb);
			return -ENOBUFS;
		}

		/*
		 *	Fill in the new cache entry
		 */
		c->mf6c_parent = -1;
		c->mf6c_origin = ipv6_hdr(skb)->saddr;
		c->mf6c_mcastgrp = ipv6_hdr(skb)->daddr;

		/*
		 *	Reflect first query at pim6sd
		 */
		err = ip6mr_cache_report(net, skb, mifi, MRT6MSG_NOCACHE);
		if (err < 0) {
			/* If the report failed throw the cache entry
			   out - Brad Parker
			 */
			spin_unlock_bh(&mfc_unres_lock);

			ip6mr_cache_free(c);
			kfree_skb(skb);
			return err;
		}

		atomic_inc(&net->ipv6.cache_resolve_queue_len);
		c->next = mfc_unres_queue;
		mfc_unres_queue = c;

		ipmr_do_expire_process(1);
	}

	/*
	 *	See if we can append the packet
	 */
	if (c->mfc_un.unres.unresolved.qlen > 3) {
		kfree_skb(skb);
		err = -ENOBUFS;
	} else {
		skb_queue_tail(&c->mfc_un.unres.unresolved, skb);
		err = 0;
	}

	spin_unlock_bh(&mfc_unres_lock);
	return err;
}
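
/*
 * The resolution round trip, for reference: the first packet of an
 * unknown (S,G) lands here, triggers an MRT6MSG_NOCACHE upcall and sits
 * on the unresolved queue (at most 10 entries of up to 4 skbs each). If
 * the daemon answers with MRT6_ADD_MFC before the 10*HZ expiry,
 * ip6mr_cache_resolve() replays the queued packets through the new
 * route; otherwise ipmr_expire_process() drops them, reporting
 * -ETIMEDOUT to any netlink readers.
 */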
/*
 *	MFC6 cache manipulation by user space
 */

static int ip6mr_mfc_delete(struct net *net, struct mf6cctl *mfc)
{
	int line;
	struct mfc6_cache *c, **cp;

	line = MFC6_HASH(&mfc->mf6cc_mcastgrp.sin6_addr, &mfc->mf6cc_origin.sin6_addr);

	for (cp = &net->ipv6.mfc6_cache_array[line];
	     (c = *cp) != NULL; cp = &c->next) {
		if (ipv6_addr_equal(&c->mf6c_origin, &mfc->mf6cc_origin.sin6_addr) &&
		    ipv6_addr_equal(&c->mf6c_mcastgrp, &mfc->mf6cc_mcastgrp.sin6_addr)) {
			write_lock_bh(&mrt_lock);
			*cp = c->next;
			write_unlock_bh(&mrt_lock);

			ip6mr_cache_free(c);
			return 0;
		}
	}
	return -ENOENT;
}
static int ip6mr_device_event(struct notifier_block *this,
			      unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;
	struct net *net = dev_net(dev);
	struct mif_device *v;
	int ct;

	if (event != NETDEV_UNREGISTER)
		return NOTIFY_DONE;

	v = &net->ipv6.vif6_table[0];
	for (ct = 0; ct < net->ipv6.maxvif; ct++, v++) {
		if (v->dev == dev)
			mif6_delete(net, ct);
	}
	return NOTIFY_DONE;
}

static struct notifier_block ip6_mr_notifier = {
	.notifier_call = ip6mr_device_event
};
/*
 *	Setup for IP multicast routing
 */

static int __net_init ip6mr_net_init(struct net *net)
{
	int err = 0;

	net->ipv6.vif6_table = kcalloc(MAXMIFS, sizeof(struct mif_device),
				       GFP_KERNEL);
	if (!net->ipv6.vif6_table) {
		err = -ENOMEM;
		goto fail;
	}

	/* Forwarding cache */
	net->ipv6.mfc6_cache_array = kcalloc(MFC6_LINES,
					     sizeof(struct mfc6_cache *),
					     GFP_KERNEL);
	if (!net->ipv6.mfc6_cache_array) {
		err = -ENOMEM;
		goto fail_mfc6_cache;
	}

#ifdef CONFIG_IPV6_PIMSM_V2
	net->ipv6.mroute_reg_vif_num = -1;
#endif

#ifdef CONFIG_PROC_FS
	err = -ENOMEM;
	if (!proc_net_fops_create(net, "ip6_mr_vif", 0, &ip6mr_vif_fops))
		goto proc_vif_fail;
	if (!proc_net_fops_create(net, "ip6_mr_cache", 0, &ip6mr_mfc_fops))
		goto proc_cache_fail;
#endif
	return 0;

#ifdef CONFIG_PROC_FS
proc_cache_fail:
	proc_net_remove(net, "ip6_mr_vif");
proc_vif_fail:
	kfree(net->ipv6.mfc6_cache_array);
#endif
fail_mfc6_cache:
	kfree(net->ipv6.vif6_table);
fail:
	return err;
}
static void __net_exit ip6mr_net_exit(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_net_remove(net, "ip6_mr_cache");
	proc_net_remove(net, "ip6_mr_vif");
#endif
	mroute_clean_tables(net);
	kfree(net->ipv6.mfc6_cache_array);
	kfree(net->ipv6.vif6_table);
}

static struct pernet_operations ip6mr_net_ops = {
	.init = ip6mr_net_init,
	.exit = ip6mr_net_exit,
};
int __init ip6_mr_init(void)
{
	int err;

	mrt_cachep = kmem_cache_create("ip6_mrt_cache",
				       sizeof(struct mfc6_cache),
				       0, SLAB_HWCACHE_ALIGN,
				       NULL);
	if (!mrt_cachep)
		return -ENOMEM;

	err = register_pernet_subsys(&ip6mr_net_ops);
	if (err)
		goto reg_pernet_fail;

	setup_timer(&ipmr_expire_timer, ipmr_expire_process, 0);
	err = register_netdevice_notifier(&ip6_mr_notifier);
	if (err)
		goto reg_notif_fail;
	return 0;
reg_notif_fail:
	del_timer(&ipmr_expire_timer);
	unregister_pernet_subsys(&ip6mr_net_ops);
reg_pernet_fail:
	kmem_cache_destroy(mrt_cachep);
	return err;
}
void ip6_mr_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_mr_notifier);
	del_timer(&ipmr_expire_timer);
	unregister_pernet_subsys(&ip6mr_net_ops);
	kmem_cache_destroy(mrt_cachep);
}
static int ip6mr_mfc_add(struct net *net, struct mf6cctl *mfc, int mrtsock)
{
	int line;
	struct mfc6_cache *uc, *c, **cp;
	unsigned char ttls[MAXMIFS];
	int i;

	memset(ttls, 255, MAXMIFS);
	for (i = 0; i < MAXMIFS; i++) {
		if (IF_ISSET(i, &mfc->mf6cc_ifset))
			ttls[i] = 1;
	}

	line = MFC6_HASH(&mfc->mf6cc_mcastgrp.sin6_addr, &mfc->mf6cc_origin.sin6_addr);

	for (cp = &net->ipv6.mfc6_cache_array[line];
	     (c = *cp) != NULL; cp = &c->next) {
		if (ipv6_addr_equal(&c->mf6c_origin, &mfc->mf6cc_origin.sin6_addr) &&
		    ipv6_addr_equal(&c->mf6c_mcastgrp, &mfc->mf6cc_mcastgrp.sin6_addr))
			break;
	}

	if (c != NULL) {
		write_lock_bh(&mrt_lock);
		c->mf6c_parent = mfc->mf6cc_parent;
		ip6mr_update_thresholds(c, ttls);
		if (!mrtsock)
			c->mfc_flags |= MFC_STATIC;
		write_unlock_bh(&mrt_lock);
		return 0;
	}

	if (!ipv6_addr_is_multicast(&mfc->mf6cc_mcastgrp.sin6_addr))
		return -EINVAL;

	c = ip6mr_cache_alloc(net);
	if (c == NULL)
		return -ENOMEM;

	c->mf6c_origin = mfc->mf6cc_origin.sin6_addr;
	c->mf6c_mcastgrp = mfc->mf6cc_mcastgrp.sin6_addr;
	c->mf6c_parent = mfc->mf6cc_parent;
	ip6mr_update_thresholds(c, ttls);
	if (!mrtsock)
		c->mfc_flags |= MFC_STATIC;

	write_lock_bh(&mrt_lock);
	c->next = net->ipv6.mfc6_cache_array[line];
	net->ipv6.mfc6_cache_array[line] = c;
	write_unlock_bh(&mrt_lock);

	/*
	 *	Check to see if we resolved a queued list. If so we
	 *	need to send on the frames and tidy up.
	 */
	spin_lock_bh(&mfc_unres_lock);
	for (cp = &mfc_unres_queue; (uc = *cp) != NULL;
	     cp = &uc->next) {
		if (net_eq(mfc6_net(uc), net) &&
		    ipv6_addr_equal(&uc->mf6c_origin, &c->mf6c_origin) &&
		    ipv6_addr_equal(&uc->mf6c_mcastgrp, &c->mf6c_mcastgrp)) {
			*cp = uc->next;
			atomic_dec(&net->ipv6.cache_resolve_queue_len);
			break;
		}
	}
	if (mfc_unres_queue == NULL)
		del_timer(&ipmr_expire_timer);
	spin_unlock_bh(&mfc_unres_lock);

	if (uc) {
		ip6mr_cache_resolve(uc, c);
		ip6mr_cache_free(uc);
	}
	return 0;
}
/*
 *	Close the multicast socket, and clear the vif tables etc
 */

static void mroute_clean_tables(struct net *net)
{
	int i;

	/*
	 *	Shut down all active vif entries
	 */
	for (i = 0; i < net->ipv6.maxvif; i++) {
		if (!(net->ipv6.vif6_table[i].flags & VIFF_STATIC))
			mif6_delete(net, i);
	}

	/*
	 *	Wipe the cache
	 */
	for (i = 0; i < MFC6_LINES; i++) {
		struct mfc6_cache *c, **cp;

		cp = &net->ipv6.mfc6_cache_array[i];
		while ((c = *cp) != NULL) {
			if (c->mfc_flags & MFC_STATIC) {
				cp = &c->next;
				continue;
			}
			write_lock_bh(&mrt_lock);
			*cp = c->next;
			write_unlock_bh(&mrt_lock);

			ip6mr_cache_free(c);
		}
	}

	if (atomic_read(&net->ipv6.cache_resolve_queue_len) != 0) {
		struct mfc6_cache *c, **cp;

		spin_lock_bh(&mfc_unres_lock);
		cp = &mfc_unres_queue;
		while ((c = *cp) != NULL) {
			if (!net_eq(mfc6_net(c), net)) {
				cp = &c->next;
				continue;
			}
			*cp = c->next;
			ip6mr_destroy_unres(c);
		}
		spin_unlock_bh(&mfc_unres_lock);
	}
}
static int ip6mr_sk_init(struct sock *sk)
{
	int err = 0;
	struct net *net = sock_net(sk);

	rtnl_lock();
	write_lock_bh(&mrt_lock);
	if (likely(net->ipv6.mroute6_sk == NULL))
		net->ipv6.mroute6_sk = sk;
	else
		err = -EADDRINUSE;
	write_unlock_bh(&mrt_lock);

	rtnl_unlock();

	return err;
}
int ip6mr_sk_done(struct sock *sk)
{
	int err = 0;
	struct net *net = sock_net(sk);

	rtnl_lock();
	if (sk == net->ipv6.mroute6_sk) {
		write_lock_bh(&mrt_lock);
		net->ipv6.mroute6_sk = NULL;
		write_unlock_bh(&mrt_lock);

		mroute_clean_tables(net);
	} else
		err = -EACCES;
	rtnl_unlock();

	return err;
}
/*
 *	Socket options and virtual interface manipulation. The whole
 *	virtual interface system is a complete heap, but unfortunately
 *	that's how BSD mrouted happens to think. Maybe one day with a proper
 *	MOSPF/PIM router set up we can clean this up.
 */
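
/*
 * A minimal sketch (userspace, illustrative only) of how a daemon such
 * as pim6sd is expected to drive this interface; error handling and
 * the MIF/MFC bookkeeping a real daemon needs are omitted:
 *
 *	int s = socket(AF_INET6, SOCK_RAW, IPPROTO_ICMPV6);
 *	int on = 1;
 *	struct mif6ctl mc = {
 *		.mif6c_mifi = 0,
 *		.mif6c_pifi = if_nametoindex("eth0"),
 *	};
 *
 *	setsockopt(s, IPPROTO_IPV6, MRT6_INIT, &on, sizeof(on));
 *	setsockopt(s, IPPROTO_IPV6, MRT6_ADD_MIF, &mc, sizeof(mc));
 *	... read(s, ...) for struct mrt6msg upcalls, answer each
 *	    MRT6MSG_NOCACHE with MRT6_ADD_MFC ...
 *	setsockopt(s, IPPROTO_IPV6, MRT6_DONE, NULL, 0);
 */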
int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int optlen)
{
	int ret;
	struct mif6ctl vif;
	struct mf6cctl mfc;
	mifi_t mifi;
	struct net *net = sock_net(sk);

	if (optname != MRT6_INIT) {
		if (sk != net->ipv6.mroute6_sk && !capable(CAP_NET_ADMIN))
			return -EACCES;
	}

	switch (optname) {
	case MRT6_INIT:
		if (sk->sk_type != SOCK_RAW ||
		    inet_sk(sk)->num != IPPROTO_ICMPV6)
			return -EOPNOTSUPP;
		if (optlen < sizeof(int))
			return -EINVAL;

		return ip6mr_sk_init(sk);

	case MRT6_DONE:
		return ip6mr_sk_done(sk);

	case MRT6_ADD_MIF:
		if (optlen < sizeof(vif))
			return -EINVAL;
		if (copy_from_user(&vif, optval, sizeof(vif)))
			return -EFAULT;
		if (vif.mif6c_mifi >= MAXMIFS)
			return -ENFILE;
		rtnl_lock();
		ret = mif6_add(net, &vif, sk == net->ipv6.mroute6_sk);
		rtnl_unlock();
		return ret;

	case MRT6_DEL_MIF:
		if (optlen < sizeof(mifi_t))
			return -EINVAL;
		if (copy_from_user(&mifi, optval, sizeof(mifi_t)))
			return -EFAULT;
		rtnl_lock();
		ret = mif6_delete(net, mifi);
		rtnl_unlock();
		return ret;

	/*
	 *	Manipulate the forwarding caches. These live
	 *	in a sort of kernel/user symbiosis.
	 */
	case MRT6_ADD_MFC:
	case MRT6_DEL_MFC:
		if (optlen < sizeof(mfc))
			return -EINVAL;
		if (copy_from_user(&mfc, optval, sizeof(mfc)))
			return -EFAULT;
		rtnl_lock();
		if (optname == MRT6_DEL_MFC)
			ret = ip6mr_mfc_delete(net, &mfc);
		else
			ret = ip6mr_mfc_add(net, &mfc,
					    sk == net->ipv6.mroute6_sk);
		rtnl_unlock();
		return ret;

	/*
	 *	Control PIM assert (to activate pim will activate assert)
	 */
	case MRT6_ASSERT:
	{
		int v;
		if (get_user(v, (int __user *)optval))
			return -EFAULT;
		net->ipv6.mroute_do_assert = !!v;
		return 0;
	}

#ifdef CONFIG_IPV6_PIMSM_V2
	case MRT6_PIM:
	{
		int v;
		if (get_user(v, (int __user *)optval))
			return -EFAULT;
		v = !!v;
		rtnl_lock();
		ret = 0;
		if (v != net->ipv6.mroute_do_pim) {
			net->ipv6.mroute_do_pim = v;
			net->ipv6.mroute_do_assert = v;
			if (net->ipv6.mroute_do_pim)
				ret = inet6_add_protocol(&pim6_protocol,
							 IPPROTO_PIM);
			else
				ret = inet6_del_protocol(&pim6_protocol,
							 IPPROTO_PIM);
			if (ret < 0)
				ret = -EAGAIN;
		}
		rtnl_unlock();
		return ret;
	}
#endif
	/*
	 *	Spurious command, or MRT6_VERSION which you cannot
	 *	set.
	 */
	default:
		return -ENOPROTOOPT;
	}
}
/*
 *	Getsockopt support for the multicast routing system.
 */

int ip6_mroute_getsockopt(struct sock *sk, int optname, char __user *optval,
			  int __user *optlen)
{
	int olr;
	int val;
	struct net *net = sock_net(sk);

	switch (optname) {
	case MRT6_VERSION:
		val = 0x0305;
		break;
#ifdef CONFIG_IPV6_PIMSM_V2
	case MRT6_PIM:
		val = net->ipv6.mroute_do_pim;
		break;
#endif
	case MRT6_ASSERT:
		val = net->ipv6.mroute_do_assert;
		break;
	default:
		return -ENOPROTOOPT;
	}

	if (get_user(olr, optlen))
		return -EFAULT;

	olr = min_t(int, olr, sizeof(int));
	if (olr < 0)
		return -EINVAL;

	if (put_user(olr, optlen))
		return -EFAULT;
	if (copy_to_user(optval, &val, olr))
		return -EFAULT;
	return 0;
}
/*
 *	The IP multicast ioctl support routines.
 */

int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg)
{
	struct sioc_sg_req6 sr;
	struct sioc_mif_req6 vr;
	struct mif_device *vif;
	struct mfc6_cache *c;
	struct net *net = sock_net(sk);

	switch (cmd) {
	case SIOCGETMIFCNT_IN6:
		if (copy_from_user(&vr, arg, sizeof(vr)))
			return -EFAULT;
		if (vr.mifi >= net->ipv6.maxvif)
			return -EINVAL;
		read_lock(&mrt_lock);
		vif = &net->ipv6.vif6_table[vr.mifi];
		if (MIF_EXISTS(net, vr.mifi)) {
			vr.icount = vif->pkt_in;
			vr.ocount = vif->pkt_out;
			vr.ibytes = vif->bytes_in;
			vr.obytes = vif->bytes_out;
			read_unlock(&mrt_lock);

			if (copy_to_user(arg, &vr, sizeof(vr)))
				return -EFAULT;
			return 0;
		}
		read_unlock(&mrt_lock);
		return -EADDRNOTAVAIL;
	case SIOCGETSGCNT_IN6:
		if (copy_from_user(&sr, arg, sizeof(sr)))
			return -EFAULT;

		read_lock(&mrt_lock);
		c = ip6mr_cache_find(net, &sr.src.sin6_addr, &sr.grp.sin6_addr);
		if (c) {
			sr.pktcnt = c->mfc_un.res.pkt;
			sr.bytecnt = c->mfc_un.res.bytes;
			sr.wrong_if = c->mfc_un.res.wrong_if;
			read_unlock(&mrt_lock);

			if (copy_to_user(arg, &sr, sizeof(sr)))
				return -EFAULT;
			return 0;
		}
		read_unlock(&mrt_lock);
		return -EADDRNOTAVAIL;
	default:
		return -ENOIOCTLCMD;
	}
}
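
/*
 * Illustrative only: fetching (S,G) counters from userspace boils down
 * to filling in the addresses and letting the kernel complete the rest
 * (src and grp below are assumed struct in6_addr values):
 *
 *	struct sioc_sg_req6 sr = {
 *		.src.sin6_addr = src,
 *		.grp.sin6_addr = grp,
 *	};
 *	if (ioctl(s, SIOCGETSGCNT_IN6, &sr) == 0)
 *		printf("pkts=%lu bytes=%lu wrong_if=%lu\n",
 *		       sr.pktcnt, sr.bytecnt, sr.wrong_if);
 */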
static inline int ip6mr_forward2_finish(struct sk_buff *skb)
{
	IP6_INC_STATS_BH(dev_net(skb->dst->dev), ip6_dst_idev(skb->dst),
			 IPSTATS_MIB_OUTFORWDATAGRAMS);
	return dst_output(skb);
}
/*
 *	Processing handlers for ip6mr_forward
 */

static int ip6mr_forward2(struct sk_buff *skb, struct mfc6_cache *c, int vifi)
{
	struct ipv6hdr *ipv6h;
	struct net *net = mfc6_net(c);
	struct mif_device *vif = &net->ipv6.vif6_table[vifi];
	struct net_device *dev;
	struct dst_entry *dst;
	struct flowi fl;

	if (vif->dev == NULL)
		goto out_free;

#ifdef CONFIG_IPV6_PIMSM_V2
	if (vif->flags & MIFF_REGISTER) {
		vif->pkt_out++;
		vif->bytes_out += skb->len;
		vif->dev->stats.tx_bytes += skb->len;
		vif->dev->stats.tx_packets++;
		ip6mr_cache_report(net, skb, vifi, MRT6MSG_WHOLEPKT);
		kfree_skb(skb);
		return 0;
	}
#endif

	ipv6h = ipv6_hdr(skb);

	fl = (struct flowi) {
		.oif = vif->link,
		.nl_u = { .ip6_u =
				{ .daddr = ipv6h->daddr, }
		}
	};

	dst = ip6_route_output(net, NULL, &fl);
	if (!dst)
		goto out_free;

	dst_release(skb->dst);
	skb->dst = dst;

	/*
	 * RFC 1584 teaches that a DVMRP/PIM router must deliver packets
	 * locally not only before forwarding, but after forwarding on all
	 * output interfaces: if the mrouter runs a multicast program, it
	 * should receive packets regardless of the interface on which the
	 * program joined. If we did not do this, the program would have to
	 * join on all interfaces. On the other hand, a multihoming host
	 * (or router, but not mrouter) cannot join on more than one
	 * interface - it would result in receiving multiple packets.
	 */
	dev = vif->dev;
	skb->dev = dev;
	vif->pkt_out++;
	vif->bytes_out += skb->len;

	/* We are about to write */
	/* XXX: extension headers? */
	if (skb_cow(skb, sizeof(*ipv6h) + LL_RESERVED_SPACE(dev)))
		goto out_free;

	ipv6h = ipv6_hdr(skb);
	ipv6h->hop_limit--;

	IP6CB(skb)->flags |= IP6SKB_FORWARDED;

	return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dev,
		       ip6mr_forward2_finish);

out_free:
	kfree_skb(skb);
	return 0;
}
static int ip6mr_find_vif(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	int ct;
	for (ct = net->ipv6.maxvif - 1; ct >= 0; ct--) {
		if (net->ipv6.vif6_table[ct].dev == dev)
			break;
	}
	return ct;
}
static int ip6_mr_forward(struct sk_buff *skb, struct mfc6_cache *cache)
{
	int psend = -1;
	int vif, ct;
	struct net *net = mfc6_net(cache);

	vif = cache->mf6c_parent;
	cache->mfc_un.res.pkt++;
	cache->mfc_un.res.bytes += skb->len;

	/*
	 * Wrong interface: drop packet and (maybe) send PIM assert.
	 */
	if (net->ipv6.vif6_table[vif].dev != skb->dev) {
		int true_vifi;

		cache->mfc_un.res.wrong_if++;
		true_vifi = ip6mr_find_vif(skb->dev);

		if (true_vifi >= 0 && net->ipv6.mroute_do_assert &&
		    /* pimsm uses asserts, when switching from RPT to SPT,
		       so that we cannot check that packet arrived on an oif.
		       It is bad, but otherwise we would need to move pretty
		       large chunk of pimd to kernel. Ough... --ANK
		     */
		    (net->ipv6.mroute_do_pim ||
		     cache->mfc_un.res.ttls[true_vifi] < 255) &&
		    time_after(jiffies,
			       cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
			cache->mfc_un.res.last_assert = jiffies;
			ip6mr_cache_report(net, skb, true_vifi, MRT6MSG_WRONGMIF);
		}
		goto dont_forward;
	}

	net->ipv6.vif6_table[vif].pkt_in++;
	net->ipv6.vif6_table[vif].bytes_in += skb->len;
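
	/*
	 * Forward the frame: for every oif whose ttl threshold the packet
	 * clears, clone the skb; the original is consumed by the final
	 * ip6mr_forward2() call, so the common single-oif case avoids a
	 * copy.
	 */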
	for (ct = cache->mfc_un.res.maxvif - 1; ct >= cache->mfc_un.res.minvif; ct--) {
		if (ipv6_hdr(skb)->hop_limit > cache->mfc_un.res.ttls[ct]) {
			if (psend != -1) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					ip6mr_forward2(skb2, cache, psend);
			}
			psend = ct;
		}
	}
	if (psend != -1) {
		ip6mr_forward2(skb, cache, psend);
		return 0;
	}

dont_forward:
	kfree_skb(skb);
	return 0;
}
/*
 *	Multicast packets for forwarding arrive here
 */

int ip6_mr_input(struct sk_buff *skb)
{
	struct mfc6_cache *cache;
	struct net *net = dev_net(skb->dev);

	read_lock(&mrt_lock);
	cache = ip6mr_cache_find(net,
				 &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr);

	/*
	 *	No usable cache entry
	 */
	if (cache == NULL) {
		int vif;

		vif = ip6mr_find_vif(skb->dev);
		if (vif >= 0) {
			int err = ip6mr_cache_unresolved(net, vif, skb);
			read_unlock(&mrt_lock);

			return err;
		}
		read_unlock(&mrt_lock);
		kfree_skb(skb);
		return -ENODEV;
	}

	ip6_mr_forward(skb, cache);

	read_unlock(&mrt_lock);

	return 0;
}
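
/*
 * Report a resolved (S,G) to rtnetlink readers: the incoming interface
 * goes out as RTA_IIF and each forwarding target becomes an
 * RTA_MULTIPATH nexthop whose rtnh_hops carries the TTL threshold.
 */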
static int
ip6mr_fill_mroute(struct sk_buff *skb, struct mfc6_cache *c, struct rtmsg *rtm)
{
	int ct;
	struct rtnexthop *nhp;
	struct net *net = mfc6_net(c);
	struct net_device *dev = net->ipv6.vif6_table[c->mf6c_parent].dev;
	u8 *b = skb_tail_pointer(skb);
	struct rtattr *mp_head;

	if (dev)
		RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);

	mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0));

	for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
		if (c->mfc_un.res.ttls[ct] < 255) {
			if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
				goto rtattr_failure;
			nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
			nhp->rtnh_flags = 0;
			nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
			nhp->rtnh_ifindex = net->ipv6.vif6_table[ct].dev->ifindex;
			nhp->rtnh_len = sizeof(*nhp);
		}
	}
	mp_head->rta_type = RTA_MULTIPATH;
	mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head;
	rtm->rtm_type = RTN_MULTICAST;
	return 1;

rtattr_failure:
	nlmsg_trim(skb, b);
	return -EMSGSIZE;
}
int ip6mr_get_route(struct net *net,
		    struct sk_buff *skb, struct rtmsg *rtm, int nowait)
{
	int err;
	struct mfc6_cache *cache;
	struct rt6_info *rt = (struct rt6_info *)skb->dst;

	read_lock(&mrt_lock);
	cache = ip6mr_cache_find(net, &rt->rt6i_src.addr, &rt->rt6i_dst.addr);

	if (!cache) {
		struct sk_buff *skb2;
		struct ipv6hdr *iph;
		struct net_device *dev;
		int vif;

		if (nowait) {
			read_unlock(&mrt_lock);
			return -EAGAIN;
		}

		dev = skb->dev;
		if (dev == NULL || (vif = ip6mr_find_vif(dev)) < 0) {
			read_unlock(&mrt_lock);
			return -ENODEV;
		}

		/* really correct? */
		skb2 = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC);
		if (!skb2) {
			read_unlock(&mrt_lock);
			return -ENOMEM;
		}

		skb_reset_transport_header(skb2);

		skb_put(skb2, sizeof(struct ipv6hdr));
		skb_reset_network_header(skb2);

		iph = ipv6_hdr(skb2);
		iph->version = 0;
		iph->priority = 0;
		iph->flow_lbl[0] = 0;
		iph->flow_lbl[1] = 0;
		iph->flow_lbl[2] = 0;
		iph->payload_len = 0;
		iph->nexthdr = IPPROTO_NONE;
		iph->hop_limit = 0;
		ipv6_addr_copy(&iph->saddr, &rt->rt6i_src.addr);
		ipv6_addr_copy(&iph->daddr, &rt->rt6i_dst.addr);

		err = ip6mr_cache_unresolved(net, vif, skb2);
		read_unlock(&mrt_lock);

		return err;
	}

	if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
		cache->mfc_flags |= MFC_NOTIFY;

	err = ip6mr_fill_mroute(skb, cache, rtm);
	read_unlock(&mrt_lock);
	return err;
}