1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * net/sched/sch_api.c Packet scheduler API.
5 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
9 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
10 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
11 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/string.h>
18 #include <linux/errno.h>
19 #include <linux/skbuff.h>
20 #include <linux/init.h>
21 #include <linux/proc_fs.h>
22 #include <linux/seq_file.h>
23 #include <linux/kmod.h>
24 #include <linux/list.h>
25 #include <linux/hrtimer.h>
26 #include <linux/slab.h>
27 #include <linux/hashtable.h>
29 #include <net/net_namespace.h>
31 #include <net/netlink.h>
32 #include <net/pkt_sched.h>
33 #include <net/pkt_cls.h>
35 #include <trace/events/qdisc.h>
42 This file consists of two interrelated parts:
44 1. the queueing discipline manager frontend.
45 2. the traffic class manager frontend.
47 Generally, a queueing discipline ("qdisc") is a black box,
48 which is able to enqueue packets and to dequeue them (when the
49 device is ready to send something) in the order and at the times
50 determined by the algorithm hidden in it.
52 qdiscs are divided into two categories:
53 - "queues", which have no internal structure visible from outside.
54 - "schedulers", which split all the packets to "traffic classes",
55 using "packet classifiers" (look at cls_api.c)
57 In turn, classes may have child qdiscs (as a rule, queues)
58 attached to them etc. etc. etc.
60 The goal of the routines in this file is to translate
61 the information supplied by the user in the form of handles
62 into a form more intelligible to the kernel, to perform some sanity
63 checks and the parts of the work common to all qdiscs,
64 and to provide rtnetlink notifications.
66 All real intelligent work is done inside qdisc modules.
70 Every discipline has two major routines: enqueue and dequeue.
74 dequeue usually returns an skb to send. It is allowed to return NULL,
75 but that does not mean the queue is empty; it just means that the
76 discipline does not want to send anything this time.
77 The queue is really empty if q->q.qlen == 0.
78 For complicated disciplines with multiple queues, q->q is not a
79 real packet queue, but q->q.qlen must still be valid.
83 enqueue returns 0 if the packet was enqueued successfully.
84 If a packet (this one or another one) was dropped, it returns
86 NET_XMIT_DROP - this packet was dropped
87 Expected action: do not back off, but wait until the queue clears.
88 NET_XMIT_CN - this packet was probably enqueued, but another one was dropped.
89 Expected action: back off or ignore
95 like dequeue but without removing a packet from the queue
99 returns the qdisc to its initial state: purges all buffers, clears all
100 timers, counters (except for statistics), etc.
104 initializes a newly created qdisc.
108 destroys resources allocated by init and during the lifetime of the qdisc.
112 changes qdisc parameters.
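
As an illustrative sketch (not part of the original text), a minimal
Qdisc_ops could wire these routines together roughly in the style of the
fifo qdiscs. example_enqueue(), example_init() and example_dump() are
hypothetical names; the helpers (qdisc_enqueue_tail, qdisc_drop,
qdisc_dequeue_head, qdisc_peek_head, qdisc_reset_queue) and the struct
fields are believed to be the real ones from include/net/sch_generic.h:

	static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch,
				   struct sk_buff **to_free)
	{
		// tail-drop once the configured limit is reached
		if (likely(sch->q.qlen < READ_ONCE(sch->limit)))
			return qdisc_enqueue_tail(skb, sch);	// NET_XMIT_SUCCESS
		return qdisc_drop(skb, sch, to_free);		// NET_XMIT_DROP
	}

	static struct Qdisc_ops example_qdisc_ops __read_mostly = {
		.id		= "example",
		.enqueue	= example_enqueue,
		.dequeue	= qdisc_dequeue_head,
		.peek		= qdisc_peek_head,	// dequeue without removing
		.init		= example_init,		// parse netlink options
		.reset		= qdisc_reset_queue,	// purge buffers and timers
		.change		= example_init,		// update parameters
		.dump		= example_dump,		// report configuration
		.owner		= THIS_MODULE,
	};

A module would then call register_qdisc(&example_qdisc_ops) from its
module_init handler and unregister_qdisc() on exit.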
115 /* Protects the list of registered TC modules. It is a pure SMP lock. */
116 static DEFINE_RWLOCK(qdisc_mod_lock);
119 /************************************************
120 * Queueing disciplines manipulation. *
121 ************************************************/
124 /* The list of all installed queueing disciplines. */
126 static struct Qdisc_ops *qdisc_base;
128 /* Register/unregister queueing discipline */
130 int register_qdisc(struct Qdisc_ops *qops)
132 struct Qdisc_ops *q, **qp;
135 write_lock(&qdisc_mod_lock);
136 for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
137 if (!strcmp(qops->id, q->id))
140 if (qops->enqueue == NULL)
141 qops->enqueue = noop_qdisc_ops.enqueue;
142 if (qops->peek == NULL) {
143 if (qops->dequeue == NULL)
144 qops->peek = noop_qdisc_ops.peek;
148 if (qops->dequeue == NULL)
149 qops->dequeue = noop_qdisc_ops.dequeue;
152 const struct Qdisc_class_ops *cops = qops->cl_ops;
154 if (!(cops->find && cops->walk && cops->leaf))
157 if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
165 write_unlock(&qdisc_mod_lock);
172 EXPORT_SYMBOL(register_qdisc);
174 void unregister_qdisc(struct Qdisc_ops *qops)
176 struct Qdisc_ops *q, **qp;
179 write_lock(&qdisc_mod_lock);
180 for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
188 write_unlock(&qdisc_mod_lock);
190 WARN(err, "unregister qdisc(%s) failed\n", qops->id);
192 EXPORT_SYMBOL(unregister_qdisc);
194 /* Get default qdisc if not otherwise specified */
195 void qdisc_get_default(char *name, size_t len)
197 read_lock(&qdisc_mod_lock);
198 strscpy(name, default_qdisc_ops->id, len);
199 read_unlock(&qdisc_mod_lock);
202 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
204 struct Qdisc_ops *q = NULL;
206 for (q = qdisc_base; q; q = q->next) {
207 if (!strcmp(name, q->id)) {
208 if (!try_module_get(q->owner))
217 /* Set new default qdisc to use */
218 int qdisc_set_default(const char *name)
220 const struct Qdisc_ops *ops;
222 if (!capable(CAP_NET_ADMIN))
225 write_lock(&qdisc_mod_lock);
226 ops = qdisc_lookup_default(name);
228 /* Not found, drop lock and try to load module */
229 write_unlock(&qdisc_mod_lock);
230 request_module("sch_%s", name);
231 write_lock(&qdisc_mod_lock);
233 ops = qdisc_lookup_default(name);
237 /* Set new default */
238 module_put(default_qdisc_ops->owner);
239 default_qdisc_ops = ops;
241 write_unlock(&qdisc_mod_lock);
243 return ops ? 0 : -ENOENT;
246 #ifdef CONFIG_NET_SCH_DEFAULT
247 /* Set default value from kernel config */
248 static int __init sch_default_qdisc(void)
250 return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
252 late_initcall(sch_default_qdisc);
255 /* We know the handle. Find the qdisc among all qdiscs attached to the device
256 * (root qdisc, all its children, children of children, etc.)
257 * Note: caller either holds the rtnl lock or rcu_read_lock()
260 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
264 if (!qdisc_dev(root))
265 return (root->handle == handle ? root : NULL);
267 if (!(root->flags & TCQ_F_BUILTIN) &&
268 root->handle == handle)
271 hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle,
272 lockdep_rtnl_is_held()) {
273 if (q->handle == handle)
279 void qdisc_hash_add(struct Qdisc *q, bool invisible)
281 if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
283 hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
285 q->flags |= TCQ_F_INVISIBLE;
288 EXPORT_SYMBOL(qdisc_hash_add);
290 void qdisc_hash_del(struct Qdisc *q)
292 if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
294 hash_del_rcu(&q->hash);
297 EXPORT_SYMBOL(qdisc_hash_del);
299 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
305 q = qdisc_match_from_root(rtnl_dereference(dev->qdisc), handle);
309 if (dev_ingress_queue(dev))
310 q = qdisc_match_from_root(
311 rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping),
317 struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
319 struct netdev_queue *nq;
324 q = qdisc_match_from_root(rcu_dereference(dev->qdisc), handle);
328 nq = dev_ingress_queue_rcu(dev);
330 q = qdisc_match_from_root(rcu_dereference(nq->qdisc_sleeping),
336 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
339 const struct Qdisc_class_ops *cops = p->ops->cl_ops;
343 cl = cops->find(p, classid);
347 return cops->leaf(p, cl);
350 /* Find queueing discipline by name */
352 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
354 struct Qdisc_ops *q = NULL;
357 read_lock(&qdisc_mod_lock);
358 for (q = qdisc_base; q; q = q->next) {
359 if (nla_strcmp(kind, q->id) == 0) {
360 if (!try_module_get(q->owner))
365 read_unlock(&qdisc_mod_lock);
370 /* The linklayer setting was not transferred from iproute2 in older
371 * versions, and the rate table lookup system has been dropped in
372 * the kernel. To stay backward compatible with older iproute2 tc
373 * utils, we detect the linklayer setting by checking whether the rate
374 * table was modified.
376 * For linklayer ATM table entries, the rate table will be aligned to
377 * 48 bytes, thus some table entries will contain the same value. The
378 * mpu (min packet unit) is also encoded into the old rate table, thus
379 * starting from the mpu, we find the low and high table entries for
380 * mapping this cell. If these entries contain the same value, then
381 * the rate table has been modified for linklayer ATM.
383 * This is done by rounding the mpu to the nearest 48-byte cell/entry,
384 * then rounding up to the next cell, calculating the table entry one below,
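 *
 * Worked example (illustrative, with assumed values): for r->mpu = 0 and
 * r->cell_log = 3, low = roundup(0, 48) = 0 and high = roundup(1, 48) = 48,
 * so cell_low = 0 and cell_high = (48 >> 3) - 1 = 5. On an ATM-aligned
 * table every size from 0 to 47 bytes costs exactly one 48-byte cell, so
 * rtab[0] == rtab[5] and we report TC_LINKLAYER_ATM; on a plain Ethernet
 * table those entries differ and we fall back to TC_LINKLAYER_ETHERNET.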
387 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
389 int low = roundup(r->mpu, 48);
390 int high = roundup(low+1, 48);
391 int cell_low = low >> r->cell_log;
392 int cell_high = (high >> r->cell_log) - 1;
394 /* rtab is too inaccurate at rates > 100Mbit/s */
395 if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
396 pr_debug("TC linklayer: Giving up ATM detection\n");
397 return TC_LINKLAYER_ETHERNET;
400 if ((cell_high > cell_low) && (cell_high < 256)
401 && (rtab[cell_low] == rtab[cell_high])) {
402 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
403 cell_low, cell_high, rtab[cell_high]);
404 return TC_LINKLAYER_ATM;
406 return TC_LINKLAYER_ETHERNET;
409 static struct qdisc_rate_table *qdisc_rtab_list;
411 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
413 struct netlink_ext_ack *extack)
415 struct qdisc_rate_table *rtab;
417 if (tab == NULL || r->rate == 0 ||
418 r->cell_log == 0 || r->cell_log >= 32 ||
419 nla_len(tab) != TC_RTAB_SIZE) {
420 NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
424 for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
425 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
426 !memcmp(&rtab->data, nla_data(tab), 1024)) {
432 rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
436 memcpy(rtab->data, nla_data(tab), 1024);
437 if (r->linklayer == TC_LINKLAYER_UNAWARE)
438 r->linklayer = __detect_linklayer(r, rtab->data);
439 rtab->next = qdisc_rtab_list;
440 qdisc_rtab_list = rtab;
442 NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
446 EXPORT_SYMBOL(qdisc_get_rtab);
448 void qdisc_put_rtab(struct qdisc_rate_table *tab)
450 struct qdisc_rate_table *rtab, **rtabp;
452 if (!tab || --tab->refcnt)
455 for (rtabp = &qdisc_rtab_list;
456 (rtab = *rtabp) != NULL;
457 rtabp = &rtab->next) {
465 EXPORT_SYMBOL(qdisc_put_rtab);
467 static LIST_HEAD(qdisc_stab_list);
469 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
470 [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
471 [TCA_STAB_DATA] = { .type = NLA_BINARY },
474 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
475 struct netlink_ext_ack *extack)
477 struct nlattr *tb[TCA_STAB_MAX + 1];
478 struct qdisc_size_table *stab;
479 struct tc_sizespec *s;
480 unsigned int tsize = 0;
484 err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
488 if (!tb[TCA_STAB_BASE]) {
489 NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
490 return ERR_PTR(-EINVAL);
493 s = nla_data(tb[TCA_STAB_BASE]);
496 if (!tb[TCA_STAB_DATA]) {
497 NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
498 return ERR_PTR(-EINVAL);
500 tab = nla_data(tb[TCA_STAB_DATA]);
501 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
504 if (tsize != s->tsize || (!tab && tsize > 0)) {
505 NL_SET_ERR_MSG(extack, "Invalid size of size table");
506 return ERR_PTR(-EINVAL);
509 list_for_each_entry(stab, &qdisc_stab_list, list) {
510 if (memcmp(&stab->szopts, s, sizeof(*s)))
513 memcmp(stab->data, tab, flex_array_size(stab, data, tsize)))
519 if (s->size_log > STAB_SIZE_LOG_MAX ||
520 s->cell_log > STAB_SIZE_LOG_MAX) {
521 NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table");
522 return ERR_PTR(-EINVAL);
525 stab = kmalloc(struct_size(stab, data, tsize), GFP_KERNEL);
527 return ERR_PTR(-ENOMEM);
532 memcpy(stab->data, tab, flex_array_size(stab, data, tsize));
534 list_add_tail(&stab->list, &qdisc_stab_list);
539 void qdisc_put_stab(struct qdisc_size_table *tab)
544 if (--tab->refcnt == 0) {
545 list_del(&tab->list);
549 EXPORT_SYMBOL(qdisc_put_stab);
551 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
555 nest = nla_nest_start_noflag(skb, TCA_STAB);
557 goto nla_put_failure;
558 if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
559 goto nla_put_failure;
560 nla_nest_end(skb, nest);
568 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
569 const struct qdisc_size_table *stab)
573 pkt_len = skb->len + stab->szopts.overhead;
574 if (unlikely(!stab->szopts.tsize))
577 slot = pkt_len + stab->szopts.cell_align;
578 if (unlikely(slot < 0))
581 slot >>= stab->szopts.cell_log;
582 if (likely(slot < stab->szopts.tsize))
583 pkt_len = stab->data[slot];
585 pkt_len = stab->data[stab->szopts.tsize - 1] *
586 (slot / stab->szopts.tsize) +
587 stab->data[slot % stab->szopts.tsize];
589 pkt_len <<= stab->szopts.size_log;
591 if (unlikely(pkt_len < 1))
593 qdisc_skb_cb(skb)->pkt_len = pkt_len;
595 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
597 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
599 if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
600 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
601 txt, qdisc->ops->id, qdisc->handle >> 16);
602 qdisc->flags |= TCQ_F_WARN_NONWC;
605 EXPORT_SYMBOL(qdisc_warn_nonwc);
607 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
609 struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
613 __netif_schedule(qdisc_root(wd->qdisc));
616 return HRTIMER_NORESTART;
619 void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
622 hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
623 wd->timer.function = qdisc_watchdog;
626 EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
628 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
630 qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
632 EXPORT_SYMBOL(qdisc_watchdog_init);
634 void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires,
640 deactivated = test_bit(__QDISC_STATE_DEACTIVATED,
641 &qdisc_root_sleeping(wd->qdisc)->state);
646 if (hrtimer_is_queued(&wd->timer)) {
647 /* If timer is already set in [expires, expires + delta_ns],
648 * do not reprogram it.
650 if (wd->last_expires - expires <= delta_ns)
654 wd->last_expires = expires;
655 hrtimer_start_range_ns(&wd->timer,
656 ns_to_ktime(expires),
658 HRTIMER_MODE_ABS_PINNED);
660 EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns);
662 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
664 hrtimer_cancel(&wd->timer);
666 EXPORT_SYMBOL(qdisc_watchdog_cancel);
668 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
670 struct hlist_head *h;
673 h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
676 for (i = 0; i < n; i++)
677 INIT_HLIST_HEAD(&h[i]);
682 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
684 struct Qdisc_class_common *cl;
685 struct hlist_node *next;
686 struct hlist_head *nhash, *ohash;
687 unsigned int nsize, nmask, osize;
690 /* Rehash when load factor exceeds 0.75 */
691 if (clhash->hashelems * 4 <= clhash->hashsize * 3)
693 nsize = clhash->hashsize * 2;
695 nhash = qdisc_class_hash_alloc(nsize);
699 ohash = clhash->hash;
700 osize = clhash->hashsize;
703 for (i = 0; i < osize; i++) {
704 hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
705 h = qdisc_class_hash(cl->classid, nmask);
706 hlist_add_head(&cl->hnode, &nhash[h]);
709 clhash->hash = nhash;
710 clhash->hashsize = nsize;
711 clhash->hashmask = nmask;
712 sch_tree_unlock(sch);
716 EXPORT_SYMBOL(qdisc_class_hash_grow);
718 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
720 unsigned int size = 4;
722 clhash->hash = qdisc_class_hash_alloc(size);
725 clhash->hashsize = size;
726 clhash->hashmask = size - 1;
727 clhash->hashelems = 0;
730 EXPORT_SYMBOL(qdisc_class_hash_init);
732 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
734 kvfree(clhash->hash);
736 EXPORT_SYMBOL(qdisc_class_hash_destroy);
738 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
739 struct Qdisc_class_common *cl)
743 INIT_HLIST_NODE(&cl->hnode);
744 h = qdisc_class_hash(cl->classid, clhash->hashmask);
745 hlist_add_head(&cl->hnode, &clhash->hash[h]);
748 EXPORT_SYMBOL(qdisc_class_hash_insert);
750 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
751 struct Qdisc_class_common *cl)
753 hlist_del(&cl->hnode);
756 EXPORT_SYMBOL(qdisc_class_hash_remove);
758 /* Allocate a unique handle from the space managed by the kernel.
759 * Possible range is [8000-FFFF]:0000 (0x8000 values)
761 static u32 qdisc_alloc_handle(struct net_device *dev)
764 static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
767 autohandle += TC_H_MAKE(0x10000U, 0);
768 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
769 autohandle = TC_H_MAKE(0x80000000U, 0);
770 if (!qdisc_lookup(dev, autohandle))
778 void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
780 bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
781 const struct Qdisc_class_ops *cops;
787 if (n == 0 && len == 0)
789 drops = max_t(int, n, 0);
791 while ((parentid = sch->parent)) {
792 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
795 if (sch->flags & TCQ_F_NOPARENT)
797 /* Notify parent qdisc only if child qdisc becomes empty.
799 * If the child was empty even before the update, then the backlog
800 * counter is screwed and we skip the notification because the
801 * parent class is already passive.
803 * If the original child was offloaded, then it is allowed
804 * to be seen as empty, so the parent is notified anyway.
806 notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
807 !qdisc_is_offloaded);
808 /* TODO: perform the search on a per txq basis */
809 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
811 WARN_ON_ONCE(parentid != TC_H_ROOT);
814 cops = sch->ops->cl_ops;
815 if (notify && cops->qlen_notify) {
816 cl = cops->find(sch, parentid);
817 cops->qlen_notify(sch, cl);
820 sch->qstats.backlog -= len;
821 __qdisc_qstats_drop(sch, drops);
825 EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
827 int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
830 struct net_device *dev = qdisc_dev(sch);
833 sch->flags &= ~TCQ_F_OFFLOADED;
834 if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
837 err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
838 if (err == -EOPNOTSUPP)
842 sch->flags |= TCQ_F_OFFLOADED;
846 EXPORT_SYMBOL(qdisc_offload_dump_helper);
848 void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
849 struct Qdisc *new, struct Qdisc *old,
850 enum tc_setup_type type, void *type_data,
851 struct netlink_ext_ack *extack)
853 bool any_qdisc_is_offloaded;
856 if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
859 err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
861 /* Don't report an error if the graft is part of a destroy operation. */
862 if (!err || !new || new == &noop_qdisc)
865 /* Don't report error if the parent, the old child and the new
866 * one are not offloaded.
868 any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
869 any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
870 any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;
872 if (any_qdisc_is_offloaded)
873 NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
875 EXPORT_SYMBOL(qdisc_offload_graft_helper);
877 void qdisc_offload_query_caps(struct net_device *dev,
878 enum tc_setup_type type,
879 void *caps, size_t caps_len)
881 const struct net_device_ops *ops = dev->netdev_ops;
882 struct tc_query_caps_base base = {
887 memset(caps, 0, caps_len);
889 if (ops->ndo_setup_tc)
890 ops->ndo_setup_tc(dev, TC_QUERY_CAPS, &base);
892 EXPORT_SYMBOL(qdisc_offload_query_caps);
894 static void qdisc_offload_graft_root(struct net_device *dev,
895 struct Qdisc *new, struct Qdisc *old,
896 struct netlink_ext_ack *extack)
898 struct tc_root_qopt_offload graft_offload = {
899 .command = TC_ROOT_GRAFT,
900 .handle = new ? new->handle : 0,
901 .ingress = (new && new->flags & TCQ_F_INGRESS) ||
902 (old && old->flags & TCQ_F_INGRESS),
905 qdisc_offload_graft_helper(dev, NULL, new, old,
906 TC_SETUP_ROOT_QDISC, &graft_offload, extack);
909 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
910 u32 portid, u32 seq, u16 flags, int event,
911 struct netlink_ext_ack *extack)
913 struct gnet_stats_basic_sync __percpu *cpu_bstats = NULL;
914 struct gnet_stats_queue __percpu *cpu_qstats = NULL;
916 struct nlmsghdr *nlh;
917 unsigned char *b = skb_tail_pointer(skb);
919 struct qdisc_size_table *stab;
924 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
927 tcm = nlmsg_data(nlh);
928 tcm->tcm_family = AF_UNSPEC;
931 tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
932 tcm->tcm_parent = clid;
933 tcm->tcm_handle = q->handle;
934 tcm->tcm_info = refcount_read(&q->refcnt);
935 if (nla_put_string(skb, TCA_KIND, q->ops->id))
936 goto nla_put_failure;
937 if (q->ops->ingress_block_get) {
938 block_index = q->ops->ingress_block_get(q);
940 nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
941 goto nla_put_failure;
943 if (q->ops->egress_block_get) {
944 block_index = q->ops->egress_block_get(q);
946 nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
947 goto nla_put_failure;
949 if (q->ops->dump && q->ops->dump(q, skb) < 0)
950 goto nla_put_failure;
951 if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
952 goto nla_put_failure;
953 qlen = qdisc_qlen_sum(q);
955 stab = rtnl_dereference(q->stab);
956 if (stab && qdisc_dump_stab(skb, stab) < 0)
957 goto nla_put_failure;
959 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
960 NULL, &d, TCA_PAD) < 0)
961 goto nla_put_failure;
963 if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
964 goto nla_put_failure;
966 if (qdisc_is_percpu_stats(q)) {
967 cpu_bstats = q->cpu_bstats;
968 cpu_qstats = q->cpu_qstats;
971 if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats, true) < 0 ||
972 gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
973 gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
974 goto nla_put_failure;
976 if (gnet_stats_finish_copy(&d) < 0)
977 goto nla_put_failure;
979 if (extack && extack->_msg &&
980 nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg))
983 nlh->nlmsg_len = skb_tail_pointer(skb) - b;
993 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
995 if (q->flags & TCQ_F_BUILTIN)
997 if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
1003 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
1004 struct nlmsghdr *n, u32 clid,
1005 struct Qdisc *old, struct Qdisc *new,
1006 struct netlink_ext_ack *extack)
1008 struct sk_buff *skb;
1009 u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1011 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1015 if (old && !tc_qdisc_dump_ignore(old, false)) {
1016 if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
1017 0, RTM_DELQDISC, extack) < 0)
1020 if (new && !tc_qdisc_dump_ignore(new, false)) {
1021 if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
1022 old ? NLM_F_REPLACE : 0, RTM_NEWQDISC, extack) < 0)
1027 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1028 n->nlmsg_flags & NLM_F_ECHO);
1035 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
1036 struct nlmsghdr *n, u32 clid,
1037 struct Qdisc *old, struct Qdisc *new,
1038 struct netlink_ext_ack *extack)
1041 qdisc_notify(net, skb, n, clid, old, new, extack);
1047 static void qdisc_clear_nolock(struct Qdisc *sch)
1049 sch->flags &= ~TCQ_F_NOLOCK;
1050 if (!(sch->flags & TCQ_F_CPUSTATS))
1053 free_percpu(sch->cpu_bstats);
1054 free_percpu(sch->cpu_qstats);
1055 sch->cpu_bstats = NULL;
1056 sch->cpu_qstats = NULL;
1057 sch->flags &= ~TCQ_F_CPUSTATS;
1060 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
1063 * When appropriate, send a netlink notification using 'skb'
1066 * On success, destroy the old qdisc.
1069 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
1070 struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
1071 struct Qdisc *new, struct Qdisc *old,
1072 struct netlink_ext_ack *extack)
1074 struct Qdisc *q = old;
1075 struct net *net = dev_net(dev);
1077 if (parent == NULL) {
1078 unsigned int i, num_q, ingress;
1079 struct netdev_queue *dev_queue;
1082 num_q = dev->num_tx_queues;
1083 if ((q && q->flags & TCQ_F_INGRESS) ||
1084 (new && new->flags & TCQ_F_INGRESS)) {
1086 dev_queue = dev_ingress_queue(dev);
1088 NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
1092 q = rtnl_dereference(dev_queue->qdisc_sleeping);
1094 /* This is the counterpart of that qdisc_refcount_inc_nz() call in
1095 * __tcf_qdisc_find() for filter requests.
1097 if (!qdisc_refcount_dec_if_one(q)) {
1098 NL_SET_ERR_MSG(extack,
1099 "Current ingress or clsact Qdisc has ongoing filter requests");
1104 if (dev->flags & IFF_UP)
1105 dev_deactivate(dev);
1107 qdisc_offload_graft_root(dev, new, old, extack);
1109 if (new && new->ops->attach && !ingress)
1113 for (i = 0; i < num_q; i++) {
1114 dev_queue = netdev_get_tx_queue(dev, i);
1115 old = dev_graft_qdisc(dev_queue, new);
1118 qdisc_refcount_inc(new);
1122 old = dev_graft_qdisc(dev_queue, NULL);
1124 /* {ingress,clsact}_destroy() @old before grafting @new to avoid
1125 * unprotected concurrent accesses to net_device::miniq_{in,e}gress
1126 * pointer(s) in mini_qdisc_pair_swap().
1128 qdisc_notify(net, skb, n, classid, old, new, extack);
1131 dev_graft_qdisc(dev_queue, new);
1136 old = rtnl_dereference(dev->qdisc);
1137 if (new && !new->ops->attach)
1138 qdisc_refcount_inc(new);
1139 rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc);
1141 notify_and_destroy(net, skb, n, classid, old, new, extack);
1143 if (new && new->ops->attach)
1144 new->ops->attach(new);
1147 if (dev->flags & IFF_UP)
1150 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
1154 /* Only support running class lockless if parent is lockless */
1155 if (new && (new->flags & TCQ_F_NOLOCK) && !(parent->flags & TCQ_F_NOLOCK))
1156 qdisc_clear_nolock(new);
1158 if (!cops || !cops->graft)
1161 cl = cops->find(parent, classid);
1163 NL_SET_ERR_MSG(extack, "Specified class not found");
1167 if (new && new->ops == &noqueue_qdisc_ops) {
1168 NL_SET_ERR_MSG(extack, "Cannot assign noqueue to a class");
1172 err = cops->graft(parent, cl, new, &old, extack);
1175 notify_and_destroy(net, skb, n, classid, old, new, extack);
1180 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1181 struct netlink_ext_ack *extack)
1185 if (tca[TCA_INGRESS_BLOCK]) {
1186 block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1189 NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1192 if (!sch->ops->ingress_block_set) {
1193 NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1196 sch->ops->ingress_block_set(sch, block_index);
1198 if (tca[TCA_EGRESS_BLOCK]) {
1199 block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1202 NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1205 if (!sch->ops->egress_block_set) {
1206 NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1209 sch->ops->egress_block_set(sch, block_index);
1215 Allocate and initialize a new qdisc.
1217 Parameters are passed via opt.
1220 static struct Qdisc *qdisc_create(struct net_device *dev,
1221 struct netdev_queue *dev_queue,
1222 u32 parent, u32 handle,
1223 struct nlattr **tca, int *errp,
1224 struct netlink_ext_ack *extack)
1227 struct nlattr *kind = tca[TCA_KIND];
1229 struct Qdisc_ops *ops;
1230 struct qdisc_size_table *stab;
1232 ops = qdisc_lookup_ops(kind);
1233 #ifdef CONFIG_MODULES
1234 if (ops == NULL && kind != NULL) {
1235 char name[IFNAMSIZ];
1236 if (nla_strscpy(name, kind, IFNAMSIZ) >= 0) {
1237 /* We dropped the RTNL semaphore in order to
1238 * perform the module load. So, even if we
1239 * succeeded in loading the module we have to
1240 * tell the caller to replay the request. We
1241 * indicate this using -EAGAIN.
1242 * We replay the request because the device may
1243 * go away in the meantime.
1246 request_module("sch_%s", name);
1248 ops = qdisc_lookup_ops(kind);
1250 /* We will try qdisc_lookup_ops again,
1251 * so don't keep a reference.
1253 module_put(ops->owner);
1263 NL_SET_ERR_MSG(extack, "Specified qdisc kind is unknown");
1267 sch = qdisc_alloc(dev_queue, ops, extack);
1273 sch->parent = parent;
1275 if (handle == TC_H_INGRESS) {
1276 if (!(sch->flags & TCQ_F_INGRESS)) {
1277 NL_SET_ERR_MSG(extack,
1278 "Specified parent ID is reserved for ingress and clsact Qdiscs");
1282 handle = TC_H_MAKE(TC_H_INGRESS, 0);
1285 handle = qdisc_alloc_handle(dev);
1287 NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
1292 if (!netif_is_multiqueue(dev))
1293 sch->flags |= TCQ_F_ONETXQUEUE;
1296 sch->handle = handle;
1298 /* This exists to stay backward compatible with a userspace
1299 * loophole that allowed userspace to get the IFF_NO_QUEUE
1300 * facility on older kernels by setting tx_queue_len=0 (prior
1301 * to qdisc init) and then forgetting to reinit tx_queue_len
1302 * before attaching a qdisc again.
1304 if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
1305 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
1306 netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
1309 err = qdisc_block_indexes_set(sch, tca, extack);
1314 err = ops->init(sch, tca[TCA_OPTIONS], extack);
1319 if (tca[TCA_STAB]) {
1320 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1322 err = PTR_ERR(stab);
1325 rcu_assign_pointer(sch->stab, stab);
1327 if (tca[TCA_RATE]) {
1329 if (sch->flags & TCQ_F_MQROOT) {
1330 NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
1334 err = gen_new_estimator(&sch->bstats,
1341 NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
1346 qdisc_hash_add(sch, false);
1347 trace_qdisc_create(ops, dev, parent);
1352 /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
1356 netdev_put(dev, &sch->dev_tracker);
1359 module_put(ops->owner);
1366 * Any broken qdiscs that would require an ops->reset() here?
1367 * The qdisc was never in action so it shouldn't be necessary.
1369 qdisc_put_stab(rtnl_dereference(sch->stab));
1375 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
1376 struct netlink_ext_ack *extack)
1378 struct qdisc_size_table *ostab, *stab = NULL;
1381 if (tca[TCA_OPTIONS]) {
1382 if (!sch->ops->change) {
1383 NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
1386 if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1387 NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
1390 err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
1395 if (tca[TCA_STAB]) {
1396 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1398 return PTR_ERR(stab);
1401 ostab = rtnl_dereference(sch->stab);
1402 rcu_assign_pointer(sch->stab, stab);
1403 qdisc_put_stab(ostab);
1405 if (tca[TCA_RATE]) {
1406 /* NB: ignores errors from replace_estimator
1407 because change can't be undone. */
1408 if (sch->flags & TCQ_F_MQROOT)
1410 gen_replace_estimator(&sch->bstats,
1421 struct check_loop_arg {
1422 struct qdisc_walker w;
1427 static int check_loop_fn(struct Qdisc *q, unsigned long cl,
1428 struct qdisc_walker *w);
1430 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1432 struct check_loop_arg arg;
1434 if (q->ops->cl_ops == NULL)
1437 arg.w.stop = arg.w.skip = arg.w.count = 0;
1438 arg.w.fn = check_loop_fn;
1441 q->ops->cl_ops->walk(q, &arg.w);
1442 return arg.w.stop ? -ELOOP : 0;
1446 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1449 const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1450 struct check_loop_arg *arg = (struct check_loop_arg *)w;
1452 leaf = cops->leaf(q, cl);
1454 if (leaf == arg->p || arg->depth > 7)
1456 return check_loop(leaf, arg->p, arg->depth + 1);
1461 const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
1462 [TCA_KIND] = { .type = NLA_STRING },
1463 [TCA_RATE] = { .type = NLA_BINARY,
1464 .len = sizeof(struct tc_estimator) },
1465 [TCA_STAB] = { .type = NLA_NESTED },
1466 [TCA_DUMP_INVISIBLE] = { .type = NLA_FLAG },
1467 [TCA_CHAIN] = { .type = NLA_U32 },
1468 [TCA_INGRESS_BLOCK] = { .type = NLA_U32 },
1469 [TCA_EGRESS_BLOCK] = { .type = NLA_U32 },
1476 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1477 struct netlink_ext_ack *extack)
1479 struct net *net = sock_net(skb->sk);
1480 struct tcmsg *tcm = nlmsg_data(n);
1481 struct nlattr *tca[TCA_MAX + 1];
1482 struct net_device *dev;
1484 struct Qdisc *q = NULL;
1485 struct Qdisc *p = NULL;
1488 err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1489 rtm_tca_policy, extack);
1493 dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1497 clid = tcm->tcm_parent;
1499 if (clid != TC_H_ROOT) {
1500 if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1501 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1503 NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
1506 q = qdisc_leaf(p, clid);
1507 } else if (dev_ingress_queue(dev)) {
1508 q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping);
1511 q = rtnl_dereference(dev->qdisc);
1514 NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
1518 if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
1519 NL_SET_ERR_MSG(extack, "Invalid handle");
1523 q = qdisc_lookup(dev, tcm->tcm_handle);
1525 NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
1530 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1531 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1535 if (n->nlmsg_type == RTM_DELQDISC) {
1537 NL_SET_ERR_MSG(extack, "Classid cannot be zero");
1540 if (q->handle == 0) {
1541 NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
1544 err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
1548 qdisc_notify(net, skb, n, clid, NULL, q, NULL);
1554 * Create/change qdisc.
1557 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1558 struct netlink_ext_ack *extack)
1560 struct net *net = sock_net(skb->sk);
1562 struct nlattr *tca[TCA_MAX + 1];
1563 struct net_device *dev;
1565 struct Qdisc *q, *p;
1569 /* Reinit, just in case something touches this. */
1570 err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1571 rtm_tca_policy, extack);
1575 tcm = nlmsg_data(n);
1576 clid = tcm->tcm_parent;
1579 dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1585 if (clid != TC_H_ROOT) {
1586 if (clid != TC_H_INGRESS) {
1587 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1589 NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
1592 q = qdisc_leaf(p, clid);
1593 } else if (dev_ingress_queue_create(dev)) {
1594 q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping);
1597 q = rtnl_dereference(dev->qdisc);
1600 /* It may be the default qdisc; ignore it */
1601 if (q && q->handle == 0)
1604 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1605 if (tcm->tcm_handle) {
1606 if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
1607 NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
1610 if (TC_H_MIN(tcm->tcm_handle)) {
1611 NL_SET_ERR_MSG(extack, "Invalid minor handle");
1614 q = qdisc_lookup(dev, tcm->tcm_handle);
1616 goto create_n_graft;
1617 if (n->nlmsg_flags & NLM_F_EXCL) {
1618 NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
1621 if (tca[TCA_KIND] &&
1622 nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1623 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1626 if (q->flags & TCQ_F_INGRESS) {
1627 NL_SET_ERR_MSG(extack,
1628 "Cannot regraft ingress or clsact Qdiscs");
1632 (p && check_loop(q, p, 0))) {
1633 NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
1636 if (clid == TC_H_INGRESS) {
1637 NL_SET_ERR_MSG(extack, "Ingress cannot graft directly");
1640 qdisc_refcount_inc(q);
1644 goto create_n_graft;
1646 /* This magic test requires explanation.
1648 * We know that some child q is already
1649 * attached to this parent and have a choice:
1650 * either to change it or to create/graft a new one.
1652 * 1. We are allowed to create/graft only
1653 * if CREATE and REPLACE flags are set.
1655 * 2. If EXCL is set, the requestor wanted to say
1656 * that the qdisc tcm_handle is not expected
1657 * to exist, so we choose create/graft too.
1659 * 3. The last case is when no flags are set.
1660 * Alas, it is a sort of hole in the API; we
1661 * cannot decide what to do unambiguously.
1662 * For now we select create/graft if the
1663 * user gave a KIND which does not match the existing one.
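 *
 * As a rough, assumed guide to how iproute2 exercises these cases (not
 * something this code relies on): "tc qdisc change" sets none of these
 * flags and always falls through to modify the existing qdisc;
 * "tc qdisc replace" sets NLM_F_CREATE | NLM_F_REPLACE, so it re-creates
 * only when NLM_F_EXCL is also set or the requested KIND differs, and
 * otherwise changes in place; "tc qdisc add" sets NLM_F_CREATE | NLM_F_EXCL
 * without NLM_F_REPLACE and is rejected further down because the qdisc
 * already exists.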
1665 if ((n->nlmsg_flags & NLM_F_CREATE) &&
1666 (n->nlmsg_flags & NLM_F_REPLACE) &&
1667 ((n->nlmsg_flags & NLM_F_EXCL) ||
1669 nla_strcmp(tca[TCA_KIND], q->ops->id))))
1670 goto create_n_graft;
1674 if (!tcm->tcm_handle) {
1675 NL_SET_ERR_MSG(extack, "Handle cannot be zero");
1678 q = qdisc_lookup(dev, tcm->tcm_handle);
1681 /* Change qdisc parameters */
1683 NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1686 if (n->nlmsg_flags & NLM_F_EXCL) {
1687 NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
1690 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1691 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1694 err = qdisc_change(q, tca, extack);
1696 qdisc_notify(net, skb, n, clid, NULL, q, extack);
1700 if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1701 NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
1704 if (clid == TC_H_INGRESS) {
1705 if (dev_ingress_queue(dev)) {
1706 q = qdisc_create(dev, dev_ingress_queue(dev),
1707 tcm->tcm_parent, tcm->tcm_parent,
1710 NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
1714 struct netdev_queue *dev_queue;
1716 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1717 dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1719 dev_queue = p->dev_queue;
1721 dev_queue = netdev_get_tx_queue(dev, 0);
1723 q = qdisc_create(dev, dev_queue,
1724 tcm->tcm_parent, tcm->tcm_handle,
1734 err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
1744 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1745 struct netlink_callback *cb,
1746 int *q_idx_p, int s_q_idx, bool recur,
1747 bool dump_invisible)
1749 int ret = 0, q_idx = *q_idx_p;
1757 if (q_idx < s_q_idx) {
1760 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1761 tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1762 cb->nlh->nlmsg_seq, NLM_F_MULTI,
1763 RTM_NEWQDISC, NULL) <= 0)
1768 /* If dumping singletons, there is no qdisc_dev(root) and the singleton
1769 * itself has already been dumped.
1771 * If we've already dumped the top-level (ingress) qdisc above and the global
1772 * qdisc hashtable, we don't want to hit it again
1774 if (!qdisc_dev(root) || !recur)
1777 hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1778 if (q_idx < s_q_idx) {
1782 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1783 tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1784 cb->nlh->nlmsg_seq, NLM_F_MULTI,
1785 RTM_NEWQDISC, NULL) <= 0)
1798 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1800 struct net *net = sock_net(skb->sk);
1803 struct net_device *dev;
1804 const struct nlmsghdr *nlh = cb->nlh;
1805 struct nlattr *tca[TCA_MAX + 1];
1808 s_idx = cb->args[0];
1809 s_q_idx = q_idx = cb->args[1];
1814 err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
1815 rtm_tca_policy, cb->extack);
1819 for_each_netdev(net, dev) {
1820 struct netdev_queue *dev_queue;
1828 if (tc_dump_qdisc_root(rtnl_dereference(dev->qdisc),
1829 skb, cb, &q_idx, s_q_idx,
1830 true, tca[TCA_DUMP_INVISIBLE]) < 0)
1833 dev_queue = dev_ingress_queue(dev);
1835 tc_dump_qdisc_root(rtnl_dereference(dev_queue->qdisc_sleeping),
1836 skb, cb, &q_idx, s_q_idx, false,
1837 tca[TCA_DUMP_INVISIBLE]) < 0)
1846 cb->args[1] = q_idx;
1853 /************************************************
1854 * Traffic classes manipulation. *
1855 ************************************************/
1857 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1858 unsigned long cl, u32 portid, u32 seq, u16 flags,
1859 int event, struct netlink_ext_ack *extack)
1862 struct nlmsghdr *nlh;
1863 unsigned char *b = skb_tail_pointer(skb);
1865 const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1868 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1870 goto out_nlmsg_trim;
1871 tcm = nlmsg_data(nlh);
1872 tcm->tcm_family = AF_UNSPEC;
1875 tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1876 tcm->tcm_parent = q->handle;
1877 tcm->tcm_handle = q->handle;
1879 if (nla_put_string(skb, TCA_KIND, q->ops->id))
1880 goto nla_put_failure;
1881 if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1882 goto nla_put_failure;
1884 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1885 NULL, &d, TCA_PAD) < 0)
1886 goto nla_put_failure;
1888 if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1889 goto nla_put_failure;
1891 if (gnet_stats_finish_copy(&d) < 0)
1892 goto nla_put_failure;
1894 if (extack && extack->_msg &&
1895 nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg))
1896 goto out_nlmsg_trim;
1898 nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1908 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1909 struct nlmsghdr *n, struct Qdisc *q,
1910 unsigned long cl, int event, struct netlink_ext_ack *extack)
1912 struct sk_buff *skb;
1913 u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1915 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1919 if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event, extack) < 0) {
1924 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1925 n->nlmsg_flags & NLM_F_ECHO);
1928 static int tclass_del_notify(struct net *net,
1929 const struct Qdisc_class_ops *cops,
1930 struct sk_buff *oskb, struct nlmsghdr *n,
1931 struct Qdisc *q, unsigned long cl,
1932 struct netlink_ext_ack *extack)
1934 u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1935 struct sk_buff *skb;
1941 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1945 if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
1946 RTM_DELTCLASS, extack) < 0) {
1951 err = cops->delete(q, cl, extack);
1957 err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1958 n->nlmsg_flags & NLM_F_ECHO);
1962 #ifdef CONFIG_NET_CLS
1964 struct tcf_bind_args {
1965 struct tcf_walker w;
1971 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
1973 struct tcf_bind_args *a = (void *)arg;
1975 if (n && tp->ops->bind_class) {
1976 struct Qdisc *q = tcf_block_q(tp->chain->block);
1979 tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
1985 struct tc_bind_class_args {
1986 struct qdisc_walker w;
1987 unsigned long new_cl;
1992 static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl,
1993 struct qdisc_walker *w)
1995 struct tc_bind_class_args *a = (struct tc_bind_class_args *)w;
1996 const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1997 struct tcf_block *block;
1998 struct tcf_chain *chain;
2000 block = cops->tcf_block(q, cl, NULL);
2003 for (chain = tcf_get_next_chain(block, NULL);
2005 chain = tcf_get_next_chain(block, chain)) {
2006 struct tcf_proto *tp;
2008 for (tp = tcf_get_next_proto(chain, NULL);
2009 tp; tp = tcf_get_next_proto(chain, tp)) {
2010 struct tcf_bind_args arg = {};
2012 arg.w.fn = tcf_node_bind;
2013 arg.classid = a->clid;
2016 tp->ops->walk(tp, &arg.w, true);
2023 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
2024 unsigned long new_cl)
2026 const struct Qdisc_class_ops *cops = q->ops->cl_ops;
2027 struct tc_bind_class_args args = {};
2029 if (!cops->tcf_block)
2031 args.portid = portid;
2033 args.new_cl = new_cl;
2034 args.w.fn = tc_bind_class_walker;
2035 q->ops->cl_ops->walk(q, &args.w);
2040 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
2041 unsigned long new_cl)
2047 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
2048 struct netlink_ext_ack *extack)
2050 struct net *net = sock_net(skb->sk);
2051 struct tcmsg *tcm = nlmsg_data(n);
2052 struct nlattr *tca[TCA_MAX + 1];
2053 struct net_device *dev;
2054 struct Qdisc *q = NULL;
2055 const struct Qdisc_class_ops *cops;
2056 unsigned long cl = 0;
2057 unsigned long new_cl;
2063 err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
2064 rtm_tca_policy, extack);
2068 dev = __dev_get_by_index(net, tcm->tcm_ifindex);
2073 parent == TC_H_UNSPEC - unspecified parent.
2074 parent == TC_H_ROOT - class is root, which has no parent.
2075 parent == X:0 - parent is root class.
2076 parent == X:Y - parent is a node in hierarchy.
2077 parent == 0:Y - parent is X:Y, where X:0 is qdisc.
2079 handle == 0:0 - generate handle from kernel pool.
2080 handle == 0:Y - class is X:Y, where X:0 is qdisc.
2081 handle == X:Y - clear.
2082 handle == X:0 - root class.
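
Worked example (illustrative): a request such as "tc class add dev eth0
parent 1:1 classid 1:10 ..." arrives with tcm_parent = 0x00010001 and
tcm_handle = 0x00010010, so portid = 1:1, clid = 1:10 and
qid = TC_H_MAJ(clid) = 1:0. Both majors are known and identical, the qdisc
1: is looked up via qid, and cops->find(q, 1:10) either returns an existing
class to be changed or 0, in which case a new class 1:10 is created (given
RTM_NEWTCLASS and NLM_F_CREATE).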
2085 /* Step 1. Determine qdisc handle X:0 */
2087 portid = tcm->tcm_parent;
2088 clid = tcm->tcm_handle;
2089 qid = TC_H_MAJ(clid);
2091 if (portid != TC_H_ROOT) {
2092 u32 qid1 = TC_H_MAJ(portid);
2095 /* If both majors are known, they must be identical. */
2100 } else if (qid == 0)
2101 qid = rtnl_dereference(dev->qdisc)->handle;
2103 /* Now qid is a genuine qdisc handle consistent
2104 * with both parent and child.
2106 * TC_H_MAJ(portid) still may be unspecified, complete it now.
2109 portid = TC_H_MAKE(qid, portid);
2112 qid = rtnl_dereference(dev->qdisc)->handle;
2115 /* OK. Locate qdisc */
2116 q = qdisc_lookup(dev, qid);
2120 /* And check that it supports classes */
2121 cops = q->ops->cl_ops;
2125 /* Now try to get class */
2127 if (portid == TC_H_ROOT)
2130 clid = TC_H_MAKE(qid, clid);
2133 cl = cops->find(q, clid);
2137 if (n->nlmsg_type != RTM_NEWTCLASS ||
2138 !(n->nlmsg_flags & NLM_F_CREATE))
2141 switch (n->nlmsg_type) {
2144 if (n->nlmsg_flags & NLM_F_EXCL)
2148 err = tclass_del_notify(net, cops, skb, n, q, cl, extack);
2149 /* Unbind the class from filters by binding them to 0 */
2150 tc_bind_tclass(q, portid, clid, 0);
2153 err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS, extack);
2161 if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
2162 NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
2169 err = cops->change(q, clid, portid, tca, &new_cl, extack);
2171 tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS, extack);
2172 /* We just created a new class, so do the reverse binding. */
2174 tc_bind_tclass(q, portid, clid, new_cl);
2180 struct qdisc_dump_args {
2181 struct qdisc_walker w;
2182 struct sk_buff *skb;
2183 struct netlink_callback *cb;
2186 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
2187 struct qdisc_walker *arg)
2189 struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
2191 return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2192 a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
2193 RTM_NEWTCLASS, NULL);
2196 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
2197 struct tcmsg *tcm, struct netlink_callback *cb,
2200 struct qdisc_dump_args arg;
2202 if (tc_qdisc_dump_ignore(q, false) ||
2203 *t_p < s_t || !q->ops->cl_ops ||
2205 TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
2210 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
2211 arg.w.fn = qdisc_class_dump;
2215 arg.w.skip = cb->args[1];
2217 q->ops->cl_ops->walk(q, &arg.w);
2218 cb->args[1] = arg.w.count;
2225 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
2226 struct tcmsg *tcm, struct netlink_callback *cb,
2227 int *t_p, int s_t, bool recur)
2235 if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
2238 if (!qdisc_dev(root) || !recur)
2241 if (tcm->tcm_parent) {
2242 q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
2243 if (q && q != root &&
2244 tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2248 hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
2249 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2256 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
2258 struct tcmsg *tcm = nlmsg_data(cb->nlh);
2259 struct net *net = sock_net(skb->sk);
2260 struct netdev_queue *dev_queue;
2261 struct net_device *dev;
2264 if (nlmsg_len(cb->nlh) < sizeof(*tcm))
2266 dev = dev_get_by_index(net, tcm->tcm_ifindex);
2273 if (tc_dump_tclass_root(rtnl_dereference(dev->qdisc),
2274 skb, tcm, cb, &t, s_t, true) < 0)
2277 dev_queue = dev_ingress_queue(dev);
2279 tc_dump_tclass_root(rtnl_dereference(dev_queue->qdisc_sleeping),
2280 skb, tcm, cb, &t, s_t, false) < 0)
2290 #ifdef CONFIG_PROC_FS
2291 static int psched_show(struct seq_file *seq, void *v)
2293 seq_printf(seq, "%08x %08x %08x %08x\n",
2294 (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2296 (u32)NSEC_PER_SEC / hrtimer_resolution);
2301 static int __net_init psched_net_init(struct net *net)
2303 struct proc_dir_entry *e;
2305 e = proc_create_single("psched", 0, net->proc_net, psched_show);
2312 static void __net_exit psched_net_exit(struct net *net)
2314 remove_proc_entry("psched", net->proc_net);
2317 static int __net_init psched_net_init(struct net *net)
2322 static void __net_exit psched_net_exit(struct net *net)
2327 static struct pernet_operations psched_net_ops = {
2328 .init = psched_net_init,
2329 .exit = psched_net_exit,
2332 static int __init pktsched_init(void)
2336 err = register_pernet_subsys(&psched_net_ops);
2338 pr_err("pktsched_init: "
2339 "cannot initialize per netns operations\n");
2343 register_qdisc(&pfifo_fast_ops);
2344 register_qdisc(&pfifo_qdisc_ops);
2345 register_qdisc(&bfifo_qdisc_ops);
2346 register_qdisc(&pfifo_head_drop_qdisc_ops);
2347 register_qdisc(&mq_qdisc_ops);
2348 register_qdisc(&noqueue_qdisc_ops);
2350 rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
2351 rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
2352 rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
2354 rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
2355 rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
2356 rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
2362 subsys_initcall(pktsched_init);