1 // SPDX-License-Identifier: GPL-2.0-only
3 * net/sched/sch_ets.c Enhanced Transmission Selection scheduler
8 * The Enhanced Transmission Selection scheduler is a classful queuing
9 * discipline that merges functionality of PRIO and DRR qdiscs in one scheduler.
10 * ETS makes it easy to configure a set of strict and bandwidth-sharing bands to
11 * implement the transmission selection described in 802.1Qaz.
13 * Although ETS is technically classful, it's not possible to add and remove
14 * classes at will. Instead one specifies the number of classes, how many are
15 * PRIO-like and how many DRR-like, and quanta for the latter.
20 * The strict classes, if any, are tried for traffic first: first band 0, if it
21 * has no traffic then band 1, etc.
23 * When there is no traffic in any of the strict queues, the bandwidth-sharing
24 * ones are tried next. Each band is assigned a deficit counter, initialized to
25 * "quantum" of that band. ETS maintains a list of active bandwidth-sharing
26 * bands whose qdiscs are non-empty. A packet is dequeued from the band at the
27 * head of the list if the packet size is smaller than or equal to the deficit
28 * counter. If the counter is too small, it is increased by "quantum" and the
29 * scheduler moves on to the next band in the active list.
32 #include <linux/module.h>
33 #include <net/gen_stats.h>
34 #include <net/netlink.h>
35 #include <net/pkt_cls.h>
36 #include <net/pkt_sched.h>
37 #include <net/sch_generic.h>
// Members of struct ets_class, the per-band state stored in
// ets_sched.classes[]. NOTE(review): the struct header and some members
// (qdisc, quantum and deficit are used elsewhere in this file) are elided
// from this listing.
40 struct list_head alist; /* In struct ets_sched.active. */
// Per-class statistics; ets_qdisc_change() re-initialises both when a band
// is removed.
44 struct gnet_stats_basic_sync bstats;
45 struct gnet_stats_queue qstats;
// Members of struct ets_sched, the qdisc-private state returned by
// qdisc_priv(sch). NOTE(review): the struct header and the nbands/nstrict
// members referenced throughout the file are elided from this listing.
// List of classes linked through ets_class.alist; consumed by
// ets_qdisc_dequeue().
49 struct list_head active;
// Filter chain and block obtained from tcf_block_get() in ets_qdisc_init().
50 struct tcf_proto __rcu *filter_list;
51 struct tcf_block *block;
// Maps (priority & TC_PRIO_MAX) to a band index; see ets_classify().
54 u8 prio2band[TC_PRIO_MAX + 1];
// One entry per possible band, indexed by band number.
55 struct ets_class classes[TCQ_ETS_MAX_BANDS];
// Netlink policy for the top-level ETS options; ets_qdisc_change() hands it
// to nla_parse_nested(). Band counts are u8, quanta and priomap are nests.
58 static const struct nla_policy ets_policy[TCA_ETS_MAX + 1] = {
59 [TCA_ETS_NBANDS] = { .type = NLA_U8 },
60 [TCA_ETS_NSTRICT] = { .type = NLA_U8 },
61 [TCA_ETS_QUANTA] = { .type = NLA_NESTED },
62 [TCA_ETS_PRIOMAP] = { .type = NLA_NESTED },
// Policy for the entries inside the TCA_ETS_PRIOMAP nest (one u8 band each);
// used for validation in ets_qdisc_priomap_parse().
65 static const struct nla_policy ets_priomap_policy[TCA_ETS_MAX + 1] = {
66 [TCA_ETS_PRIOMAP_BAND] = { .type = NLA_U8 },
// Policy for the entries inside the TCA_ETS_QUANTA nest (one u32 quantum
// each); used for validation in ets_qdisc_quanta_parse().
69 static const struct nla_policy ets_quanta_policy[TCA_ETS_MAX + 1] = {
70 [TCA_ETS_QUANTA_BAND] = { .type = NLA_U32 },
// Policy for per-class options; ets_class_change() passes it to
// nla_parse_nested() and reads a single u32 TCA_ETS_QUANTA_BAND from it.
73 static const struct nla_policy ets_class_policy[TCA_ETS_MAX + 1] = {
74 [TCA_ETS_QUANTA_BAND] = { .type = NLA_U32 },
// Read the u32 quantum carried by @attr into *@quantum.
// The visible code sets the zero-quantum message on @extack.
// NOTE(review): the condition guarding that message and the return
// statements are elided from this listing - presumably a zero quantum is
// rejected with an error; confirm against the full source.
77 static int ets_quantum_parse(struct Qdisc *sch, const struct nlattr *attr,
78 unsigned int *quantum,
79 struct netlink_ext_ack *extack)
81 *quantum = nla_get_u32(attr);
83 NL_SET_ERR_MSG(extack, "ETS quantum cannot be zero");
// Translate a class argument into a pointer into q->classes[].
// @arg is 1-based: the class for @arg lives at classes[arg - 1], which is
// the inverse of the band + 1 minor built by ets_class_id().
// NOTE(review): no range check is visible here; an @arg of 0 would index
// classes[-1] and an @arg above nbands would reach an unused slot. Confirm
// that every caller passes a value already validated by ets_class_find().
89 static struct ets_class *
90 ets_class_from_arg(struct Qdisc *sch, unsigned long arg)
92 struct ets_sched *q = qdisc_priv(sch);
94 return &q->classes[arg - 1];
// Build the classid of @cl: the major part comes from sch->handle and the
// minor part is the class index within q->classes[] plus one.
97 static u32 ets_class_id(struct Qdisc *sch, const struct ets_class *cl)
99 struct ets_sched *q = qdisc_priv(sch);
100 int band = cl - q->classes;
102 return TC_H_MAKE(sch->handle, band + 1);
// Push the current band configuration to the device (TC_ETS_REPLACE).
// Fills in the band count, the qstats pointer, the priomap and a quantum and
// weight for every band, then hands the request to ndo_setup_tc().
// NOTE(review): the declarations of i, w_psum and weight are elided from
// this listing.
105 static void ets_offload_change(struct Qdisc *sch)
107 struct net_device *dev = qdisc_dev(sch);
108 struct ets_sched *q = qdisc_priv(sch);
109 struct tc_ets_qopt_offload qopt;
110 unsigned int w_psum_prev = 0;
111 unsigned int q_psum = 0;
112 unsigned int q_sum = 0;
113 unsigned int quantum;
// Offload needs both a capable device and an ndo_setup_tc callback (the
// statement executed when this test is true is elided; presumably a return).
118 if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
121 qopt.command = TC_ETS_REPLACE;
122 qopt.handle = sch->handle;
123 qopt.parent = sch->parent;
124 qopt.replace_params.bands = q->nbands;
125 qopt.replace_params.qstats = &sch->qstats;
126 memcpy(&qopt.replace_params.priomap,
127 q->prio2band, sizeof(q->prio2band));
// Total of all quanta; the denominator of the percentages below.
129 for (i = 0; i < q->nbands; i++)
130 q_sum += q->classes[i].quantum;
// Each weight is the difference between consecutive running percentages
// (w_psum - w_psum_prev). NOTE(review): the statement that accumulates
// q_psum is not visible in this listing.
132 for (i = 0; i < q->nbands; i++) {
133 quantum = q->classes[i].quantum;
// A zero quantum yields w_psum = 0 and skips the division entirely.
135 w_psum = quantum ? q_psum * 100 / q_sum : 0;
136 weight = w_psum - w_psum_prev;
137 w_psum_prev = w_psum;
139 qopt.replace_params.quanta[i] = quantum;
140 qopt.replace_params.weights[i] = weight;
143 dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETS, &qopt);
// Tell the device that this qdisc is going away (TC_ETS_DESTROY).
// Only the command, handle and parent fields of the request are filled in.
146 static void ets_offload_destroy(struct Qdisc *sch)
148 struct net_device *dev = qdisc_dev(sch);
149 struct tc_ets_qopt_offload qopt;
// Same capability test as in ets_offload_change(); the statement taken when
// it is true is elided from this listing.
151 if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
154 qopt.command = TC_ETS_DESTROY;
155 qopt.handle = sch->handle;
156 qopt.parent = sch->parent;
157 dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETS, &qopt);
// Describe a child-qdisc replacement to the offload layer (TC_ETS_GRAFT).
// The band is the zero-based form of @arg and the child handle is taken
// from @new; the request goes through qdisc_offload_graft_helper().
// NOTE(review): the tail of the helper call is elided from this listing.
160 static void ets_offload_graft(struct Qdisc *sch, struct Qdisc *new,
161 struct Qdisc *old, unsigned long arg,
162 struct netlink_ext_ack *extack)
164 struct net_device *dev = qdisc_dev(sch);
165 struct tc_ets_qopt_offload qopt;
167 qopt.command = TC_ETS_GRAFT;
168 qopt.handle = sch->handle;
169 qopt.parent = sch->parent;
170 qopt.graft_params.band = arg - 1;
171 qopt.graft_params.child_handle = new->handle;
173 qdisc_offload_graft_helper(dev, sch, new, old, TC_SETUP_QDISC_ETS,
// Build a TC_ETS_STATS request whose stats pointers refer to the bstats and
// qstats of @sch, and return the result of qdisc_offload_dump_helper().
177 static int ets_offload_dump(struct Qdisc *sch)
179 struct tc_ets_qopt_offload qopt;
181 qopt.command = TC_ETS_STATS;
182 qopt.handle = sch->handle;
183 qopt.parent = sch->parent;
184 qopt.stats.bstats = &sch->bstats;
185 qopt.stats.qstats = &sch->qstats;
187 return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_ETS, &qopt);
// Return true when @cl is a strict band, i.e. its index within q->classes[]
// is below q->nstrict.
190 static bool ets_class_is_strict(struct ets_sched *q, const struct ets_class *cl)
192 unsigned int band = cl - q->classes;
194 return band < q->nstrict;
// Qdisc_class_ops.change handler: update the quantum of an existing class.
// Parses TCA_OPTIONS with ets_class_policy, reports an error for strict
// bands, stores the parsed quantum in cl->quantum and then refreshes the
// offload state through ets_offload_change().
// NOTE(review): the conditions in front of the first two error messages,
// the declaration of err and every return statement are elided from this
// listing.
197 static int ets_class_change(struct Qdisc *sch, u32 classid, u32 parentid,
198 struct nlattr **tca, unsigned long *arg,
199 struct netlink_ext_ack *extack)
201 struct ets_class *cl = ets_class_from_arg(sch, *arg);
202 struct ets_sched *q = qdisc_priv(sch);
203 struct nlattr *opt = tca[TCA_OPTIONS];
204 struct nlattr *tb[TCA_ETS_MAX + 1];
205 unsigned int quantum;
208 /* Classes can be added and removed only through Qdisc_ops.change
212 NL_SET_ERR_MSG(extack, "Fine-grained class addition and removal is not supported");
217 NL_SET_ERR_MSG(extack, "ETS options are required for this operation");
221 err = nla_parse_nested(tb, TCA_ETS_MAX, opt, ets_class_policy, extack);
225 if (!tb[TCA_ETS_QUANTA_BAND])
226 /* Nothing to configure. */
229 if (ets_class_is_strict(q, cl)) {
230 NL_SET_ERR_MSG(extack, "Strict bands do not have a configurable quantum");
234 err = ets_quantum_parse(sch, tb[TCA_ETS_QUANTA_BAND], &quantum,
// The quantum is written before sch_tree_unlock(); the matching lock call
// is not visible in this listing.
240 cl->quantum = quantum;
241 sch_tree_unlock(sch);
243 ets_offload_change(sch);
// Qdisc_class_ops.graft handler: install @new as the child qdisc of the
// class selected by @arg, hand the previous child back through @old and
// notify the offload layer.
// The visible code can substitute a default pfifo child created with the
// class id and add it to the qdisc hash. NOTE(review): the conditions that
// select that path are elided - presumably it runs when @new is NULL.
247 static int ets_class_graft(struct Qdisc *sch, unsigned long arg,
248 struct Qdisc *new, struct Qdisc **old,
249 struct netlink_ext_ack *extack)
251 struct ets_class *cl = ets_class_from_arg(sch, arg);
254 new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
255 ets_class_id(sch, cl), NULL);
259 qdisc_hash_add(new, true);
262 *old = qdisc_replace(sch, new, &cl->qdisc);
263 ets_offload_graft(sch, new, *old, arg, extack);
// Qdisc_class_ops.leaf handler: looks up the class selected by @arg.
// NOTE(review): the return statement is elided from this listing -
// presumably the child qdisc of that class is returned.
267 static struct Qdisc *ets_class_leaf(struct Qdisc *sch, unsigned long arg)
269 struct ets_class *cl = ets_class_from_arg(sch, arg);
// Qdisc_class_ops.find handler: derive a class argument from the minor part
// of @classid. band is unsigned, so band - 1 wraps for a minor of 0 and the
// single comparison flags both 0 and values above q->nbands.
// NOTE(review): the return statements are elided from this listing.
274 static unsigned long ets_class_find(struct Qdisc *sch, u32 classid)
276 unsigned long band = TC_H_MIN(classid);
277 struct ets_sched *q = qdisc_priv(sch);
279 if (band - 1 >= q->nbands)
// Qdisc_class_ops.qlen_notify handler: unlink a bandwidth-sharing class
// from the active list. The list_del() runs only when the class is not
// strict and sch->q.qlen of the parent qdisc is non-zero.
// NOTE(review): the guard looks at the parent queue length rather than at
// whether this class is currently linked - confirm that it cannot run twice
// for the same class or for a class that was never added to q->active.
284 static void ets_class_qlen_notify(struct Qdisc *sch, unsigned long arg)
286 struct ets_class *cl = ets_class_from_arg(sch, arg);
287 struct ets_sched *q = qdisc_priv(sch);
289 /* We get notified about zero-length child Qdiscs as well if they are
290 * offloaded. Those aren't on the active list though, so don't attempt
293 if (!ets_class_is_strict(q, cl) && sch->q.qlen)
294 list_del(&cl->alist);
// Qdisc_class_ops.dump handler: describe one class in @tcm and @skb.
// Sets the parent to TC_H_ROOT, the handle to the class id and the info
// field to the handle of the child qdisc, then emits a TCA_OPTIONS nest that
// carries TCA_ETS_QUANTA_BAND for non-strict classes only.
// Attribute failures jump to nla_put_failure, where the nest is cancelled.
// NOTE(review): the declaration of nest, the label line and the failure
// return value are elided from this listing.
297 static int ets_class_dump(struct Qdisc *sch, unsigned long arg,
298 struct sk_buff *skb, struct tcmsg *tcm)
300 struct ets_class *cl = ets_class_from_arg(sch, arg);
301 struct ets_sched *q = qdisc_priv(sch);
304 tcm->tcm_parent = TC_H_ROOT;
305 tcm->tcm_handle = ets_class_id(sch, cl);
306 tcm->tcm_info = cl->qdisc->handle;
308 nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
310 goto nla_put_failure;
311 if (!ets_class_is_strict(q, cl)) {
312 if (nla_put_u32(skb, TCA_ETS_QUANTA_BAND, cl->quantum))
313 goto nla_put_failure;
315 return nla_nest_end(skb, nest);
318 nla_nest_cancel(skb, nest);
// Qdisc_class_ops.dump_stats handler: copy the basic statistics and the
// queue statistics of the child qdisc of the selected class into d.
// NOTE(review): the remaining parameters and the return statements are
// elided from this listing.
322 static int ets_class_dump_stats(struct Qdisc *sch, unsigned long arg,
325 struct ets_class *cl = ets_class_from_arg(sch, arg);
326 struct Qdisc *cl_q = cl->qdisc;
328 if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats, true) < 0 ||
329 qdisc_qstats_copy(d, cl_q) < 0)
// Qdisc_class_ops.walk handler: visit every configured band in ascending
// order, passing the 1-based class argument i + 1 to tc_qdisc_stats_dump().
// NOTE(review): the statement executed when tc_qdisc_stats_dump() returns
// false is elided - presumably the walk stops there.
335 static void ets_qdisc_walk(struct Qdisc *sch, struct qdisc_walker *arg)
337 struct ets_sched *q = qdisc_priv(sch);
343 for (i = 0; i < q->nbands; i++) {
344 if (!tc_qdisc_stats_dump(sch, i + 1, arg))
// Qdisc_class_ops.tcf_block handler.
// NOTE(review): only the error message is visible; the condition on @cl and
// the return statements are elided - presumably a non-zero @cl is refused
// and q->block is returned otherwise.
349 static struct tcf_block *
350 ets_qdisc_tcf_block(struct Qdisc *sch, unsigned long cl,
351 struct netlink_ext_ack *extack)
353 struct ets_sched *q = qdisc_priv(sch);
356 NL_SET_ERR_MSG(extack, "ETS classid must be zero");
// Qdisc_class_ops.bind_tcf handler: resolve classid through
// ets_class_find(); @parent is not used by the visible code.
// NOTE(review): the end of the parameter list is elided from this listing.
363 static unsigned long ets_qdisc_bind_tcf(struct Qdisc *sch, unsigned long parent,
366 return ets_class_find(sch, classid);
// Qdisc_class_ops.unbind_tcf handler. No body statements are visible in
// this listing.
369 static void ets_qdisc_unbind_tcf(struct Qdisc *sch, unsigned long arg)
// Pick the class that @skb is enqueued to.
// When the major part of skb->priority differs from sch->handle, the
// filters in q->filter_list are run through tcf_classify(). With no filter
// list or a negative result, the class comes from prio2band[] indexed by the
// low priority bits. Otherwise the minor number minus one is used as the
// band, falling back to the class for prio2band[0] when it is out of range.
// The error code behind qerr starts as NET_XMIT_SUCCESS | __NET_XMIT_BYPASS.
// NOTE(review): most of the CONFIG_NET_CLS_ACT handling and the statement
// that takes band from the classifier result are elided from this listing.
373 static struct ets_class *ets_classify(struct sk_buff *skb, struct Qdisc *sch,
376 struct ets_sched *q = qdisc_priv(sch);
377 u32 band = skb->priority;
378 struct tcf_result res;
379 struct tcf_proto *fl;
382 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
383 if (TC_H_MAJ(skb->priority) != sch->handle) {
384 fl = rcu_dereference_bh(q->filter_list);
385 err = tcf_classify(skb, NULL, fl, &res, false);
386 #ifdef CONFIG_NET_CLS_ACT
391 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
397 if (!fl || err < 0) {
400 return &q->classes[q->prio2band[band & TC_PRIO_MAX]];
// band is u32, so a minor of 0 wraps here and is caught by the range check.
404 band = TC_H_MIN(band) - 1;
405 if (band >= q->nbands)
406 return &q->classes[q->prio2band[0]];
407 return &q->classes[band];
// Qdisc_ops.enqueue handler: queue @skb on the child qdisc of the class
// chosen by ets_classify().
// A non-strict class whose child was empty before a successful enqueue is
// appended to q->active with its deficit reset to its quantum, and the
// backlog of @sch grows by the packet length sampled on entry.
// NOTE(review): the test for a missing class, the declarations of err and
// first, and all return statements are elided from this listing.
410 static int ets_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
411 struct sk_buff **to_free)
413 unsigned int len = qdisc_pkt_len(skb);
414 struct ets_sched *q = qdisc_priv(sch);
415 struct ets_class *cl;
419 cl = ets_classify(skb, sch, &err);
// Drop path: the drop is counted on @sch only when the classifier result
// still carries __NET_XMIT_BYPASS.
421 if (err & __NET_XMIT_BYPASS)
422 qdisc_qstats_drop(sch);
423 __qdisc_drop(skb, to_free);
// Remember whether the child was empty before this packet is added.
427 first = !cl->qdisc->q.qlen;
428 err = qdisc_enqueue(skb, cl->qdisc, to_free);
429 if (unlikely(err != NET_XMIT_SUCCESS)) {
430 if (net_xmit_drop_count(err)) {
432 qdisc_qstats_drop(sch);
437 if (first && !ets_class_is_strict(q, cl)) {
438 list_add_tail(&cl->alist, &q->active);
439 cl->deficit = cl->quantum;
442 sch->qstats.backlog += len;
// Account for a packet leaving @sch: update the basic statistics and reduce
// the backlog by @skb.
// NOTE(review): the remaining statements, including the return, are elided
// from this listing.
447 static struct sk_buff *
448 ets_qdisc_dequeue_skb(struct Qdisc *sch, struct sk_buff *skb)
450 qdisc_bstats_update(sch, skb);
451 qdisc_qstats_backlog_dec(sch, skb);
// Qdisc_ops.dequeue handler.
// Strict bands are tried first in ascending band order. After that the
// class at the head of q->active is peeked: if the packet length fits in the
// class deficit the packet is dequeued and the class is unlinked once its
// child is empty; otherwise the deficit grows by one quantum and the class
// moves to the tail of the list.
// NOTE(review): the loop around the active-list handling, the statement
// that charges the packet against the deficit, the null checks on skb and
// the return statements are elided from this listing.
456 static struct sk_buff *ets_qdisc_dequeue(struct Qdisc *sch)
458 struct ets_sched *q = qdisc_priv(sch);
459 struct ets_class *cl;
465 for (band = 0; band < q->nstrict; band++) {
466 cl = &q->classes[band];
467 skb = qdisc_dequeue_peeked(cl->qdisc);
469 return ets_qdisc_dequeue_skb(sch, skb);
472 if (list_empty(&q->active))
475 cl = list_first_entry(&q->active, struct ets_class, alist);
476 skb = cl->qdisc->ops->peek(cl->qdisc);
478 qdisc_warn_nonwc(__func__, cl->qdisc);
482 len = qdisc_pkt_len(skb);
483 if (len <= cl->deficit) {
485 skb = qdisc_dequeue_peeked(cl->qdisc);
488 if (cl->qdisc->q.qlen == 0)
489 list_del(&cl->alist);
490 return ets_qdisc_dequeue_skb(sch, skb);
// Not enough deficit: top it up and give the next active class a turn.
493 cl->deficit += cl->quantum;
494 list_move_tail(&cl->alist, &q->active);
// Parse the nested TCA_ETS_PRIOMAP attribute into @priomap.
// After strict validation against ets_priomap_policy, every
// TCA_ETS_PRIOMAP_BAND entry supplies the band for the next priority in
// order. More entries than TC_PRIO_MAX + 1, or a band that is not below
// @nbands, is reported through @extack.
// NOTE(review): the declarations of prio, band, rem and err and all return
// statements are elided from this listing.
500 static int ets_qdisc_priomap_parse(struct nlattr *priomap_attr,
501 unsigned int nbands, u8 *priomap,
502 struct netlink_ext_ack *extack)
504 const struct nlattr *attr;
510 err = __nla_validate_nested(priomap_attr, TCA_ETS_MAX,
511 ets_priomap_policy, NL_VALIDATE_STRICT,
516 nla_for_each_nested(attr, priomap_attr, rem) {
517 switch (nla_type(attr)) {
518 case TCA_ETS_PRIOMAP_BAND:
519 if (prio > TC_PRIO_MAX) {
520 NL_SET_ERR_MSG_MOD(extack, "Too many priorities in ETS priomap");
523 band = nla_get_u8(attr);
524 if (band >= nbands) {
525 NL_SET_ERR_MSG_MOD(extack, "Invalid band number in ETS priomap");
528 priomap[prio++] = band;
531 WARN_ON_ONCE(1); /* Validate should have caught this. */
// Parse the nested TCA_ETS_QUANTA attribute into @quanta.
// After strict validation against ets_quanta_policy, every
// TCA_ETS_QUANTA_BAND entry is parsed by ets_quantum_parse() into the next
// slot of @quanta; an entry for a band that is not below @nbands is
// reported through @extack.
// NOTE(review): the initial value of band is elided - presumably it starts
// at @nstrict so that strict bands receive no quantum; confirm.
539 static int ets_qdisc_quanta_parse(struct Qdisc *sch, struct nlattr *quanta_attr,
540 unsigned int nbands, unsigned int nstrict,
541 unsigned int *quanta,
542 struct netlink_ext_ack *extack)
544 const struct nlattr *attr;
549 err = __nla_validate_nested(quanta_attr, TCA_ETS_MAX,
550 ets_quanta_policy, NL_VALIDATE_STRICT,
555 nla_for_each_nested(attr, quanta_attr, rem) {
556 switch (nla_type(attr)) {
557 case TCA_ETS_QUANTA_BAND:
558 if (band >= nbands) {
559 NL_SET_ERR_MSG_MOD(extack, "ETS quanta has more values than bands");
562 err = ets_quantum_parse(sch, attr, &quanta[band++],
568 WARN_ON_ONCE(1); /* Validate should have caught this. */
// Qdisc_ops.change handler, also called from ets_qdisc_init(): parse the
// ETS options in @opt and apply them to the scheduler.
// Visible steps: parse with ets_policy; require TCA_ETS_NBANDS in the range
// 1..TCQ_ETS_MAX_BANDS and TCA_ETS_NSTRICT no larger than nbands; build the
// priomap and the quanta; create pfifo children for added bands; commit the
// new state; refresh the offload; release the children of removed bands.
// NOTE(review): many lines of this function are elided from the listing,
// including the lock that pairs with sch_tree_unlock(), the statement that
// stores the new band count in q->nbands and every return statement.
576 static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt,
577 struct netlink_ext_ack *extack)
579 unsigned int quanta[TCQ_ETS_MAX_BANDS] = {0};
580 struct Qdisc *queues[TCQ_ETS_MAX_BANDS];
581 struct ets_sched *q = qdisc_priv(sch);
582 struct nlattr *tb[TCA_ETS_MAX + 1];
583 unsigned int oldbands = q->nbands;
584 u8 priomap[TC_PRIO_MAX + 1];
585 unsigned int nstrict = 0;
590 err = nla_parse_nested(tb, TCA_ETS_MAX, opt, ets_policy, extack);
594 if (!tb[TCA_ETS_NBANDS]) {
595 NL_SET_ERR_MSG_MOD(extack, "Number of bands is a required argument");
598 nbands = nla_get_u8(tb[TCA_ETS_NBANDS]);
599 if (nbands < 1 || nbands > TCQ_ETS_MAX_BANDS) {
600 NL_SET_ERR_MSG_MOD(extack, "Invalid number of bands");
603 /* Unless overridden, traffic goes to the last band. */
604 memset(priomap, nbands - 1, sizeof(priomap));
606 if (tb[TCA_ETS_NSTRICT]) {
607 nstrict = nla_get_u8(tb[TCA_ETS_NSTRICT]);
608 if (nstrict > nbands) {
609 NL_SET_ERR_MSG_MOD(extack, "Invalid number of strict bands");
614 if (tb[TCA_ETS_PRIOMAP]) {
615 err = ets_qdisc_priomap_parse(tb[TCA_ETS_PRIOMAP],
616 nbands, priomap, extack);
621 if (tb[TCA_ETS_QUANTA]) {
622 err = ets_qdisc_quanta_parse(sch, tb[TCA_ETS_QUANTA],
623 nbands, nstrict, quanta, extack);
627 /* If there are more bands than strict + quanta provided, the remaining
628 * ones are ETS with quantum of MTU. Initialize the missing values here.
630 for (i = nstrict; i < nbands; i++) {
632 quanta[i] = psched_mtu(qdisc_dev(sch));
635 /* Before commit, make sure we can allocate all new qdiscs */
636 for (i = oldbands; i < nbands; i++) {
637 queues[i] = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
638 ets_class_id(sch, &q->classes[i]),
// Unwind for a failed allocation: children created so far are released.
642 qdisc_put(queues[--i]);
// Bands in [nstrict, q->nstrict) stop being strict: the backlogged ones join
// the active list with a deficit taken from the new quanta.
650 for (i = nstrict; i < q->nstrict; i++) {
651 if (q->classes[i].qdisc->q.qlen) {
652 list_add_tail(&q->classes[i].alist, &q->active);
653 q->classes[i].deficit = quanta[i];
// Bands in [q->nbands, oldbands) are being removed (this presumes q->nbands
// already holds the new count): unlink the backlogged non-strict ones and
// flush the backlog of each child.
656 for (i = q->nbands; i < oldbands; i++) {
657 if (i >= q->nstrict && q->classes[i].qdisc->q.qlen)
658 list_del(&q->classes[i].alist);
659 qdisc_tree_flush_backlog(q->classes[i].qdisc);
661 q->nstrict = nstrict;
662 memcpy(q->prio2band, priomap, sizeof(priomap));
664 for (i = 0; i < q->nbands; i++)
665 q->classes[i].quantum = quanta[i];
// Install the children that were allocated up front for the added bands.
667 for (i = oldbands; i < q->nbands; i++) {
668 q->classes[i].qdisc = queues[i];
669 if (q->classes[i].qdisc != &noop_qdisc)
670 qdisc_hash_add(q->classes[i].qdisc, true);
673 sch_tree_unlock(sch);
675 ets_offload_change(sch);
// After the unlock, drop the children of removed bands and clear the
// per-class state so the slots are clean if the bands are added again.
676 for (i = q->nbands; i < oldbands; i++) {
677 qdisc_put(q->classes[i].qdisc);
678 q->classes[i].qdisc = NULL;
679 q->classes[i].quantum = 0;
680 q->classes[i].deficit = 0;
681 gnet_stats_basic_sync_init(&q->classes[i].bstats);
682 memset(&q->classes[i].qstats, 0, sizeof(q->classes[i].qstats));
// Qdisc_ops.init handler: acquire the filter block, initialise the active
// list head and the alist node of every possible band, then apply @opt
// through ets_qdisc_change().
// NOTE(review): the checks between the visible statements (for example on
// the result of tcf_block_get()) are elided from this listing.
687 static int ets_qdisc_init(struct Qdisc *sch, struct nlattr *opt,
688 struct netlink_ext_ack *extack)
690 struct ets_sched *q = qdisc_priv(sch);
696 err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
700 INIT_LIST_HEAD(&q->active);
701 for (i = 0; i < TCQ_ETS_MAX_BANDS; i++)
702 INIT_LIST_HEAD(&q->classes[i].alist);
704 return ets_qdisc_change(sch, opt, extack);
// Qdisc_ops.reset handler: unlink every backlogged bandwidth-sharing band
// from the active list, then reset the child qdisc of every band.
707 static void ets_qdisc_reset(struct Qdisc *sch)
709 struct ets_sched *q = qdisc_priv(sch);
// The qlen test runs before the children are reset, while it still shows
// which bands were queued on the active list.
712 for (band = q->nstrict; band < q->nbands; band++) {
713 if (q->classes[band].qdisc->q.qlen)
714 list_del(&q->classes[band].alist);
716 for (band = 0; band < q->nbands; band++)
717 qdisc_reset(q->classes[band].qdisc);
// Qdisc_ops.destroy handler: notify the offload layer, release the filter
// block and drop the reference on the child qdisc of every configured band.
720 static void ets_qdisc_destroy(struct Qdisc *sch)
722 struct ets_sched *q = qdisc_priv(sch);
725 ets_offload_destroy(sch);
726 tcf_block_put(q->block);
727 for (band = 0; band < q->nbands; band++)
728 qdisc_put(q->classes[band].qdisc);
// Qdisc_ops.dump handler: serialise the qdisc configuration into @skb.
// After ets_offload_dump(), a TCA_OPTIONS nest is filled with the band
// count, the strict-band count, a TCA_ETS_QUANTA nest holding one u32 per
// non-strict band (only when such bands exist) and a TCA_ETS_PRIOMAP nest
// holding one u8 per priority. The TCA_OPTIONS nest is cancelled on the
// failure path.
// NOTE(review): the first half of the condition around the strict-band
// attribute, the failure jumps, the label and the declarations are elided
// from this listing.
731 static int ets_qdisc_dump(struct Qdisc *sch, struct sk_buff *skb)
733 struct ets_sched *q = qdisc_priv(sch);
740 err = ets_offload_dump(sch);
744 opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
748 if (nla_put_u8(skb, TCA_ETS_NBANDS, q->nbands))
752 nla_put_u8(skb, TCA_ETS_NSTRICT, q->nstrict))
755 if (q->nbands > q->nstrict) {
756 nest = nla_nest_start(skb, TCA_ETS_QUANTA);
760 for (band = q->nstrict; band < q->nbands; band++) {
761 if (nla_put_u32(skb, TCA_ETS_QUANTA_BAND,
762 q->classes[band].quantum))
766 nla_nest_end(skb, nest);
769 nest = nla_nest_start(skb, TCA_ETS_PRIOMAP);
773 for (prio = 0; prio <= TC_PRIO_MAX; prio++) {
774 if (nla_put_u8(skb, TCA_ETS_PRIOMAP_BAND, q->prio2band[prio]))
778 nla_nest_end(skb, nest);
780 return nla_nest_end(skb, opts);
783 nla_nest_cancel(skb, opts);
// Class operations table: wires the ets_class_* and ets_qdisc_* class
// handlers defined in this file into Qdisc_class_ops. It is referenced by
// the cl_ops member of ets_qdisc_ops.
787 static const struct Qdisc_class_ops ets_class_ops = {
788 .change = ets_class_change,
789 .graft = ets_class_graft,
790 .leaf = ets_class_leaf,
791 .find = ets_class_find,
792 .qlen_notify = ets_class_qlen_notify,
793 .dump = ets_class_dump,
794 .dump_stats = ets_class_dump_stats,
795 .walk = ets_qdisc_walk,
796 .tcf_block = ets_qdisc_tcf_block,
797 .bind_tcf = ets_qdisc_bind_tcf,
798 .unbind_tcf = ets_qdisc_unbind_tcf,
// Qdisc operations table registered by ets_init(). The private area is
// sized for struct ets_sched, and peek uses the generic
// qdisc_peek_dequeued helper rather than a handler from this file.
// NOTE(review): one member line is elided from this listing.
801 static struct Qdisc_ops ets_qdisc_ops __read_mostly = {
802 .cl_ops = &ets_class_ops,
804 .priv_size = sizeof(struct ets_sched),
805 .enqueue = ets_qdisc_enqueue,
806 .dequeue = ets_qdisc_dequeue,
807 .peek = qdisc_peek_dequeued,
808 .change = ets_qdisc_change,
809 .init = ets_qdisc_init,
810 .reset = ets_qdisc_reset,
811 .destroy = ets_qdisc_destroy,
812 .dump = ets_qdisc_dump,
813 .owner = THIS_MODULE,
// Module entry point: register ets_qdisc_ops and return the result of
// register_qdisc().
816 static int __init ets_init(void)
818 return register_qdisc(&ets_qdisc_ops);
// Module exit point: unregister ets_qdisc_ops.
821 static void __exit ets_exit(void)
823 unregister_qdisc(&ets_qdisc_ops);
// Hook ets_init() and ets_exit() up as the module load and unload handlers
// and declare the module licence.
826 module_init(ets_init);
827 module_exit(ets_exit);
828 MODULE_LICENSE("GPL");