Merge tag 'block-6.6-2023-10-20' of git://git.kernel.dk/linux
[platform/kernel/linux-rpi.git] / net / sched / sch_api.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_api.c  Packet scheduler API.
4  *
5  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
6  *
7  * Fixes:
8  *
9  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
10  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
11  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
12  */
13
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/string.h>
18 #include <linux/errno.h>
19 #include <linux/skbuff.h>
20 #include <linux/init.h>
21 #include <linux/proc_fs.h>
22 #include <linux/seq_file.h>
23 #include <linux/kmod.h>
24 #include <linux/list.h>
25 #include <linux/hrtimer.h>
26 #include <linux/slab.h>
27 #include <linux/hashtable.h>
28
29 #include <net/net_namespace.h>
30 #include <net/sock.h>
31 #include <net/netlink.h>
32 #include <net/pkt_sched.h>
33 #include <net/pkt_cls.h>
34 #include <net/tc_wrapper.h>
35
36 #include <trace/events/qdisc.h>
37
38 /*
39
40    Short review.
41    -------------
42
43    This file consists of two interrelated parts:
44
45    1. queueing disciplines manager frontend.
46    2. traffic classes manager frontend.
47
48    Generally, queueing discipline ("qdisc") is a black box,
49    which is able to enqueue packets and to dequeue them (when
50    device is ready to send something) in order and at times
51    determined by algorithm hidden in it.
52
53    qdisc's are divided to two categories:
54    - "queues", which have no internal structure visible from outside.
55    - "schedulers", which split all the packets to "traffic classes",
56      using "packet classifiers" (look at cls_api.c)
57
58    In turn, classes may have child qdiscs (as rule, queues)
59    attached to them etc. etc. etc.
60
61    The goal of the routines in this file is to translate
62    information supplied by the user in the form of handles
63    into a form more intelligible to the kernel, to perform sanity
64    checks and the part of the work that is common to all qdiscs,
65    and to provide rtnetlink notifications.
66
67    All real intelligent work is done inside qdisc modules.
68
69
70
71    Every discipline has two major routines: enqueue and dequeue.
72
73    ---dequeue
74
75    dequeue usually returns an skb to send. It is allowed to return NULL,
76    but that does not mean that the queue is empty; it just means that
77    the discipline does not want to send anything this time.
78    The queue is really empty only if q->q.qlen == 0.
79    For complicated disciplines with multiple queues, q->q is not
80    a real packet queue, but q->q.qlen must nevertheless be valid.
81
82    ---enqueue
83
84    enqueue returns 0 if the packet was enqueued successfully.
85    If a packet (this one or another one) was dropped, it returns
86    a non-zero error code.
87    NET_XMIT_DROP        - this packet dropped
88      Expected action: do not backoff, but wait until queue will clear.
89    NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
90      Expected action: backoff or ignore
91
92    Auxiliary routines:
93
94    ---peek
95
96    like dequeue but without removing a packet from the queue
97
98    ---reset
99
100    returns qdisc to initial state: purge all buffers, clear all
101    timers, counters (except for statistics) etc.
102
103    ---init
104
105    initializes newly created qdisc.
106
107    ---destroy
108
109    destroys resources allocated by init and during lifetime of qdisc.
110
111    ---change
112
113    changes qdisc parameters.
114  */
115
116 /* Protects list of registered TC modules. It is pure SMP lock. */
117 static DEFINE_RWLOCK(qdisc_mod_lock);
118
119
120 /************************************************
121  *      Queueing disciplines manipulation.      *
122  ************************************************/
123
124
125 /* The list of all installed queueing disciplines. */
126
127 static struct Qdisc_ops *qdisc_base;
128
129 /* Register/unregister queueing discipline */
130
131 int register_qdisc(struct Qdisc_ops *qops)
132 {
133         struct Qdisc_ops *q, **qp;
134         int rc = -EEXIST;
135
136         write_lock(&qdisc_mod_lock);
137         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
138                 if (!strcmp(qops->id, q->id))
139                         goto out;
140
141         if (qops->enqueue == NULL)
142                 qops->enqueue = noop_qdisc_ops.enqueue;
143         if (qops->peek == NULL) {
144                 if (qops->dequeue == NULL)
145                         qops->peek = noop_qdisc_ops.peek;
146                 else
147                         goto out_einval;
148         }
149         if (qops->dequeue == NULL)
150                 qops->dequeue = noop_qdisc_ops.dequeue;
151
152         if (qops->cl_ops) {
153                 const struct Qdisc_class_ops *cops = qops->cl_ops;
154
155                 if (!(cops->find && cops->walk && cops->leaf))
156                         goto out_einval;
157
158                 if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
159                         goto out_einval;
160         }
161
162         qops->next = NULL;
163         *qp = qops;
164         rc = 0;
165 out:
166         write_unlock(&qdisc_mod_lock);
167         return rc;
168
169 out_einval:
170         rc = -EINVAL;
171         goto out;
172 }
173 EXPORT_SYMBOL(register_qdisc);
174
175 void unregister_qdisc(struct Qdisc_ops *qops)
176 {
177         struct Qdisc_ops *q, **qp;
178         int err = -ENOENT;
179
180         write_lock(&qdisc_mod_lock);
181         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
182                 if (q == qops)
183                         break;
184         if (q) {
185                 *qp = q->next;
186                 q->next = NULL;
187                 err = 0;
188         }
189         write_unlock(&qdisc_mod_lock);
190
191         WARN(err, "unregister qdisc(%s) failed\n", qops->id);
192 }
193 EXPORT_SYMBOL(unregister_qdisc);
194
195 /* Get default qdisc if not otherwise specified */
196 void qdisc_get_default(char *name, size_t len)
197 {
198         read_lock(&qdisc_mod_lock);
199         strscpy(name, default_qdisc_ops->id, len);
200         read_unlock(&qdisc_mod_lock);
201 }
202
203 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
204 {
205         struct Qdisc_ops *q = NULL;
206
207         for (q = qdisc_base; q; q = q->next) {
208                 if (!strcmp(name, q->id)) {
209                         if (!try_module_get(q->owner))
210                                 q = NULL;
211                         break;
212                 }
213         }
214
215         return q;
216 }
217
218 /* Set new default qdisc to use */
219 int qdisc_set_default(const char *name)
220 {
221         const struct Qdisc_ops *ops;
222
223         if (!capable(CAP_NET_ADMIN))
224                 return -EPERM;
225
226         write_lock(&qdisc_mod_lock);
227         ops = qdisc_lookup_default(name);
228         if (!ops) {
229                 /* Not found, drop lock and try to load module */
230                 write_unlock(&qdisc_mod_lock);
231                 request_module("sch_%s", name);
232                 write_lock(&qdisc_mod_lock);
233
234                 ops = qdisc_lookup_default(name);
235         }
236
237         if (ops) {
238                 /* Set new default */
239                 module_put(default_qdisc_ops->owner);
240                 default_qdisc_ops = ops;
241         }
242         write_unlock(&qdisc_mod_lock);
243
244         return ops ? 0 : -ENOENT;
245 }
246
#ifdef CONFIG_NET_SCH_DEFAULT
/* Late initcall: install the qdisc named by CONFIG_DEFAULT_NET_SCH as the
 * default and propagate the result of qdisc_set_default().
 */
static int __init sch_default_qdisc(void)
{
        int err = qdisc_set_default(CONFIG_DEFAULT_NET_SCH);

        return err;
}
late_initcall(sch_default_qdisc);
#endif
255
256 /* We know handle. Find qdisc among all qdisc's attached to device
257  * (root qdisc, all its children, children of children etc.)
258  * Note: caller either uses rtnl or rcu_read_lock()
259  */
260
261 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
262 {
263         struct Qdisc *q;
264
265         if (!qdisc_dev(root))
266                 return (root->handle == handle ? root : NULL);
267
268         if (!(root->flags & TCQ_F_BUILTIN) &&
269             root->handle == handle)
270                 return root;
271
272         hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle,
273                                    lockdep_rtnl_is_held()) {
274                 if (q->handle == handle)
275                         return q;
276         }
277         return NULL;
278 }
279
280 void qdisc_hash_add(struct Qdisc *q, bool invisible)
281 {
282         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
283                 ASSERT_RTNL();
284                 hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
285                 if (invisible)
286                         q->flags |= TCQ_F_INVISIBLE;
287         }
288 }
289 EXPORT_SYMBOL(qdisc_hash_add);
290
291 void qdisc_hash_del(struct Qdisc *q)
292 {
293         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
294                 ASSERT_RTNL();
295                 hash_del_rcu(&q->hash);
296         }
297 }
298 EXPORT_SYMBOL(qdisc_hash_del);
299
300 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
301 {
302         struct Qdisc *q;
303
304         if (!handle)
305                 return NULL;
306         q = qdisc_match_from_root(rtnl_dereference(dev->qdisc), handle);
307         if (q)
308                 goto out;
309
310         if (dev_ingress_queue(dev))
311                 q = qdisc_match_from_root(
312                         rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping),
313                         handle);
314 out:
315         return q;
316 }
317
318 struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
319 {
320         struct netdev_queue *nq;
321         struct Qdisc *q;
322
323         if (!handle)
324                 return NULL;
325         q = qdisc_match_from_root(rcu_dereference(dev->qdisc), handle);
326         if (q)
327                 goto out;
328
329         nq = dev_ingress_queue_rcu(dev);
330         if (nq)
331                 q = qdisc_match_from_root(rcu_dereference(nq->qdisc_sleeping),
332                                           handle);
333 out:
334         return q;
335 }
336
337 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
338 {
339         unsigned long cl;
340         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
341
342         if (cops == NULL)
343                 return NULL;
344         cl = cops->find(p, classid);
345
346         if (cl == 0)
347                 return NULL;
348         return cops->leaf(p, cl);
349 }
350
351 /* Find queueing discipline by name */
352
353 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
354 {
355         struct Qdisc_ops *q = NULL;
356
357         if (kind) {
358                 read_lock(&qdisc_mod_lock);
359                 for (q = qdisc_base; q; q = q->next) {
360                         if (nla_strcmp(kind, q->id) == 0) {
361                                 if (!try_module_get(q->owner))
362                                         q = NULL;
363                                 break;
364                         }
365                 }
366                 read_unlock(&qdisc_mod_lock);
367         }
368         return q;
369 }
370
371 /* The linklayer setting were not transferred from iproute2, in older
372  * versions, and the rate tables lookup systems have been dropped in
373  * the kernel. To keep backward compatible with older iproute2 tc
374  * utils, we detect the linklayer setting by detecting if the rate
375  * table were modified.
376  *
377  * For linklayer ATM table entries, the rate table will be aligned to
378  * 48 bytes, thus some table entries will contain the same value.  The
379  * mpu (min packet unit) is also encoded into the old rate table, thus
380  * starting from the mpu, we find low and high table entries for
381  * mapping this cell.  If these entries contain the same value, when
382  * the rate tables have been modified for linklayer ATM.
383  *
384  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
385  * and then roundup to the next cell, calc the table entry one below,
386  * and compare.
387  */
388 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
389 {
390         int low       = roundup(r->mpu, 48);
391         int high      = roundup(low+1, 48);
392         int cell_low  = low >> r->cell_log;
393         int cell_high = (high >> r->cell_log) - 1;
394
395         /* rtab is too inaccurate at rates > 100Mbit/s */
396         if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
397                 pr_debug("TC linklayer: Giving up ATM detection\n");
398                 return TC_LINKLAYER_ETHERNET;
399         }
400
401         if ((cell_high > cell_low) && (cell_high < 256)
402             && (rtab[cell_low] == rtab[cell_high])) {
403                 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
404                          cell_low, cell_high, rtab[cell_high]);
405                 return TC_LINKLAYER_ATM;
406         }
407         return TC_LINKLAYER_ETHERNET;
408 }
409
410 static struct qdisc_rate_table *qdisc_rtab_list;
411
412 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
413                                         struct nlattr *tab,
414                                         struct netlink_ext_ack *extack)
415 {
416         struct qdisc_rate_table *rtab;
417
418         if (tab == NULL || r->rate == 0 ||
419             r->cell_log == 0 || r->cell_log >= 32 ||
420             nla_len(tab) != TC_RTAB_SIZE) {
421                 NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
422                 return NULL;
423         }
424
425         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
426                 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
427                     !memcmp(&rtab->data, nla_data(tab), 1024)) {
428                         rtab->refcnt++;
429                         return rtab;
430                 }
431         }
432
433         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
434         if (rtab) {
435                 rtab->rate = *r;
436                 rtab->refcnt = 1;
437                 memcpy(rtab->data, nla_data(tab), 1024);
438                 if (r->linklayer == TC_LINKLAYER_UNAWARE)
439                         r->linklayer = __detect_linklayer(r, rtab->data);
440                 rtab->next = qdisc_rtab_list;
441                 qdisc_rtab_list = rtab;
442         } else {
443                 NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
444         }
445         return rtab;
446 }
447 EXPORT_SYMBOL(qdisc_get_rtab);
448
449 void qdisc_put_rtab(struct qdisc_rate_table *tab)
450 {
451         struct qdisc_rate_table *rtab, **rtabp;
452
453         if (!tab || --tab->refcnt)
454                 return;
455
456         for (rtabp = &qdisc_rtab_list;
457              (rtab = *rtabp) != NULL;
458              rtabp = &rtab->next) {
459                 if (rtab == tab) {
460                         *rtabp = rtab->next;
461                         kfree(rtab);
462                         return;
463                 }
464         }
465 }
466 EXPORT_SYMBOL(qdisc_put_rtab);
467
468 static LIST_HEAD(qdisc_stab_list);
469
470 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
471         [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
472         [TCA_STAB_DATA] = { .type = NLA_BINARY },
473 };
474
475 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
476                                                struct netlink_ext_ack *extack)
477 {
478         struct nlattr *tb[TCA_STAB_MAX + 1];
479         struct qdisc_size_table *stab;
480         struct tc_sizespec *s;
481         unsigned int tsize = 0;
482         u16 *tab = NULL;
483         int err;
484
485         err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
486                                           extack);
487         if (err < 0)
488                 return ERR_PTR(err);
489         if (!tb[TCA_STAB_BASE]) {
490                 NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
491                 return ERR_PTR(-EINVAL);
492         }
493
494         s = nla_data(tb[TCA_STAB_BASE]);
495
496         if (s->tsize > 0) {
497                 if (!tb[TCA_STAB_DATA]) {
498                         NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
499                         return ERR_PTR(-EINVAL);
500                 }
501                 tab = nla_data(tb[TCA_STAB_DATA]);
502                 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
503         }
504
505         if (tsize != s->tsize || (!tab && tsize > 0)) {
506                 NL_SET_ERR_MSG(extack, "Invalid size of size table");
507                 return ERR_PTR(-EINVAL);
508         }
509
510         list_for_each_entry(stab, &qdisc_stab_list, list) {
511                 if (memcmp(&stab->szopts, s, sizeof(*s)))
512                         continue;
513                 if (tsize > 0 &&
514                     memcmp(stab->data, tab, flex_array_size(stab, data, tsize)))
515                         continue;
516                 stab->refcnt++;
517                 return stab;
518         }
519
520         if (s->size_log > STAB_SIZE_LOG_MAX ||
521             s->cell_log > STAB_SIZE_LOG_MAX) {
522                 NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table");
523                 return ERR_PTR(-EINVAL);
524         }
525
526         stab = kmalloc(struct_size(stab, data, tsize), GFP_KERNEL);
527         if (!stab)
528                 return ERR_PTR(-ENOMEM);
529
530         stab->refcnt = 1;
531         stab->szopts = *s;
532         if (tsize > 0)
533                 memcpy(stab->data, tab, flex_array_size(stab, data, tsize));
534
535         list_add_tail(&stab->list, &qdisc_stab_list);
536
537         return stab;
538 }
539
540 void qdisc_put_stab(struct qdisc_size_table *tab)
541 {
542         if (!tab)
543                 return;
544
545         if (--tab->refcnt == 0) {
546                 list_del(&tab->list);
547                 kfree_rcu(tab, rcu);
548         }
549 }
550 EXPORT_SYMBOL(qdisc_put_stab);
551
552 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
553 {
554         struct nlattr *nest;
555
556         nest = nla_nest_start_noflag(skb, TCA_STAB);
557         if (nest == NULL)
558                 goto nla_put_failure;
559         if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
560                 goto nla_put_failure;
561         nla_nest_end(skb, nest);
562
563         return skb->len;
564
565 nla_put_failure:
566         return -1;
567 }
568
569 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
570                                const struct qdisc_size_table *stab)
571 {
572         int pkt_len, slot;
573
574         pkt_len = skb->len + stab->szopts.overhead;
575         if (unlikely(!stab->szopts.tsize))
576                 goto out;
577
578         slot = pkt_len + stab->szopts.cell_align;
579         if (unlikely(slot < 0))
580                 slot = 0;
581
582         slot >>= stab->szopts.cell_log;
583         if (likely(slot < stab->szopts.tsize))
584                 pkt_len = stab->data[slot];
585         else
586                 pkt_len = stab->data[stab->szopts.tsize - 1] *
587                                 (slot / stab->szopts.tsize) +
588                                 stab->data[slot % stab->szopts.tsize];
589
590         pkt_len <<= stab->szopts.size_log;
591 out:
592         if (unlikely(pkt_len < 1))
593                 pkt_len = 1;
594         qdisc_skb_cb(skb)->pkt_len = pkt_len;
595 }
596 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
597
598 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
599 {
600         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
601                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
602                         txt, qdisc->ops->id, qdisc->handle >> 16);
603                 qdisc->flags |= TCQ_F_WARN_NONWC;
604         }
605 }
606 EXPORT_SYMBOL(qdisc_warn_nonwc);
607
608 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
609 {
610         struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
611                                                  timer);
612
613         rcu_read_lock();
614         __netif_schedule(qdisc_root(wd->qdisc));
615         rcu_read_unlock();
616
617         return HRTIMER_NORESTART;
618 }
619
620 void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
621                                  clockid_t clockid)
622 {
623         hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
624         wd->timer.function = qdisc_watchdog;
625         wd->qdisc = qdisc;
626 }
627 EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
628
629 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
630 {
631         qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
632 }
633 EXPORT_SYMBOL(qdisc_watchdog_init);
634
635 void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires,
636                                       u64 delta_ns)
637 {
638         bool deactivated;
639
640         rcu_read_lock();
641         deactivated = test_bit(__QDISC_STATE_DEACTIVATED,
642                                &qdisc_root_sleeping(wd->qdisc)->state);
643         rcu_read_unlock();
644         if (deactivated)
645                 return;
646
647         if (hrtimer_is_queued(&wd->timer)) {
648                 u64 softexpires;
649
650                 softexpires = ktime_to_ns(hrtimer_get_softexpires(&wd->timer));
651                 /* If timer is already set in [expires, expires + delta_ns],
652                  * do not reprogram it.
653                  */
654                 if (softexpires - expires <= delta_ns)
655                         return;
656         }
657
658         hrtimer_start_range_ns(&wd->timer,
659                                ns_to_ktime(expires),
660                                delta_ns,
661                                HRTIMER_MODE_ABS_PINNED);
662 }
663 EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns);
664
665 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
666 {
667         hrtimer_cancel(&wd->timer);
668 }
669 EXPORT_SYMBOL(qdisc_watchdog_cancel);
670
671 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
672 {
673         struct hlist_head *h;
674         unsigned int i;
675
676         h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
677
678         if (h != NULL) {
679                 for (i = 0; i < n; i++)
680                         INIT_HLIST_HEAD(&h[i]);
681         }
682         return h;
683 }
684
685 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
686 {
687         struct Qdisc_class_common *cl;
688         struct hlist_node *next;
689         struct hlist_head *nhash, *ohash;
690         unsigned int nsize, nmask, osize;
691         unsigned int i, h;
692
693         /* Rehash when load factor exceeds 0.75 */
694         if (clhash->hashelems * 4 <= clhash->hashsize * 3)
695                 return;
696         nsize = clhash->hashsize * 2;
697         nmask = nsize - 1;
698         nhash = qdisc_class_hash_alloc(nsize);
699         if (nhash == NULL)
700                 return;
701
702         ohash = clhash->hash;
703         osize = clhash->hashsize;
704
705         sch_tree_lock(sch);
706         for (i = 0; i < osize; i++) {
707                 hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
708                         h = qdisc_class_hash(cl->classid, nmask);
709                         hlist_add_head(&cl->hnode, &nhash[h]);
710                 }
711         }
712         clhash->hash     = nhash;
713         clhash->hashsize = nsize;
714         clhash->hashmask = nmask;
715         sch_tree_unlock(sch);
716
717         kvfree(ohash);
718 }
719 EXPORT_SYMBOL(qdisc_class_hash_grow);
720
721 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
722 {
723         unsigned int size = 4;
724
725         clhash->hash = qdisc_class_hash_alloc(size);
726         if (!clhash->hash)
727                 return -ENOMEM;
728         clhash->hashsize  = size;
729         clhash->hashmask  = size - 1;
730         clhash->hashelems = 0;
731         return 0;
732 }
733 EXPORT_SYMBOL(qdisc_class_hash_init);
734
735 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
736 {
737         kvfree(clhash->hash);
738 }
739 EXPORT_SYMBOL(qdisc_class_hash_destroy);
740
741 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
742                              struct Qdisc_class_common *cl)
743 {
744         unsigned int h;
745
746         INIT_HLIST_NODE(&cl->hnode);
747         h = qdisc_class_hash(cl->classid, clhash->hashmask);
748         hlist_add_head(&cl->hnode, &clhash->hash[h]);
749         clhash->hashelems++;
750 }
751 EXPORT_SYMBOL(qdisc_class_hash_insert);
752
753 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
754                              struct Qdisc_class_common *cl)
755 {
756         hlist_del(&cl->hnode);
757         clhash->hashelems--;
758 }
759 EXPORT_SYMBOL(qdisc_class_hash_remove);
760
761 /* Allocate an unique handle from space managed by kernel
762  * Possible range is [8000-FFFF]:0000 (0x8000 values)
763  */
764 static u32 qdisc_alloc_handle(struct net_device *dev)
765 {
766         int i = 0x8000;
767         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
768
769         do {
770                 autohandle += TC_H_MAKE(0x10000U, 0);
771                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
772                         autohandle = TC_H_MAKE(0x80000000U, 0);
773                 if (!qdisc_lookup(dev, autohandle))
774                         return autohandle;
775                 cond_resched();
776         } while (--i > 0);
777
778         return 0;
779 }
780
781 void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
782 {
783         bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
784         const struct Qdisc_class_ops *cops;
785         unsigned long cl;
786         u32 parentid;
787         bool notify;
788         int drops;
789
790         if (n == 0 && len == 0)
791                 return;
792         drops = max_t(int, n, 0);
793         rcu_read_lock();
794         while ((parentid = sch->parent)) {
795                 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
796                         break;
797
798                 if (sch->flags & TCQ_F_NOPARENT)
799                         break;
800                 /* Notify parent qdisc only if child qdisc becomes empty.
801                  *
802                  * If child was empty even before update then backlog
803                  * counter is screwed and we skip notification because
804                  * parent class is already passive.
805                  *
806                  * If the original child was offloaded then it is allowed
807                  * to be seem as empty, so the parent is notified anyway.
808                  */
809                 notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
810                                                        !qdisc_is_offloaded);
811                 /* TODO: perform the search on a per txq basis */
812                 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
813                 if (sch == NULL) {
814                         WARN_ON_ONCE(parentid != TC_H_ROOT);
815                         break;
816                 }
817                 cops = sch->ops->cl_ops;
818                 if (notify && cops->qlen_notify) {
819                         cl = cops->find(sch, parentid);
820                         cops->qlen_notify(sch, cl);
821                 }
822                 sch->q.qlen -= n;
823                 sch->qstats.backlog -= len;
824                 __qdisc_qstats_drop(sch, drops);
825         }
826         rcu_read_unlock();
827 }
828 EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
829
830 int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
831                               void *type_data)
832 {
833         struct net_device *dev = qdisc_dev(sch);
834         int err;
835
836         sch->flags &= ~TCQ_F_OFFLOADED;
837         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
838                 return 0;
839
840         err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
841         if (err == -EOPNOTSUPP)
842                 return 0;
843
844         if (!err)
845                 sch->flags |= TCQ_F_OFFLOADED;
846
847         return err;
848 }
849 EXPORT_SYMBOL(qdisc_offload_dump_helper);
850
851 void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
852                                 struct Qdisc *new, struct Qdisc *old,
853                                 enum tc_setup_type type, void *type_data,
854                                 struct netlink_ext_ack *extack)
855 {
856         bool any_qdisc_is_offloaded;
857         int err;
858
859         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
860                 return;
861
862         err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
863
864         /* Don't report error if the graft is part of destroy operation. */
865         if (!err || !new || new == &noop_qdisc)
866                 return;
867
868         /* Don't report error if the parent, the old child and the new
869          * one are not offloaded.
870          */
871         any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
872         any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
873         any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;
874
875         if (any_qdisc_is_offloaded)
876                 NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
877 }
878 EXPORT_SYMBOL(qdisc_offload_graft_helper);
879
880 void qdisc_offload_query_caps(struct net_device *dev,
881                               enum tc_setup_type type,
882                               void *caps, size_t caps_len)
883 {
884         const struct net_device_ops *ops = dev->netdev_ops;
885         struct tc_query_caps_base base = {
886                 .type = type,
887                 .caps = caps,
888         };
889
890         memset(caps, 0, caps_len);
891
892         if (ops->ndo_setup_tc)
893                 ops->ndo_setup_tc(dev, TC_QUERY_CAPS, &base);
894 }
895 EXPORT_SYMBOL(qdisc_offload_query_caps);
896
897 static void qdisc_offload_graft_root(struct net_device *dev,
898                                      struct Qdisc *new, struct Qdisc *old,
899                                      struct netlink_ext_ack *extack)
900 {
901         struct tc_root_qopt_offload graft_offload = {
902                 .command        = TC_ROOT_GRAFT,
903                 .handle         = new ? new->handle : 0,
904                 .ingress        = (new && new->flags & TCQ_F_INGRESS) ||
905                                   (old && old->flags & TCQ_F_INGRESS),
906         };
907
908         qdisc_offload_graft_helper(dev, NULL, new, old,
909                                    TC_SETUP_ROOT_QDISC, &graft_offload, extack);
910 }
911
/*
 * tc_fill_qdisc - append one netlink message describing qdisc @q to @skb.
 * @skb:    message buffer being built
 * @q:      qdisc to describe
 * @clid:   value stored in tcm_parent
 * @portid: netlink port id placed in the message header
 * @seq:    netlink sequence number placed in the message header
 * @flags:  netlink message flags
 * @event:  netlink message type handed to nlmsg_put()
 * @extack: optional; when it carries a message, that text is appended
 *          as TCA_EXT_WARN_MSG
 *
 * The message consists of a struct tcmsg followed by TCA_KIND, the
 * optional ingress/egress block indexes, the qdisc's own ->dump()
 * output, TCA_HW_OFFLOAD, the size table (if any) and the statistics.
 *
 * Returns skb->len on success.  On any failure @skb is trimmed back to
 * the tail it had on entry and -1 is returned.
 */
912 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
913                          u32 portid, u32 seq, u16 flags, int event,
914                          struct netlink_ext_ack *extack)
915 {
916         struct gnet_stats_basic_sync __percpu *cpu_bstats = NULL;
917         struct gnet_stats_queue __percpu *cpu_qstats = NULL;
918         struct tcmsg *tcm;
919         struct nlmsghdr  *nlh;
            /* Tail of @skb on entry: used to trim a partial message and
             * to compute the final message length.
             */
920         unsigned char *b = skb_tail_pointer(skb);
921         struct gnet_dump d;
922         struct qdisc_size_table *stab;
923         u32 block_index;
924         __u32 qlen;
925
926         cond_resched();
927         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
928         if (!nlh)
929                 goto out_nlmsg_trim;
930         tcm = nlmsg_data(nlh);
931         tcm->tcm_family = AF_UNSPEC;
932         tcm->tcm__pad1 = 0;
933         tcm->tcm__pad2 = 0;
934         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
935         tcm->tcm_parent = clid;
936         tcm->tcm_handle = q->handle;
            /* tcm_info reports the qdisc's current reference count. */
937         tcm->tcm_info = refcount_read(&q->refcnt);
938         if (nla_put_string(skb, TCA_KIND, q->ops->id))
939                 goto nla_put_failure;
            /* Block indexes are optional: only emitted when the qdisc
             * has the getter and it returns a non-zero index.
             */
940         if (q->ops->ingress_block_get) {
941                 block_index = q->ops->ingress_block_get(q);
942                 if (block_index &&
943                     nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
944                         goto nla_put_failure;
945         }
946         if (q->ops->egress_block_get) {
947                 block_index = q->ops->egress_block_get(q);
948                 if (block_index &&
949                     nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
950                         goto nla_put_failure;
951         }
952         if (q->ops->dump && q->ops->dump(q, skb) < 0)
953                 goto nla_put_failure;
954         if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
955                 goto nla_put_failure;
            /* Queue length is sampled here and reported with the queue
             * statistics further down.
             */
956         qlen = qdisc_qlen_sum(q);
957
            /* NOTE(review): rtnl_dereference() suggests callers hold
             * RTNL - confirm against the call sites.
             */
958         stab = rtnl_dereference(q->stab);
959         if (stab && qdisc_dump_stab(skb, stab) < 0)
960                 goto nla_put_failure;
961
962         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
963                                          NULL, &d, TCA_PAD) < 0)
964                 goto nla_put_failure;
965
966         if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
967                 goto nla_put_failure;
968
            /* Per-CPU counters are only passed on when the qdisc uses
             * them; otherwise the pointers stay NULL.
             */
969         if (qdisc_is_percpu_stats(q)) {
970                 cpu_bstats = q->cpu_bstats;
971                 cpu_qstats = q->cpu_qstats;
972         }
973
974         if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats, true) < 0 ||
975             gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
976             gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
977                 goto nla_put_failure;
978
979         if (gnet_stats_finish_copy(&d) < 0)
980                 goto nla_put_failure;
981
982         if (extack && extack->_msg &&
983             nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg))
984                 goto out_nlmsg_trim;
985
            /* Fix up the header length now that all attributes are in. */
986         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
987
988         return skb->len;
989
            /* Both labels share the same unwind: drop everything this
             * call added to @skb.
             */
990 out_nlmsg_trim:
991 nla_put_failure:
992         nlmsg_trim(skb, b);
993         return -1;
994 }
995
996 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
997 {
998         if (q->flags & TCQ_F_BUILTIN)
999                 return true;
1000         if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
1001                 return true;
1002
1003         return false;
1004 }
1005
1006 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
1007                         struct nlmsghdr *n, u32 clid,
1008                         struct Qdisc *old, struct Qdisc *new,
1009                         struct netlink_ext_ack *extack)
1010 {
1011         struct sk_buff *skb;
1012         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1013
1014         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1015         if (!skb)
1016                 return -ENOBUFS;
1017
1018         if (old && !tc_qdisc_dump_ignore(old, false)) {
1019                 if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
1020                                   0, RTM_DELQDISC, extack) < 0)
1021                         goto err_out;
1022         }
1023         if (new && !tc_qdisc_dump_ignore(new, false)) {
1024                 if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
1025                                   old ? NLM_F_REPLACE : 0, RTM_NEWQDISC, extack) < 0)
1026                         goto err_out;
1027         }
1028
1029         if (skb->len)
1030                 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1031                                       n->nlmsg_flags & NLM_F_ECHO);
1032
1033 err_out:
1034         kfree_skb(skb);
1035         return -EINVAL;
1036 }
1037
1038 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
1039                                struct nlmsghdr *n, u32 clid,
1040                                struct Qdisc *old, struct Qdisc *new,
1041                                struct netlink_ext_ack *extack)
1042 {
1043         if (new || old)
1044                 qdisc_notify(net, skb, n, clid, old, new, extack);
1045
1046         if (old)
1047                 qdisc_put(old);
1048 }
1049
1050 static void qdisc_clear_nolock(struct Qdisc *sch)
1051 {
1052         sch->flags &= ~TCQ_F_NOLOCK;
1053         if (!(sch->flags & TCQ_F_CPUSTATS))
1054                 return;
1055
1056         free_percpu(sch->cpu_bstats);
1057         free_percpu(sch->cpu_qstats);
1058         sch->cpu_bstats = NULL;
1059         sch->cpu_qstats = NULL;
1060         sch->flags &= ~TCQ_F_CPUSTATS;
1061 }
1062
1063 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
1064  * to device "dev".
1065  *
1066  * When appropriate send a netlink notification using 'skb'
1067  * and "n".
1068  *
1069  * On success, destroy old qdisc.
 *
 * With a NULL "parent" the graft happens at device level: either on
 * the ingress queue (when "old" or "new" carries TCQ_F_INGRESS) or on
 * the TX queues / dev->qdisc.  Otherwise the parent's class ops are
 * asked to graft "new" onto class "classid".
 *
 * Returns 0 on success, or:
 *   -ENOENT      the device has no ingress queue / the class is unknown
 *   -EBUSY       the current ingress qdisc has other references
 *   -EOPNOTSUPP  the parent has no class ops or no ->graft()
 *   -EINVAL      "new" is a noqueue qdisc being attached to a class
 *   or the error returned by the parent's ->graft().
1070  */
1071
1072 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
1073                        struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
1074                        struct Qdisc *new, struct Qdisc *old,
1075                        struct netlink_ext_ack *extack)
1076 {
1077         struct Qdisc *q = old;
1078         struct net *net = dev_net(dev);
1079
             /* No parent qdisc: graft directly onto the device. */
1080         if (parent == NULL) {
1081                 unsigned int i, num_q, ingress;
1082                 struct netdev_queue *dev_queue;
1083
1084                 ingress = 0;
1085                 num_q = dev->num_tx_queues;
                     /* Either side being an ingress-flagged qdisc
                      * selects the device's ingress queue.
                      */
1086                 if ((q && q->flags & TCQ_F_INGRESS) ||
1087                     (new && new->flags & TCQ_F_INGRESS)) {
1088                         ingress = 1;
1089                         dev_queue = dev_ingress_queue(dev);
1090                         if (!dev_queue) {
1091                                 NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
1092                                 return -ENOENT;
1093                         }
1094
                             /* From here on q is whatever currently sits
                              * on the ingress queue, not the caller's @old.
                              */
1095                         q = rtnl_dereference(dev_queue->qdisc_sleeping);
1096
1097                         /* This is the counterpart of that qdisc_refcount_inc_nz() call in
1098                          * __tcf_qdisc_find() for filter requests.
1099                          */
1100                         if (!qdisc_refcount_dec_if_one(q)) {
1101                                 NL_SET_ERR_MSG(extack,
1102                                                "Current ingress or clsact Qdisc has ongoing filter requests");
1103                                 return -EBUSY;
1104                         }
1105                 }
1106
                     /* A device that is up is deactivated for the swap
                      * and activated again at the end of this branch.
                      */
1107                 if (dev->flags & IFF_UP)
1108                         dev_deactivate(dev);
1109
1110                 qdisc_offload_graft_root(dev, new, old, extack);
1111
                     /* A non-ingress qdisc that provides ->attach() is not
                      * grafted per TX queue here; ->attach() is invoked
                      * after dev->qdisc has been updated below.
                      */
1112                 if (new && new->ops->attach && !ingress)
1113                         goto skip;
1114
1115                 if (!ingress) {
                             /* Put @new on every TX queue, taking an extra
                              * reference for each queue after the first and
                              * releasing the qdisc that was there before.
                              */
1116                         for (i = 0; i < num_q; i++) {
1117                                 dev_queue = netdev_get_tx_queue(dev, i);
1118                                 old = dev_graft_qdisc(dev_queue, new);
1119
1120                                 if (new && i > 0)
1121                                         qdisc_refcount_inc(new);
1122                                 qdisc_put(old);
1123                         }
1124                 } else {
                             /* First detach: the queue is left without a
                              * qdisc until @old has been destroyed.
                              */
1125                         old = dev_graft_qdisc(dev_queue, NULL);
1126
1127                         /* {ingress,clsact}_destroy() @old before grafting @new to avoid
1128                          * unprotected concurrent accesses to net_device::miniq_{in,e}gress
1129                          * pointer(s) in mini_qdisc_pair_swap().
1130                          */
1131                         qdisc_notify(net, skb, n, classid, old, new, extack);
1132                         qdisc_destroy(old);
1133
1134                         dev_graft_qdisc(dev_queue, new);
1135                 }
1136
1137 skip:
1138                 if (!ingress) {
                             /* Replace the device's root pointer; a NULL
                              * @new is published as &noop_qdisc.  The extra
                              * reference is only taken for qdiscs without
                              * ->attach() (presumably on behalf of
                              * dev->qdisc - TODO confirm).
                              */
1139                         old = rtnl_dereference(dev->qdisc);
1140                         if (new && !new->ops->attach)
1141                                 qdisc_refcount_inc(new);
1142                         rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc);
1143
1144                         notify_and_destroy(net, skb, n, classid, old, new, extack);
1145
1146                         if (new && new->ops->attach)
1147                                 new->ops->attach(new);
1148                 }
1149
1150                 if (dev->flags & IFF_UP)
1151                         dev_activate(dev);
1152         } else {
                     /* Graft onto a class of @parent via its class ops. */
1153                 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
1154                 unsigned long cl;
1155                 int err;
1156
                     /* Note: this runs before the validation below, so
                      * @new may be modified even if an error is returned.
                      */
1157                 /* Only support running class lockless if parent is lockless */
1158                 if (new && (new->flags & TCQ_F_NOLOCK) && !(parent->flags & TCQ_F_NOLOCK))
1159                         qdisc_clear_nolock(new);
1160
1161                 if (!cops || !cops->graft)
1162                         return -EOPNOTSUPP;
1163
1164                 cl = cops->find(parent, classid);
1165                 if (!cl) {
1166                         NL_SET_ERR_MSG(extack, "Specified class not found");
1167                         return -ENOENT;
1168                 }
1169
1170                 if (new && new->ops == &noqueue_qdisc_ops) {
1171                         NL_SET_ERR_MSG(extack, "Cannot assign noqueue to a class");
1172                         return -EINVAL;
1173                 }
1174
                     /* ->graft() hands back the previous child in @old. */
1175                 err = cops->graft(parent, cl, new, &old, extack);
1176                 if (err)
1177                         return err;
1178                 notify_and_destroy(net, skb, n, classid, old, new, extack);
1179         }
1180         return 0;
1181 }
1182
1183 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1184                                    struct netlink_ext_ack *extack)
1185 {
1186         u32 block_index;
1187
1188         if (tca[TCA_INGRESS_BLOCK]) {
1189                 block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1190
1191                 if (!block_index) {
1192                         NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1193                         return -EINVAL;
1194                 }
1195                 if (!sch->ops->ingress_block_set) {
1196                         NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1197                         return -EOPNOTSUPP;
1198                 }
1199                 sch->ops->ingress_block_set(sch, block_index);
1200         }
1201         if (tca[TCA_EGRESS_BLOCK]) {
1202                 block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1203
1204                 if (!block_index) {
1205                         NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1206                         return -EINVAL;
1207                 }
1208                 if (!sch->ops->egress_block_set) {
1209                         NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1210                         return -EOPNOTSUPP;
1211                 }
1212                 sch->ops->egress_block_set(sch, block_index);
1213         }
1214         return 0;
1215 }
1216
1217 /*
 * Allocate and initialize a new qdisc for @dev_queue of @dev.
 *
 * The qdisc kind is taken from tca[TCA_KIND]; the remaining
 * configuration (TCA_INGRESS_BLOCK / TCA_EGRESS_BLOCK, TCA_STAB,
 * TCA_OPTIONS, TCA_RATE) is read from @tca as well.  @parent is stored
 * in the new qdisc; a @handle of 0 requests automatic allocation.
 *
 * Returns the new qdisc on success.  On failure returns NULL and
 * stores a negative errno in *@errp; *@errp is not written on
 * success.  -EAGAIN means a module was loaded with RTNL dropped and
 * the caller has to replay the request.
 */
1222
1223 static struct Qdisc *qdisc_create(struct net_device *dev,
1224                                   struct netdev_queue *dev_queue,
1225                                   u32 parent, u32 handle,
1226                                   struct nlattr **tca, int *errp,
1227                                   struct netlink_ext_ack *extack)
1228 {
1229         int err;
1230         struct nlattr *kind = tca[TCA_KIND];
1231         struct Qdisc *sch;
1232         struct Qdisc_ops *ops;
1233         struct qdisc_size_table *stab;
1234
1235         ops = qdisc_lookup_ops(kind);
1236 #ifdef CONFIG_MODULES
1237         if (ops == NULL && kind != NULL) {
1238                 char name[IFNAMSIZ];
1239                 if (nla_strscpy(name, kind, IFNAMSIZ) >= 0) {
1240                         /* We dropped the RTNL semaphore in order to
1241                          * perform the module load.  So, even if we
1242                          * succeeded in loading the module we have to
1243                          * tell the caller to replay the request.  We
1244                          * indicate this using -EAGAIN.
1245                          * We replay the request because the device may
1246                          * go away in the mean time.
1247                          */
1248                         rtnl_unlock();
1249                         request_module("sch_%s", name);
1250                         rtnl_lock();
1251                         ops = qdisc_lookup_ops(kind);
1252                         if (ops != NULL) {
1253                                 /* We will try again qdisc_lookup_ops,
1254                                  * so don't keep a reference.
1255                                  */
1256                                 module_put(ops->owner);
1257                                 err = -EAGAIN;
1258                                 goto err_out;
1259                         }
1260                 }
1261         }
1262 #endif
1263
1264         err = -ENOENT;
1265         if (!ops) {
1266                 NL_SET_ERR_MSG(extack, "Specified qdisc kind is unknown");
1267                 goto err_out;
1268         }
1269
1270         sch = qdisc_alloc(dev_queue, ops, extack);
1271         if (IS_ERR(sch)) {
1272                 err = PTR_ERR(sch);
1273                 goto err_out2;
1274         }
1275
1276         sch->parent = parent;
1277
             /* TC_H_INGRESS is only accepted for qdiscs that carry
              * TCQ_F_INGRESS; any other handle of 0 is replaced by a
              * freshly allocated one.
              */
1278         if (handle == TC_H_INGRESS) {
1279                 if (!(sch->flags & TCQ_F_INGRESS)) {
1280                         NL_SET_ERR_MSG(extack,
1281                                        "Specified parent ID is reserved for ingress and clsact Qdiscs");
1282                         err = -EINVAL;
1283                         goto err_out3;
1284                 }
1285                 handle = TC_H_MAKE(TC_H_INGRESS, 0);
1286         } else {
1287                 if (handle == 0) {
1288                         handle = qdisc_alloc_handle(dev);
1289                         if (handle == 0) {
1290                                 NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
1291                                 err = -ENOSPC;
1292                                 goto err_out3;
1293                         }
1294                 }
1295                 if (!netif_is_multiqueue(dev))
1296                         sch->flags |= TCQ_F_ONETXQUEUE;
1297         }
1298
1299         sch->handle = handle;
1300
1301         /* This exist to keep backward compatible with a userspace
1302          * loophole, what allowed userspace to get IFF_NO_QUEUE
1303          * facility on older kernels by setting tx_queue_len=0 (prior
1304          * to qdisc init), and then forgot to reinit tx_queue_len
1305          * before again attaching a qdisc.
1306          */
1307         if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
1308                 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
1309                 netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
1310         }
1311
1312         err = qdisc_block_indexes_set(sch, tca, extack);
1313         if (err)
1314                 goto err_out3;
1315
1316         if (tca[TCA_STAB]) {
1317                 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1318                 if (IS_ERR(stab)) {
1319                         err = PTR_ERR(stab);
1320                         goto err_out3;
1321                 }
1322                 rcu_assign_pointer(sch->stab, stab);
1323         }
1324
             /* From here on failures unwind through err_out4, which also
              * calls ->destroy() and releases the size table.
              */
1325         if (ops->init) {
1326                 err = ops->init(sch, tca[TCA_OPTIONS], extack);
1327                 if (err != 0)
1328                         goto err_out4;
1329         }
1330
1331         if (tca[TCA_RATE]) {
1332                 err = -EOPNOTSUPP;
1333                 if (sch->flags & TCQ_F_MQROOT) {
1334                         NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
1335                         goto err_out4;
1336                 }
1337
1338                 err = gen_new_estimator(&sch->bstats,
1339                                         sch->cpu_bstats,
1340                                         &sch->rate_est,
1341                                         NULL,
1342                                         true,
1343                                         tca[TCA_RATE]);
1344                 if (err) {
1345                         NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
1346                         goto err_out4;
1347                 }
1348         }
1349
1350         qdisc_hash_add(sch, false);
1351         trace_qdisc_create(ops, dev, parent);
1352
1353         return sch;
1354
             /* Error unwinding, most specific label first:
              *   err_out4: ->destroy() the qdisc and put its size table
              *   err_out3: drop the tracked device reference, free the qdisc
              *   err_out2: drop the module reference on the ops
              *   err_out:  report the error code through *errp
              */
1355 err_out4:
1356         /* Even if ops->init() failed, we call ops->destroy()
1357          * like qdisc_create_dflt().
1358          */
1359         if (ops->destroy)
1360                 ops->destroy(sch);
1361         qdisc_put_stab(rtnl_dereference(sch->stab));
1362 err_out3:
1363         netdev_put(dev, &sch->dev_tracker);
1364         qdisc_free(sch);
1365 err_out2:
1366         module_put(ops->owner);
1367 err_out:
1368         *errp = err;
1369         return NULL;
1370 }
1371
1372 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
1373                         struct netlink_ext_ack *extack)
1374 {
1375         struct qdisc_size_table *ostab, *stab = NULL;
1376         int err = 0;
1377
1378         if (tca[TCA_OPTIONS]) {
1379                 if (!sch->ops->change) {
1380                         NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
1381                         return -EINVAL;
1382                 }
1383                 if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1384                         NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
1385                         return -EOPNOTSUPP;
1386                 }
1387                 err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
1388                 if (err)
1389                         return err;
1390         }
1391
1392         if (tca[TCA_STAB]) {
1393                 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1394                 if (IS_ERR(stab))
1395                         return PTR_ERR(stab);
1396         }
1397
1398         ostab = rtnl_dereference(sch->stab);
1399         rcu_assign_pointer(sch->stab, stab);
1400         qdisc_put_stab(ostab);
1401
1402         if (tca[TCA_RATE]) {
1403                 /* NB: ignores errors from replace_estimator
1404                    because change can't be undone. */
1405                 if (sch->flags & TCQ_F_MQROOT)
1406                         goto out;
1407                 gen_replace_estimator(&sch->bstats,
1408                                       sch->cpu_bstats,
1409                                       &sch->rate_est,
1410                                       NULL,
1411                                       true,
1412                                       tca[TCA_RATE]);
1413         }
1414 out:
1415         return 0;
1416 }
1417
/*
 * Walker state for the qdisc loop check.
 *
 * @w must stay the first member: check_loop_fn() receives a pointer to
 * it and casts that pointer back to struct check_loop_arg.
 */
1418 struct check_loop_arg {
             /* generic class walker; .fn is set to check_loop_fn */
1419         struct qdisc_walker     w;
             /* qdisc that must not be found among the visited leaves */
1420         struct Qdisc            *p;
             /* current recursion depth; check_loop_fn() gives up above 7 */
1421         int                     depth;
1422 };
1423
/* Forward declaration: check_loop() and check_loop_fn() call each other. */
1424 static int check_loop_fn(struct Qdisc *q, unsigned long cl,
1425                          struct qdisc_walker *w);
1426
1427 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1428 {
1429         struct check_loop_arg   arg;
1430
1431         if (q->ops->cl_ops == NULL)
1432                 return 0;
1433
1434         arg.w.stop = arg.w.skip = arg.w.count = 0;
1435         arg.w.fn = check_loop_fn;
1436         arg.depth = depth;
1437         arg.p = p;
1438         q->ops->cl_ops->walk(q, &arg.w);
1439         return arg.w.stop ? -ELOOP : 0;
1440 }
1441
1442 static int
1443 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1444 {
1445         struct Qdisc *leaf;
1446         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1447         struct check_loop_arg *arg = (struct check_loop_arg *)w;
1448
1449         leaf = cops->leaf(q, cl);
1450         if (leaf) {
1451                 if (leaf == arg->p || arg->depth > 7)
1452                         return -ELOOP;
1453                 return check_loop(leaf, arg->p, arg->depth + 1);
1454         }
1455         return 0;
1456 }
1457
1458 const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
1459         [TCA_KIND]              = { .type = NLA_STRING },
1460         [TCA_RATE]              = { .type = NLA_BINARY,
1461                                     .len = sizeof(struct tc_estimator) },
1462         [TCA_STAB]              = { .type = NLA_NESTED },
1463         [TCA_DUMP_INVISIBLE]    = { .type = NLA_FLAG },
1464         [TCA_CHAIN]             = { .type = NLA_U32 },
1465         [TCA_INGRESS_BLOCK]     = { .type = NLA_U32 },
1466         [TCA_EGRESS_BLOCK]      = { .type = NLA_U32 },
1467 };
1468
1469 /*
 * Delete/get qdisc.
 *
 * Handler for RTM_DELQDISC and for the non-delete request routed here
 * (presumably RTM_GETQDISC - confirm against the handler registration).
 *
 * The target qdisc is resolved from the tcmsg header:
 *  - tcm_parent != 0: TC_H_ROOT selects dev->qdisc; a parent whose
 *    major is the ingress major selects the ingress queue's qdisc (if
 *    the device has an ingress queue); anything else selects the leaf
 *    of class tcm_parent under the qdisc named by its major number.
 *    A non-zero tcm_handle must then match the qdisc found.
 *  - tcm_parent == 0: the qdisc is looked up by tcm_handle.
 *
 * A delete grafts NULL in place of the qdisc; any other request sends
 * a notification describing it.
 *
 * Returns 0 on success, a parse error, -ENODEV (unknown ifindex),
 * -ENOENT (qdisc not found, or delete of a handle-0 qdisc), -EINVAL
 * (handle or kind mismatch, delete without parent) or the error from
 * qdisc_graft().
 */
1472
1473 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1474                         struct netlink_ext_ack *extack)
1475 {
1476         struct net *net = sock_net(skb->sk);
1477         struct tcmsg *tcm = nlmsg_data(n);
1478         struct nlattr *tca[TCA_MAX + 1];
1479         struct net_device *dev;
1480         u32 clid;
1481         struct Qdisc *q = NULL;
1482         struct Qdisc *p = NULL;
1483         int err;
1484
1485         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1486                                      rtm_tca_policy, extack);
1487         if (err < 0)
1488                 return err;
1489
1490         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1491         if (!dev)
1492                 return -ENODEV;
1493
1494         clid = tcm->tcm_parent;
1495         if (clid) {
1496                 if (clid != TC_H_ROOT) {
1497                         if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
                                     /* p: qdisc owning the class, q: its leaf */
1498                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1499                                 if (!p) {
1500                                         NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
1501                                         return -ENOENT;
1502                                 }
1503                                 q = qdisc_leaf(p, clid);
1504                         } else if (dev_ingress_queue(dev)) {
1505                                 q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping);
1506                         }
1507                 } else {
1508                         q = rtnl_dereference(dev->qdisc);
1509                 }
1510                 if (!q) {
1511                         NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
1512                         return -ENOENT;
1513                 }
1514
                     /* A handle is optional here, but if given it must
                      * match the qdisc found through the parent.
                      */
1515                 if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
1516                         NL_SET_ERR_MSG(extack, "Invalid handle");
1517                         return -EINVAL;
1518                 }
1519         } else {
1520                 q = qdisc_lookup(dev, tcm->tcm_handle);
1521                 if (!q) {
1522                         NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
1523                         return -ENOENT;
1524                 }
1525         }
1526
             /* An optional TCA_KIND must name the kind of the qdisc found. */
1527         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1528                 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1529                 return -EINVAL;
1530         }
1531
1532         if (n->nlmsg_type == RTM_DELQDISC) {
1533                 if (!clid) {
1534                         NL_SET_ERR_MSG(extack, "Classid cannot be zero");
1535                         return -EINVAL;
1536                 }
1537                 if (q->handle == 0) {
1538                         NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
1539                         return -ENOENT;
1540                 }
                     /* Deleting means grafting "no qdisc" over q. */
1541                 err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
1542                 if (err != 0)
1543                         return err;
1544         } else {
                     /* The notification result is not checked and no
                      * extack is passed along.
                      */
1545                 qdisc_notify(net, skb, n, clid, NULL, q, NULL);
1546         }
1547         return 0;
1548 }
1549
/* True when the request carries both NLM_F_CREATE and NLM_F_REPLACE. */
static bool req_create_or_replace(struct nlmsghdr *n)
{
	const unsigned int wanted = NLM_F_CREATE | NLM_F_REPLACE;

	return (n->nlmsg_flags & wanted) == wanted;
}
1555
/* True when the request carries both NLM_F_CREATE and NLM_F_EXCL. */
static bool req_create_exclusive(struct nlmsghdr *n)
{
	const unsigned int wanted = NLM_F_CREATE | NLM_F_EXCL;

	return (n->nlmsg_flags & wanted) == wanted;
}
1561
/*
 * True when none of NLM_F_CREATE, NLM_F_REPLACE and NLM_F_EXCL is set
 * in the request flags.
 */
static bool req_change(struct nlmsghdr *n)
{
	const unsigned int modifiers = NLM_F_CREATE | NLM_F_REPLACE | NLM_F_EXCL;

	return (n->nlmsg_flags & modifiers) == 0;
}
1568
1569 /*
1570  * Create/change qdisc.
1571  */
1572 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1573                            struct netlink_ext_ack *extack)
1574 {
1575         struct net *net = sock_net(skb->sk);
1576         struct tcmsg *tcm;
1577         struct nlattr *tca[TCA_MAX + 1];
1578         struct net_device *dev;
1579         u32 clid;
1580         struct Qdisc *q, *p;
1581         int err;
1582
1583 replay:
1584         /* Reinit, just in case something touches this. */
1585         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1586                                      rtm_tca_policy, extack);
1587         if (err < 0)
1588                 return err;
1589
1590         tcm = nlmsg_data(n);
1591         clid = tcm->tcm_parent;
1592         q = p = NULL;
1593
1594         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1595         if (!dev)
1596                 return -ENODEV;
1597
1598
1599         if (clid) {
1600                 if (clid != TC_H_ROOT) {
1601                         if (clid != TC_H_INGRESS) {
1602                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1603                                 if (!p) {
1604                                         NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
1605                                         return -ENOENT;
1606                                 }
1607                                 q = qdisc_leaf(p, clid);
1608                         } else if (dev_ingress_queue_create(dev)) {
1609                                 q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping);
1610                         }
1611                 } else {
1612                         q = rtnl_dereference(dev->qdisc);
1613                 }
1614
1615                 /* It may be default qdisc, ignore it */
1616                 if (q && q->handle == 0)
1617                         q = NULL;
1618
1619                 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1620                         if (tcm->tcm_handle) {
1621                                 if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
1622                                         NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
1623                                         return -EEXIST;
1624                                 }
1625                                 if (TC_H_MIN(tcm->tcm_handle)) {
1626                                         NL_SET_ERR_MSG(extack, "Invalid minor handle");
1627                                         return -EINVAL;
1628                                 }
1629                                 q = qdisc_lookup(dev, tcm->tcm_handle);
1630                                 if (!q)
1631                                         goto create_n_graft;
1632                                 if (n->nlmsg_flags & NLM_F_EXCL) {
1633                                         NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
1634                                         return -EEXIST;
1635                                 }
1636                                 if (tca[TCA_KIND] &&
1637                                     nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1638                                         NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1639                                         return -EINVAL;
1640                                 }
1641                                 if (q->flags & TCQ_F_INGRESS) {
1642                                         NL_SET_ERR_MSG(extack,
1643                                                        "Cannot regraft ingress or clsact Qdiscs");
1644                                         return -EINVAL;
1645                                 }
1646                                 if (q == p ||
1647                                     (p && check_loop(q, p, 0))) {
1648                                         NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
1649                                         return -ELOOP;
1650                                 }
1651                                 if (clid == TC_H_INGRESS) {
1652                                         NL_SET_ERR_MSG(extack, "Ingress cannot graft directly");
1653                                         return -EINVAL;
1654                                 }
1655                                 qdisc_refcount_inc(q);
1656                                 goto graft;
1657                         } else {
1658                                 if (!q)
1659                                         goto create_n_graft;
1660
1661                                 /* This magic test requires explanation.
1662                                  *
1663                                  *   We know, that some child q is already
1664                                  *   attached to this parent and have choice:
1665                                  *   1) change it or 2) create/graft new one.
1666                                  *   If the requested qdisc kind is different
1667                                  *   than the existing one, then we choose graft.
1668                                  *   If they are the same then this is "change"
1669                                  *   operation - just let it fallthrough..
1670                                  *
1671                                  *   1. We are allowed to create/graft only
1672                                  *   if the request is explicitly stating
1673                                  *   "please create if it doesn't exist".
1674                                  *
1675                                  *   2. If the request is to exclusive create
1676                                  *   then the qdisc tcm_handle is not expected
1677                                  *   to exist, so that we choose create/graft too.
1678                                  *
1679                                  *   3. The last case is when no flags are set.
1680                                  *   This will happen when for example tc
1681                                  *   utility issues a "change" command.
1682                                  *   Alas, it is sort of hole in API, we
1683                                  *   cannot decide what to do unambiguously.
1684                                  *   For now we select create/graft.
1685                                  */
1686                                 if (tca[TCA_KIND] &&
1687                                     nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1688                                         if (req_create_or_replace(n) ||
1689                                             req_create_exclusive(n))
1690                                                 goto create_n_graft;
1691                                         else if (req_change(n))
1692                                                 goto create_n_graft2;
1693                                 }
1694                         }
1695                 }
1696         } else {
1697                 if (!tcm->tcm_handle) {
1698                         NL_SET_ERR_MSG(extack, "Handle cannot be zero");
1699                         return -EINVAL;
1700                 }
1701                 q = qdisc_lookup(dev, tcm->tcm_handle);
1702         }
1703
1704         /* Change qdisc parameters */
1705         if (!q) {
1706                 NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1707                 return -ENOENT;
1708         }
1709         if (n->nlmsg_flags & NLM_F_EXCL) {
1710                 NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
1711                 return -EEXIST;
1712         }
1713         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1714                 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1715                 return -EINVAL;
1716         }
1717         err = qdisc_change(q, tca, extack);
1718         if (err == 0)
1719                 qdisc_notify(net, skb, n, clid, NULL, q, extack);
1720         return err;
1721
1722 create_n_graft:
1723         if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1724                 NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
1725                 return -ENOENT;
1726         }
1727 create_n_graft2:
1728         if (clid == TC_H_INGRESS) {
1729                 if (dev_ingress_queue(dev)) {
1730                         q = qdisc_create(dev, dev_ingress_queue(dev),
1731                                          tcm->tcm_parent, tcm->tcm_parent,
1732                                          tca, &err, extack);
1733                 } else {
1734                         NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
1735                         err = -ENOENT;
1736                 }
1737         } else {
1738                 struct netdev_queue *dev_queue;
1739
1740                 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1741                         dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1742                 else if (p)
1743                         dev_queue = p->dev_queue;
1744                 else
1745                         dev_queue = netdev_get_tx_queue(dev, 0);
1746
1747                 q = qdisc_create(dev, dev_queue,
1748                                  tcm->tcm_parent, tcm->tcm_handle,
1749                                  tca, &err, extack);
1750         }
1751         if (q == NULL) {
1752                 if (err == -EAGAIN)
1753                         goto replay;
1754                 return err;
1755         }
1756
1757 graft:
1758         err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
1759         if (err) {
1760                 if (q)
1761                         qdisc_put(q);
1762                 return err;
1763         }
1764
1765         return 0;
1766 }
1767
1768 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1769                               struct netlink_callback *cb,
1770                               int *q_idx_p, int s_q_idx, bool recur,
1771                               bool dump_invisible)
1772 {
1773         int ret = 0, q_idx = *q_idx_p;
1774         struct Qdisc *q;
1775         int b;
1776
1777         if (!root)
1778                 return 0;
1779
1780         q = root;
1781         if (q_idx < s_q_idx) {
1782                 q_idx++;
1783         } else {
1784                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1785                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1786                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1787                                   RTM_NEWQDISC, NULL) <= 0)
1788                         goto done;
1789                 q_idx++;
1790         }
1791
1792         /* If dumping singletons, there is no qdisc_dev(root) and the singleton
1793          * itself has already been dumped.
1794          *
1795          * If we've already dumped the top-level (ingress) qdisc above and the global
1796          * qdisc hashtable, we don't want to hit it again
1797          */
1798         if (!qdisc_dev(root) || !recur)
1799                 goto out;
1800
1801         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1802                 if (q_idx < s_q_idx) {
1803                         q_idx++;
1804                         continue;
1805                 }
1806                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1807                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1808                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1809                                   RTM_NEWQDISC, NULL) <= 0)
1810                         goto done;
1811                 q_idx++;
1812         }
1813
1814 out:
1815         *q_idx_p = q_idx;
1816         return ret;
1817 done:
1818         ret = -1;
1819         goto out;
1820 }
1821
1822 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1823 {
1824         struct net *net = sock_net(skb->sk);
1825         int idx, q_idx;
1826         int s_idx, s_q_idx;
1827         struct net_device *dev;
1828         const struct nlmsghdr *nlh = cb->nlh;
1829         struct nlattr *tca[TCA_MAX + 1];
1830         int err;
1831
1832         s_idx = cb->args[0];
1833         s_q_idx = q_idx = cb->args[1];
1834
1835         idx = 0;
1836         ASSERT_RTNL();
1837
1838         err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
1839                                      rtm_tca_policy, cb->extack);
1840         if (err < 0)
1841                 return err;
1842
1843         for_each_netdev(net, dev) {
1844                 struct netdev_queue *dev_queue;
1845
1846                 if (idx < s_idx)
1847                         goto cont;
1848                 if (idx > s_idx)
1849                         s_q_idx = 0;
1850                 q_idx = 0;
1851
1852                 if (tc_dump_qdisc_root(rtnl_dereference(dev->qdisc),
1853                                        skb, cb, &q_idx, s_q_idx,
1854                                        true, tca[TCA_DUMP_INVISIBLE]) < 0)
1855                         goto done;
1856
1857                 dev_queue = dev_ingress_queue(dev);
1858                 if (dev_queue &&
1859                     tc_dump_qdisc_root(rtnl_dereference(dev_queue->qdisc_sleeping),
1860                                        skb, cb, &q_idx, s_q_idx, false,
1861                                        tca[TCA_DUMP_INVISIBLE]) < 0)
1862                         goto done;
1863
1864 cont:
1865                 idx++;
1866         }
1867
1868 done:
1869         cb->args[0] = idx;
1870         cb->args[1] = q_idx;
1871
1872         return skb->len;
1873 }
1874
1875
1876
1877 /************************************************
1878  *      Traffic classes manipulation.           *
1879  ************************************************/
1880
1881 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1882                           unsigned long cl, u32 portid, u32 seq, u16 flags,
1883                           int event, struct netlink_ext_ack *extack)
1884 {
1885         struct tcmsg *tcm;
1886         struct nlmsghdr  *nlh;
1887         unsigned char *b = skb_tail_pointer(skb);
1888         struct gnet_dump d;
1889         const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1890
1891         cond_resched();
1892         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1893         if (!nlh)
1894                 goto out_nlmsg_trim;
1895         tcm = nlmsg_data(nlh);
1896         tcm->tcm_family = AF_UNSPEC;
1897         tcm->tcm__pad1 = 0;
1898         tcm->tcm__pad2 = 0;
1899         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1900         tcm->tcm_parent = q->handle;
1901         tcm->tcm_handle = q->handle;
1902         tcm->tcm_info = 0;
1903         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1904                 goto nla_put_failure;
1905         if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1906                 goto nla_put_failure;
1907
1908         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1909                                          NULL, &d, TCA_PAD) < 0)
1910                 goto nla_put_failure;
1911
1912         if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1913                 goto nla_put_failure;
1914
1915         if (gnet_stats_finish_copy(&d) < 0)
1916                 goto nla_put_failure;
1917
1918         if (extack && extack->_msg &&
1919             nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg))
1920                 goto out_nlmsg_trim;
1921
1922         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1923
1924         return skb->len;
1925
1926 out_nlmsg_trim:
1927 nla_put_failure:
1928         nlmsg_trim(skb, b);
1929         return -1;
1930 }
1931
1932 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1933                          struct nlmsghdr *n, struct Qdisc *q,
1934                          unsigned long cl, int event, struct netlink_ext_ack *extack)
1935 {
1936         struct sk_buff *skb;
1937         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1938
1939         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1940         if (!skb)
1941                 return -ENOBUFS;
1942
1943         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event, extack) < 0) {
1944                 kfree_skb(skb);
1945                 return -EINVAL;
1946         }
1947
1948         return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1949                               n->nlmsg_flags & NLM_F_ECHO);
1950 }
1951
1952 static int tclass_del_notify(struct net *net,
1953                              const struct Qdisc_class_ops *cops,
1954                              struct sk_buff *oskb, struct nlmsghdr *n,
1955                              struct Qdisc *q, unsigned long cl,
1956                              struct netlink_ext_ack *extack)
1957 {
1958         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1959         struct sk_buff *skb;
1960         int err = 0;
1961
1962         if (!cops->delete)
1963                 return -EOPNOTSUPP;
1964
1965         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1966         if (!skb)
1967                 return -ENOBUFS;
1968
1969         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
1970                            RTM_DELTCLASS, extack) < 0) {
1971                 kfree_skb(skb);
1972                 return -EINVAL;
1973         }
1974
1975         err = cops->delete(q, cl, extack);
1976         if (err) {
1977                 kfree_skb(skb);
1978                 return err;
1979         }
1980
1981         err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1982                              n->nlmsg_flags & NLM_F_ECHO);
1983         return err;
1984 }
1985
1986 #ifdef CONFIG_NET_CLS
1987
/*
 * Walker state handed to tcf_node_bind() for each filter node.
 * The walker must stay the first member: tcf_node_bind() casts the
 * struct tcf_walker pointer it receives back to this structure.
 */
struct tcf_bind_args {
        struct tcf_walker w;    /* embedded walker; w.fn is the callback */
        unsigned long base;     /* class currently visited by the class walk */
        unsigned long cl;       /* class value passed to ->bind_class() */
        u32 classid;            /* class id passed to ->bind_class() */
};
1994
1995 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
1996 {
1997         struct tcf_bind_args *a = (void *)arg;
1998
1999         if (n && tp->ops->bind_class) {
2000                 struct Qdisc *q = tcf_block_q(tp->chain->block);
2001
2002                 sch_tree_lock(q);
2003                 tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
2004                 sch_tree_unlock(q);
2005         }
2006         return 0;
2007 }
2008
/*
 * Walker state for tc_bind_class_walker(), filled in by tc_bind_tclass().
 * The walker must stay the first member: the callback casts the
 * struct qdisc_walker pointer it receives back to this structure.
 */
struct tc_bind_class_args {
        struct qdisc_walker w;  /* embedded walker; w.fn is the callback */
        unsigned long new_cl;   /* class to bind matching filters to */
        u32 portid;             /* stored by tc_bind_tclass(); not read by the walker */
        u32 clid;               /* class id forwarded to the filter walk */
};
2015
2016 static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl,
2017                                 struct qdisc_walker *w)
2018 {
2019         struct tc_bind_class_args *a = (struct tc_bind_class_args *)w;
2020         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
2021         struct tcf_block *block;
2022         struct tcf_chain *chain;
2023
2024         block = cops->tcf_block(q, cl, NULL);
2025         if (!block)
2026                 return 0;
2027         for (chain = tcf_get_next_chain(block, NULL);
2028              chain;
2029              chain = tcf_get_next_chain(block, chain)) {
2030                 struct tcf_proto *tp;
2031
2032                 for (tp = tcf_get_next_proto(chain, NULL);
2033                      tp; tp = tcf_get_next_proto(chain, tp)) {
2034                         struct tcf_bind_args arg = {};
2035
2036                         arg.w.fn = tcf_node_bind;
2037                         arg.classid = a->clid;
2038                         arg.base = cl;
2039                         arg.cl = a->new_cl;
2040                         tp->ops->walk(tp, &arg.w, true);
2041                 }
2042         }
2043
2044         return 0;
2045 }
2046
2047 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
2048                            unsigned long new_cl)
2049 {
2050         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
2051         struct tc_bind_class_args args = {};
2052
2053         if (!cops->tcf_block)
2054                 return;
2055         args.portid = portid;
2056         args.clid = clid;
2057         args.new_cl = new_cl;
2058         args.w.fn = tc_bind_class_walker;
2059         q->ops->cl_ops->walk(q, &args.w);
2060 }
2061
2062 #else
2063
/* Without CONFIG_NET_CLS there are no filters to rebind: intentionally empty. */
static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
                           unsigned long new_cl)
{
}
2068
2069 #endif
2070
/*
 * rtnetlink handler for RTM_NEWTCLASS, RTM_DELTCLASS and RTM_GETTCLASS.
 *
 * Resolves the qdisc and class named by the request's tcmsg, then:
 *   - RTM_GETTCLASS: sends the class back as an RTM_NEWTCLASS message;
 *   - RTM_DELTCLASS: deletes the class and unbinds filters from it;
 *   - RTM_NEWTCLASS: creates or changes the class through the qdisc's
 *     ->change() callback and sends a notification.
 *
 * Note that the local "portid" holds the parent class id (tcm_parent),
 * not a netlink port id.
 *
 * Returns 0 on success or a negative error: the attribute parse error,
 * -ENODEV (no such device), -EINVAL (mismatching majors, classless qdisc,
 * unexpected message type), -ENOENT (qdisc or class not found), -EEXIST
 * (NLM_F_EXCL on an existing class), -EOPNOTSUPP (shared block attributes
 * or no ->change() callback), or whatever the delete/get/change helpers
 * return.
 */
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
                         struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct tcmsg *tcm = nlmsg_data(n);
        struct nlattr *tca[TCA_MAX + 1];
        struct net_device *dev;
        struct Qdisc *q = NULL;
        const struct Qdisc_class_ops *cops;
        unsigned long cl = 0;
        unsigned long new_cl;
        u32 portid;
        u32 clid;
        u32 qid;
        int err;

        err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
                                     rtm_tca_policy, extack);
        if (err < 0)
                return err;

        dev = __dev_get_by_index(net, tcm->tcm_ifindex);
        if (!dev)
                return -ENODEV;

        /*
         * parent == TC_H_UNSPEC - unspecified parent.
         * parent == TC_H_ROOT   - class is root, which has no parent.
         * parent == X:0         - parent is root class.
         * parent == X:Y         - parent is a node in hierarchy.
         * parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
         *
         * handle == 0:0         - generate handle from kernel pool.
         * handle == 0:Y         - class is X:Y, where X:0 is qdisc.
         * handle == X:Y         - clear.
         * handle == X:0         - root class.
         */

        /* Step 1. Determine qdisc handle X:0 */

        portid = tcm->tcm_parent;
        clid = tcm->tcm_handle;
        qid = TC_H_MAJ(clid);

        if (portid != TC_H_ROOT) {
                u32 qid1 = TC_H_MAJ(portid);

                if (qid && qid1) {
                        /* If both majors are known, they must be identical. */
                        if (qid != qid1)
                                return -EINVAL;
                } else if (qid1) {
                        qid = qid1;
                } else if (qid == 0)
                        /* Neither major given: fall back to the device's qdisc. */
                        qid = rtnl_dereference(dev->qdisc)->handle;

                /* Now qid is the genuine qdisc handle, consistent
                 * with both parent and child.
                 *
                 * TC_H_MAJ(portid) may still be unspecified; complete it now.
                 */
                if (portid)
                        portid = TC_H_MAKE(qid, portid);
        } else {
                if (qid == 0)
                        qid = rtnl_dereference(dev->qdisc)->handle;
        }

        /* OK. Locate qdisc */
        q = qdisc_lookup(dev, qid);
        if (!q)
                return -ENOENT;

        /* And check that it supports classes */
        cops = q->ops->cl_ops;
        if (cops == NULL)
                return -EINVAL;

        /* Now try to get class */
        if (clid == 0) {
                if (portid == TC_H_ROOT)
                        clid = qid;
        } else
                clid = TC_H_MAKE(qid, clid);

        if (clid)
                cl = cops->find(q, clid);

        if (cl == 0) {
                /* No such class: only RTM_NEWTCLASS with NLM_F_CREATE may go on. */
                err = -ENOENT;
                if (n->nlmsg_type != RTM_NEWTCLASS ||
                    !(n->nlmsg_flags & NLM_F_CREATE))
                        goto out;
        } else {
                switch (n->nlmsg_type) {
                case RTM_NEWTCLASS:
                        err = -EEXIST;
                        if (n->nlmsg_flags & NLM_F_EXCL)
                                goto out;
                        break;
                case RTM_DELTCLASS:
                        err = tclass_del_notify(net, cops, skb, n, q, cl, extack);
                        /* Unbind the filters from the class by binding them to 0.
                         * NOTE(review): this runs whether or not the delete
                         * above succeeded - confirm that is intended.
                         */
                        tc_bind_tclass(q, portid, clid, 0);
                        goto out;
                case RTM_GETTCLASS:
                        err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS, extack);
                        goto out;
                default:
                        err = -EINVAL;
                        goto out;
                }
        }

        if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
                NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
                return -EOPNOTSUPP;
        }

        /* Create (cl == 0) or change (cl != 0) the class. */
        new_cl = cl;
        err = -EOPNOTSUPP;
        if (cops->change)
                err = cops->change(q, clid, portid, tca, &new_cl, extack);
        if (err == 0) {
                /* The notification result is deliberately not propagated. */
                tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS, extack);
                /* We just created a new class, need to do reverse binding. */
                if (cl != new_cl)
                        tc_bind_tclass(q, portid, clid, new_cl);
        }
out:
        return err;
}
2203
/*
 * Walker state for qdisc_class_dump(). The walker must stay the first
 * member: the callback casts the struct qdisc_walker pointer it receives
 * back to this structure.
 */
struct qdisc_dump_args {
        struct qdisc_walker     w;      /* embedded walker (fn/stop/skip/count) */
        struct sk_buff          *skb;   /* buffer the class messages are written to */
        struct netlink_callback *cb;    /* dump context: request skb and header */
};
2209
2210 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
2211                             struct qdisc_walker *arg)
2212 {
2213         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
2214
2215         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2216                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
2217                               RTM_NEWTCLASS, NULL);
2218 }
2219
2220 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
2221                                 struct tcmsg *tcm, struct netlink_callback *cb,
2222                                 int *t_p, int s_t)
2223 {
2224         struct qdisc_dump_args arg;
2225
2226         if (tc_qdisc_dump_ignore(q, false) ||
2227             *t_p < s_t || !q->ops->cl_ops ||
2228             (tcm->tcm_parent &&
2229              TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
2230                 (*t_p)++;
2231                 return 0;
2232         }
2233         if (*t_p > s_t)
2234                 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
2235         arg.w.fn = qdisc_class_dump;
2236         arg.skb = skb;
2237         arg.cb = cb;
2238         arg.w.stop  = 0;
2239         arg.w.skip = cb->args[1];
2240         arg.w.count = 0;
2241         q->ops->cl_ops->walk(q, &arg.w);
2242         cb->args[1] = arg.w.count;
2243         if (arg.w.stop)
2244                 return -1;
2245         (*t_p)++;
2246         return 0;
2247 }
2248
2249 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
2250                                struct tcmsg *tcm, struct netlink_callback *cb,
2251                                int *t_p, int s_t, bool recur)
2252 {
2253         struct Qdisc *q;
2254         int b;
2255
2256         if (!root)
2257                 return 0;
2258
2259         if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
2260                 return -1;
2261
2262         if (!qdisc_dev(root) || !recur)
2263                 return 0;
2264
2265         if (tcm->tcm_parent) {
2266                 q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
2267                 if (q && q != root &&
2268                     tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2269                         return -1;
2270                 return 0;
2271         }
2272         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
2273                 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2274                         return -1;
2275         }
2276
2277         return 0;
2278 }
2279
2280 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
2281 {
2282         struct tcmsg *tcm = nlmsg_data(cb->nlh);
2283         struct net *net = sock_net(skb->sk);
2284         struct netdev_queue *dev_queue;
2285         struct net_device *dev;
2286         int t, s_t;
2287
2288         if (nlmsg_len(cb->nlh) < sizeof(*tcm))
2289                 return 0;
2290         dev = dev_get_by_index(net, tcm->tcm_ifindex);
2291         if (!dev)
2292                 return 0;
2293
2294         s_t = cb->args[0];
2295         t = 0;
2296
2297         if (tc_dump_tclass_root(rtnl_dereference(dev->qdisc),
2298                                 skb, tcm, cb, &t, s_t, true) < 0)
2299                 goto done;
2300
2301         dev_queue = dev_ingress_queue(dev);
2302         if (dev_queue &&
2303             tc_dump_tclass_root(rtnl_dereference(dev_queue->qdisc_sleeping),
2304                                 skb, tcm, cb, &t, s_t, false) < 0)
2305                 goto done;
2306
2307 done:
2308         cb->args[0] = t;
2309
2310         dev_put(dev);
2311         return skb->len;
2312 }
2313
2314 #ifdef CONFIG_PROC_FS
2315 static int psched_show(struct seq_file *seq, void *v)
2316 {
2317         seq_printf(seq, "%08x %08x %08x %08x\n",
2318                    (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2319                    1000000,
2320                    (u32)NSEC_PER_SEC / hrtimer_resolution);
2321
2322         return 0;
2323 }
2324
2325 static int __net_init psched_net_init(struct net *net)
2326 {
2327         struct proc_dir_entry *e;
2328
2329         e = proc_create_single("psched", 0, net->proc_net, psched_show);
2330         if (e == NULL)
2331                 return -ENOMEM;
2332
2333         return 0;
2334 }
2335
/* Per-namespace exit: remove the "psched" entry created by psched_net_init(). */
static void __net_exit psched_net_exit(struct net *net)
{
        remove_proc_entry("psched", net->proc_net);
}
2340 #else
/* Without CONFIG_PROC_FS there is no proc entry to create: always succeeds. */
static int __net_init psched_net_init(struct net *net)
{
        return 0;
}
2345
/* Without CONFIG_PROC_FS there is no proc entry to remove: intentionally empty. */
static void __net_exit psched_net_exit(struct net *net)
{
}
2349 #endif
2350
/* Per-network-namespace hooks; registered from pktsched_init(). */
static struct pernet_operations psched_net_ops = {
        .init = psched_net_init,
        .exit = psched_net_exit,
};
2355
/* Static key, initially false, defined only for retpoline builds.
 * NOTE(review): presumably consumed by the helpers in <net/tc_wrapper.h>
 * and set up by tc_wrapper_init() - confirm against that header.
 */
#if IS_ENABLED(CONFIG_RETPOLINE)
DEFINE_STATIC_KEY_FALSE(tc_skip_wrapper);
#endif
2359
/*
 * Subsystem init for the packet scheduler API.
 *
 * Registers the per-namespace operations, the built-in qdisc ops, and the
 * rtnetlink handlers for the qdisc and traffic-class message types, then
 * calls tc_wrapper_init().
 *
 * Returns the error from register_pernet_subsys() if that fails,
 * otherwise 0.
 */
static int __init pktsched_init(void)
{
        int err;

        err = register_pernet_subsys(&psched_net_ops);
        if (err) {
                pr_err("pktsched_init: "
                       "cannot initialize per netns operations\n");
                return err;
        }

        /* Built-in qdiscs. NOTE(review): the return values of
         * register_qdisc() are not checked here.
         */
        register_qdisc(&pfifo_fast_ops);
        register_qdisc(&pfifo_qdisc_ops);
        register_qdisc(&bfifo_qdisc_ops);
        register_qdisc(&pfifo_head_drop_qdisc_ops);
        register_qdisc(&mq_qdisc_ops);
        register_qdisc(&noqueue_qdisc_ops);

        /* Qdisc messages: new goes to tc_modify_qdisc; delete and get share
         * tc_get_qdisc; only get has a dump handler.
         */
        rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
        rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
        rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
                      0);
        /* Class messages: all three go through tc_ctl_tclass; only get has
         * a dump handler.
         */
        rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
        rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
        rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
                      0);

        tc_wrapper_init();

        return 0;
}

/* Run pktsched_init() at subsys initcall level. */
subsys_initcall(pktsched_init);