net/sched: qdisc_destroy() old ingress and clsact Qdiscs before grafting
[platform/kernel/linux-starfive.git] / net / sched / sch_api.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_api.c  Packet scheduler API.
4  *
5  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
6  *
7  * Fixes:
8  *
9  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
10  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
11  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
12  */
13
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/string.h>
18 #include <linux/errno.h>
19 #include <linux/skbuff.h>
20 #include <linux/init.h>
21 #include <linux/proc_fs.h>
22 #include <linux/seq_file.h>
23 #include <linux/kmod.h>
24 #include <linux/list.h>
25 #include <linux/hrtimer.h>
26 #include <linux/slab.h>
27 #include <linux/hashtable.h>
28
29 #include <net/net_namespace.h>
30 #include <net/sock.h>
31 #include <net/netlink.h>
32 #include <net/pkt_sched.h>
33 #include <net/pkt_cls.h>
34 #include <net/tc_wrapper.h>
35
36 #include <trace/events/qdisc.h>
37
38 /*
39
40    Short review.
41    -------------
42
43    This file consists of two interrelated parts:
44
45    1. queueing disciplines manager frontend.
46    2. traffic classes manager frontend.
47
48    Generally, queueing discipline ("qdisc") is a black box,
49    which is able to enqueue packets and to dequeue them (when
50    device is ready to send something) in order and at times
51    determined by algorithm hidden in it.
52
53    qdisc's are divided to two categories:
54    - "queues", which have no internal structure visible from outside.
55    - "schedulers", which split all the packets to "traffic classes",
56      using "packet classifiers" (look at cls_api.c)
57
58    In turn, classes may have child qdiscs (as rule, queues)
59    attached to them etc. etc. etc.
60
61    The goal of the routines in this file is to translate
62    information supplied by user in the form of handles
63    to more intelligible for kernel form, to make some sanity
64    checks and part of work, which is common to all qdiscs
65    and to provide rtnetlink notifications.
66
67    All real intelligent work is done inside qdisc modules.
68
69
70
71    Every discipline has two major routines: enqueue and dequeue.
72
73    ---dequeue
74
75    dequeue usually returns a skb to send. It is allowed to return NULL,
76    but it does not mean that queue is empty, it just means that
77    discipline does not want to send anything this time.
78    Queue is really empty if q->q.qlen == 0.
79    For complicated disciplines with multiple queues q->q is not
80    real packet queue, but however q->q.qlen must be valid.
81
82    ---enqueue
83
84    enqueue returns 0, if packet was enqueued successfully.
85    If packet (this one or another one) was dropped, it returns
86    not zero error code.
87    NET_XMIT_DROP        - this packet dropped
88      Expected action: do not backoff, but wait until queue will clear.
89    NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
90      Expected action: backoff or ignore
91
92    Auxiliary routines:
93
94    ---peek
95
96    like dequeue but without removing a packet from the queue
97
98    ---reset
99
100    returns qdisc to initial state: purge all buffers, clear all
101    timers, counters (except for statistics) etc.
102
103    ---init
104
105    initializes newly created qdisc.
106
107    ---destroy
108
109    destroys resources allocated by init and during lifetime of qdisc.
110
111    ---change
112
113    changes qdisc parameters.
114  */
115
116 /* Protects list of registered TC modules. It is pure SMP lock. */
117 static DEFINE_RWLOCK(qdisc_mod_lock);
118
119
120 /************************************************
121  *      Queueing disciplines manipulation.      *
122  ************************************************/
123
124
125 /* The list of all installed queueing disciplines. */
126
127 static struct Qdisc_ops *qdisc_base;
128
129 /* Register/unregister queueing discipline */
130
131 int register_qdisc(struct Qdisc_ops *qops)
132 {
133         struct Qdisc_ops *q, **qp;
134         int rc = -EEXIST;
135
136         write_lock(&qdisc_mod_lock);
137         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
138                 if (!strcmp(qops->id, q->id))
139                         goto out;
140
141         if (qops->enqueue == NULL)
142                 qops->enqueue = noop_qdisc_ops.enqueue;
143         if (qops->peek == NULL) {
144                 if (qops->dequeue == NULL)
145                         qops->peek = noop_qdisc_ops.peek;
146                 else
147                         goto out_einval;
148         }
149         if (qops->dequeue == NULL)
150                 qops->dequeue = noop_qdisc_ops.dequeue;
151
152         if (qops->cl_ops) {
153                 const struct Qdisc_class_ops *cops = qops->cl_ops;
154
155                 if (!(cops->find && cops->walk && cops->leaf))
156                         goto out_einval;
157
158                 if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
159                         goto out_einval;
160         }
161
162         qops->next = NULL;
163         *qp = qops;
164         rc = 0;
165 out:
166         write_unlock(&qdisc_mod_lock);
167         return rc;
168
169 out_einval:
170         rc = -EINVAL;
171         goto out;
172 }
173 EXPORT_SYMBOL(register_qdisc);
174
175 void unregister_qdisc(struct Qdisc_ops *qops)
176 {
177         struct Qdisc_ops *q, **qp;
178         int err = -ENOENT;
179
180         write_lock(&qdisc_mod_lock);
181         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
182                 if (q == qops)
183                         break;
184         if (q) {
185                 *qp = q->next;
186                 q->next = NULL;
187                 err = 0;
188         }
189         write_unlock(&qdisc_mod_lock);
190
191         WARN(err, "unregister qdisc(%s) failed\n", qops->id);
192 }
193 EXPORT_SYMBOL(unregister_qdisc);
194
195 /* Get default qdisc if not otherwise specified */
196 void qdisc_get_default(char *name, size_t len)
197 {
198         read_lock(&qdisc_mod_lock);
199         strscpy(name, default_qdisc_ops->id, len);
200         read_unlock(&qdisc_mod_lock);
201 }
202
203 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
204 {
205         struct Qdisc_ops *q = NULL;
206
207         for (q = qdisc_base; q; q = q->next) {
208                 if (!strcmp(name, q->id)) {
209                         if (!try_module_get(q->owner))
210                                 q = NULL;
211                         break;
212                 }
213         }
214
215         return q;
216 }
217
218 /* Set new default qdisc to use */
219 int qdisc_set_default(const char *name)
220 {
221         const struct Qdisc_ops *ops;
222
223         if (!capable(CAP_NET_ADMIN))
224                 return -EPERM;
225
226         write_lock(&qdisc_mod_lock);
227         ops = qdisc_lookup_default(name);
228         if (!ops) {
229                 /* Not found, drop lock and try to load module */
230                 write_unlock(&qdisc_mod_lock);
231                 request_module("sch_%s", name);
232                 write_lock(&qdisc_mod_lock);
233
234                 ops = qdisc_lookup_default(name);
235         }
236
237         if (ops) {
238                 /* Set new default */
239                 module_put(default_qdisc_ops->owner);
240                 default_qdisc_ops = ops;
241         }
242         write_unlock(&qdisc_mod_lock);
243
244         return ops ? 0 : -ENOENT;
245 }
246
247 #ifdef CONFIG_NET_SCH_DEFAULT
248 /* Set default value from kernel config */
249 static int __init sch_default_qdisc(void)
250 {
251         return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
252 }
253 late_initcall(sch_default_qdisc);
254 #endif
255
256 /* We know handle. Find qdisc among all qdisc's attached to device
257  * (root qdisc, all its children, children of children etc.)
258  * Note: caller either uses rtnl or rcu_read_lock()
259  */
260
261 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
262 {
263         struct Qdisc *q;
264
265         if (!qdisc_dev(root))
266                 return (root->handle == handle ? root : NULL);
267
268         if (!(root->flags & TCQ_F_BUILTIN) &&
269             root->handle == handle)
270                 return root;
271
272         hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle,
273                                    lockdep_rtnl_is_held()) {
274                 if (q->handle == handle)
275                         return q;
276         }
277         return NULL;
278 }
279
280 void qdisc_hash_add(struct Qdisc *q, bool invisible)
281 {
282         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
283                 ASSERT_RTNL();
284                 hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
285                 if (invisible)
286                         q->flags |= TCQ_F_INVISIBLE;
287         }
288 }
289 EXPORT_SYMBOL(qdisc_hash_add);
290
291 void qdisc_hash_del(struct Qdisc *q)
292 {
293         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
294                 ASSERT_RTNL();
295                 hash_del_rcu(&q->hash);
296         }
297 }
298 EXPORT_SYMBOL(qdisc_hash_del);
299
300 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
301 {
302         struct Qdisc *q;
303
304         if (!handle)
305                 return NULL;
306         q = qdisc_match_from_root(rtnl_dereference(dev->qdisc), handle);
307         if (q)
308                 goto out;
309
310         if (dev_ingress_queue(dev))
311                 q = qdisc_match_from_root(
312                         rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping),
313                         handle);
314 out:
315         return q;
316 }
317
318 struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
319 {
320         struct netdev_queue *nq;
321         struct Qdisc *q;
322
323         if (!handle)
324                 return NULL;
325         q = qdisc_match_from_root(rcu_dereference(dev->qdisc), handle);
326         if (q)
327                 goto out;
328
329         nq = dev_ingress_queue_rcu(dev);
330         if (nq)
331                 q = qdisc_match_from_root(rcu_dereference(nq->qdisc_sleeping),
332                                           handle);
333 out:
334         return q;
335 }
336
337 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
338 {
339         unsigned long cl;
340         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
341
342         if (cops == NULL)
343                 return NULL;
344         cl = cops->find(p, classid);
345
346         if (cl == 0)
347                 return NULL;
348         return cops->leaf(p, cl);
349 }
350
351 /* Find queueing discipline by name */
352
353 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
354 {
355         struct Qdisc_ops *q = NULL;
356
357         if (kind) {
358                 read_lock(&qdisc_mod_lock);
359                 for (q = qdisc_base; q; q = q->next) {
360                         if (nla_strcmp(kind, q->id) == 0) {
361                                 if (!try_module_get(q->owner))
362                                         q = NULL;
363                                 break;
364                         }
365                 }
366                 read_unlock(&qdisc_mod_lock);
367         }
368         return q;
369 }
370
371 /* The linklayer setting were not transferred from iproute2, in older
372  * versions, and the rate tables lookup systems have been dropped in
373  * the kernel. To keep backward compatible with older iproute2 tc
374  * utils, we detect the linklayer setting by detecting if the rate
375  * table were modified.
376  *
377  * For linklayer ATM table entries, the rate table will be aligned to
378  * 48 bytes, thus some table entries will contain the same value.  The
379  * mpu (min packet unit) is also encoded into the old rate table, thus
380  * starting from the mpu, we find low and high table entries for
381  * mapping this cell.  If these entries contain the same value, when
382  * the rate tables have been modified for linklayer ATM.
383  *
384  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
385  * and then roundup to the next cell, calc the table entry one below,
386  * and compare.
387  */
388 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
389 {
390         int low       = roundup(r->mpu, 48);
391         int high      = roundup(low+1, 48);
392         int cell_low  = low >> r->cell_log;
393         int cell_high = (high >> r->cell_log) - 1;
394
395         /* rtab is too inaccurate at rates > 100Mbit/s */
396         if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
397                 pr_debug("TC linklayer: Giving up ATM detection\n");
398                 return TC_LINKLAYER_ETHERNET;
399         }
400
401         if ((cell_high > cell_low) && (cell_high < 256)
402             && (rtab[cell_low] == rtab[cell_high])) {
403                 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
404                          cell_low, cell_high, rtab[cell_high]);
405                 return TC_LINKLAYER_ATM;
406         }
407         return TC_LINKLAYER_ETHERNET;
408 }
409
410 static struct qdisc_rate_table *qdisc_rtab_list;
411
412 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
413                                         struct nlattr *tab,
414                                         struct netlink_ext_ack *extack)
415 {
416         struct qdisc_rate_table *rtab;
417
418         if (tab == NULL || r->rate == 0 ||
419             r->cell_log == 0 || r->cell_log >= 32 ||
420             nla_len(tab) != TC_RTAB_SIZE) {
421                 NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
422                 return NULL;
423         }
424
425         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
426                 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
427                     !memcmp(&rtab->data, nla_data(tab), 1024)) {
428                         rtab->refcnt++;
429                         return rtab;
430                 }
431         }
432
433         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
434         if (rtab) {
435                 rtab->rate = *r;
436                 rtab->refcnt = 1;
437                 memcpy(rtab->data, nla_data(tab), 1024);
438                 if (r->linklayer == TC_LINKLAYER_UNAWARE)
439                         r->linklayer = __detect_linklayer(r, rtab->data);
440                 rtab->next = qdisc_rtab_list;
441                 qdisc_rtab_list = rtab;
442         } else {
443                 NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
444         }
445         return rtab;
446 }
447 EXPORT_SYMBOL(qdisc_get_rtab);
448
449 void qdisc_put_rtab(struct qdisc_rate_table *tab)
450 {
451         struct qdisc_rate_table *rtab, **rtabp;
452
453         if (!tab || --tab->refcnt)
454                 return;
455
456         for (rtabp = &qdisc_rtab_list;
457              (rtab = *rtabp) != NULL;
458              rtabp = &rtab->next) {
459                 if (rtab == tab) {
460                         *rtabp = rtab->next;
461                         kfree(rtab);
462                         return;
463                 }
464         }
465 }
466 EXPORT_SYMBOL(qdisc_put_rtab);
467
468 static LIST_HEAD(qdisc_stab_list);
469
470 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
471         [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
472         [TCA_STAB_DATA] = { .type = NLA_BINARY },
473 };
474
475 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
476                                                struct netlink_ext_ack *extack)
477 {
478         struct nlattr *tb[TCA_STAB_MAX + 1];
479         struct qdisc_size_table *stab;
480         struct tc_sizespec *s;
481         unsigned int tsize = 0;
482         u16 *tab = NULL;
483         int err;
484
485         err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
486                                           extack);
487         if (err < 0)
488                 return ERR_PTR(err);
489         if (!tb[TCA_STAB_BASE]) {
490                 NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
491                 return ERR_PTR(-EINVAL);
492         }
493
494         s = nla_data(tb[TCA_STAB_BASE]);
495
496         if (s->tsize > 0) {
497                 if (!tb[TCA_STAB_DATA]) {
498                         NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
499                         return ERR_PTR(-EINVAL);
500                 }
501                 tab = nla_data(tb[TCA_STAB_DATA]);
502                 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
503         }
504
505         if (tsize != s->tsize || (!tab && tsize > 0)) {
506                 NL_SET_ERR_MSG(extack, "Invalid size of size table");
507                 return ERR_PTR(-EINVAL);
508         }
509
510         list_for_each_entry(stab, &qdisc_stab_list, list) {
511                 if (memcmp(&stab->szopts, s, sizeof(*s)))
512                         continue;
513                 if (tsize > 0 &&
514                     memcmp(stab->data, tab, flex_array_size(stab, data, tsize)))
515                         continue;
516                 stab->refcnt++;
517                 return stab;
518         }
519
520         if (s->size_log > STAB_SIZE_LOG_MAX ||
521             s->cell_log > STAB_SIZE_LOG_MAX) {
522                 NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table");
523                 return ERR_PTR(-EINVAL);
524         }
525
526         stab = kmalloc(struct_size(stab, data, tsize), GFP_KERNEL);
527         if (!stab)
528                 return ERR_PTR(-ENOMEM);
529
530         stab->refcnt = 1;
531         stab->szopts = *s;
532         if (tsize > 0)
533                 memcpy(stab->data, tab, flex_array_size(stab, data, tsize));
534
535         list_add_tail(&stab->list, &qdisc_stab_list);
536
537         return stab;
538 }
539
540 void qdisc_put_stab(struct qdisc_size_table *tab)
541 {
542         if (!tab)
543                 return;
544
545         if (--tab->refcnt == 0) {
546                 list_del(&tab->list);
547                 kfree_rcu(tab, rcu);
548         }
549 }
550 EXPORT_SYMBOL(qdisc_put_stab);
551
552 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
553 {
554         struct nlattr *nest;
555
556         nest = nla_nest_start_noflag(skb, TCA_STAB);
557         if (nest == NULL)
558                 goto nla_put_failure;
559         if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
560                 goto nla_put_failure;
561         nla_nest_end(skb, nest);
562
563         return skb->len;
564
565 nla_put_failure:
566         return -1;
567 }
568
569 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
570                                const struct qdisc_size_table *stab)
571 {
572         int pkt_len, slot;
573
574         pkt_len = skb->len + stab->szopts.overhead;
575         if (unlikely(!stab->szopts.tsize))
576                 goto out;
577
578         slot = pkt_len + stab->szopts.cell_align;
579         if (unlikely(slot < 0))
580                 slot = 0;
581
582         slot >>= stab->szopts.cell_log;
583         if (likely(slot < stab->szopts.tsize))
584                 pkt_len = stab->data[slot];
585         else
586                 pkt_len = stab->data[stab->szopts.tsize - 1] *
587                                 (slot / stab->szopts.tsize) +
588                                 stab->data[slot % stab->szopts.tsize];
589
590         pkt_len <<= stab->szopts.size_log;
591 out:
592         if (unlikely(pkt_len < 1))
593                 pkt_len = 1;
594         qdisc_skb_cb(skb)->pkt_len = pkt_len;
595 }
596 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
597
598 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
599 {
600         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
601                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
602                         txt, qdisc->ops->id, qdisc->handle >> 16);
603                 qdisc->flags |= TCQ_F_WARN_NONWC;
604         }
605 }
606 EXPORT_SYMBOL(qdisc_warn_nonwc);
607
608 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
609 {
610         struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
611                                                  timer);
612
613         rcu_read_lock();
614         __netif_schedule(qdisc_root(wd->qdisc));
615         rcu_read_unlock();
616
617         return HRTIMER_NORESTART;
618 }
619
620 void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
621                                  clockid_t clockid)
622 {
623         hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
624         wd->timer.function = qdisc_watchdog;
625         wd->qdisc = qdisc;
626 }
627 EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
628
629 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
630 {
631         qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
632 }
633 EXPORT_SYMBOL(qdisc_watchdog_init);
634
635 void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires,
636                                       u64 delta_ns)
637 {
638         bool deactivated;
639
640         rcu_read_lock();
641         deactivated = test_bit(__QDISC_STATE_DEACTIVATED,
642                                &qdisc_root_sleeping(wd->qdisc)->state);
643         rcu_read_unlock();
644         if (deactivated)
645                 return;
646
647         if (hrtimer_is_queued(&wd->timer)) {
648                 u64 softexpires;
649
650                 softexpires = ktime_to_ns(hrtimer_get_softexpires(&wd->timer));
651                 /* If timer is already set in [expires, expires + delta_ns],
652                  * do not reprogram it.
653                  */
654                 if (softexpires - expires <= delta_ns)
655                         return;
656         }
657
658         hrtimer_start_range_ns(&wd->timer,
659                                ns_to_ktime(expires),
660                                delta_ns,
661                                HRTIMER_MODE_ABS_PINNED);
662 }
663 EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns);
664
665 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
666 {
667         hrtimer_cancel(&wd->timer);
668 }
669 EXPORT_SYMBOL(qdisc_watchdog_cancel);
670
671 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
672 {
673         struct hlist_head *h;
674         unsigned int i;
675
676         h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
677
678         if (h != NULL) {
679                 for (i = 0; i < n; i++)
680                         INIT_HLIST_HEAD(&h[i]);
681         }
682         return h;
683 }
684
685 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
686 {
687         struct Qdisc_class_common *cl;
688         struct hlist_node *next;
689         struct hlist_head *nhash, *ohash;
690         unsigned int nsize, nmask, osize;
691         unsigned int i, h;
692
693         /* Rehash when load factor exceeds 0.75 */
694         if (clhash->hashelems * 4 <= clhash->hashsize * 3)
695                 return;
696         nsize = clhash->hashsize * 2;
697         nmask = nsize - 1;
698         nhash = qdisc_class_hash_alloc(nsize);
699         if (nhash == NULL)
700                 return;
701
702         ohash = clhash->hash;
703         osize = clhash->hashsize;
704
705         sch_tree_lock(sch);
706         for (i = 0; i < osize; i++) {
707                 hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
708                         h = qdisc_class_hash(cl->classid, nmask);
709                         hlist_add_head(&cl->hnode, &nhash[h]);
710                 }
711         }
712         clhash->hash     = nhash;
713         clhash->hashsize = nsize;
714         clhash->hashmask = nmask;
715         sch_tree_unlock(sch);
716
717         kvfree(ohash);
718 }
719 EXPORT_SYMBOL(qdisc_class_hash_grow);
720
721 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
722 {
723         unsigned int size = 4;
724
725         clhash->hash = qdisc_class_hash_alloc(size);
726         if (!clhash->hash)
727                 return -ENOMEM;
728         clhash->hashsize  = size;
729         clhash->hashmask  = size - 1;
730         clhash->hashelems = 0;
731         return 0;
732 }
733 EXPORT_SYMBOL(qdisc_class_hash_init);
734
735 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
736 {
737         kvfree(clhash->hash);
738 }
739 EXPORT_SYMBOL(qdisc_class_hash_destroy);
740
741 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
742                              struct Qdisc_class_common *cl)
743 {
744         unsigned int h;
745
746         INIT_HLIST_NODE(&cl->hnode);
747         h = qdisc_class_hash(cl->classid, clhash->hashmask);
748         hlist_add_head(&cl->hnode, &clhash->hash[h]);
749         clhash->hashelems++;
750 }
751 EXPORT_SYMBOL(qdisc_class_hash_insert);
752
753 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
754                              struct Qdisc_class_common *cl)
755 {
756         hlist_del(&cl->hnode);
757         clhash->hashelems--;
758 }
759 EXPORT_SYMBOL(qdisc_class_hash_remove);
760
761 /* Allocate an unique handle from space managed by kernel
762  * Possible range is [8000-FFFF]:0000 (0x8000 values)
763  */
764 static u32 qdisc_alloc_handle(struct net_device *dev)
765 {
766         int i = 0x8000;
767         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
768
769         do {
770                 autohandle += TC_H_MAKE(0x10000U, 0);
771                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
772                         autohandle = TC_H_MAKE(0x80000000U, 0);
773                 if (!qdisc_lookup(dev, autohandle))
774                         return autohandle;
775                 cond_resched();
776         } while (--i > 0);
777
778         return 0;
779 }
780
781 void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
782 {
783         bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
784         const struct Qdisc_class_ops *cops;
785         unsigned long cl;
786         u32 parentid;
787         bool notify;
788         int drops;
789
790         if (n == 0 && len == 0)
791                 return;
792         drops = max_t(int, n, 0);
793         rcu_read_lock();
794         while ((parentid = sch->parent)) {
795                 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
796                         break;
797
798                 if (sch->flags & TCQ_F_NOPARENT)
799                         break;
800                 /* Notify parent qdisc only if child qdisc becomes empty.
801                  *
802                  * If child was empty even before update then backlog
803                  * counter is screwed and we skip notification because
804                  * parent class is already passive.
805                  *
806                  * If the original child was offloaded then it is allowed
807                  * to be seem as empty, so the parent is notified anyway.
808                  */
809                 notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
810                                                        !qdisc_is_offloaded);
811                 /* TODO: perform the search on a per txq basis */
812                 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
813                 if (sch == NULL) {
814                         WARN_ON_ONCE(parentid != TC_H_ROOT);
815                         break;
816                 }
817                 cops = sch->ops->cl_ops;
818                 if (notify && cops->qlen_notify) {
819                         cl = cops->find(sch, parentid);
820                         cops->qlen_notify(sch, cl);
821                 }
822                 sch->q.qlen -= n;
823                 sch->qstats.backlog -= len;
824                 __qdisc_qstats_drop(sch, drops);
825         }
826         rcu_read_unlock();
827 }
828 EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
829
830 int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
831                               void *type_data)
832 {
833         struct net_device *dev = qdisc_dev(sch);
834         int err;
835
836         sch->flags &= ~TCQ_F_OFFLOADED;
837         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
838                 return 0;
839
840         err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
841         if (err == -EOPNOTSUPP)
842                 return 0;
843
844         if (!err)
845                 sch->flags |= TCQ_F_OFFLOADED;
846
847         return err;
848 }
849 EXPORT_SYMBOL(qdisc_offload_dump_helper);
850
851 void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
852                                 struct Qdisc *new, struct Qdisc *old,
853                                 enum tc_setup_type type, void *type_data,
854                                 struct netlink_ext_ack *extack)
855 {
856         bool any_qdisc_is_offloaded;
857         int err;
858
859         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
860                 return;
861
862         err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
863
864         /* Don't report error if the graft is part of destroy operation. */
865         if (!err || !new || new == &noop_qdisc)
866                 return;
867
868         /* Don't report error if the parent, the old child and the new
869          * one are not offloaded.
870          */
871         any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
872         any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
873         any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;
874
875         if (any_qdisc_is_offloaded)
876                 NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
877 }
878 EXPORT_SYMBOL(qdisc_offload_graft_helper);
879
880 void qdisc_offload_query_caps(struct net_device *dev,
881                               enum tc_setup_type type,
882                               void *caps, size_t caps_len)
883 {
884         const struct net_device_ops *ops = dev->netdev_ops;
885         struct tc_query_caps_base base = {
886                 .type = type,
887                 .caps = caps,
888         };
889
890         memset(caps, 0, caps_len);
891
892         if (ops->ndo_setup_tc)
893                 ops->ndo_setup_tc(dev, TC_QUERY_CAPS, &base);
894 }
895 EXPORT_SYMBOL(qdisc_offload_query_caps);
896
897 static void qdisc_offload_graft_root(struct net_device *dev,
898                                      struct Qdisc *new, struct Qdisc *old,
899                                      struct netlink_ext_ack *extack)
900 {
901         struct tc_root_qopt_offload graft_offload = {
902                 .command        = TC_ROOT_GRAFT,
903                 .handle         = new ? new->handle : 0,
904                 .ingress        = (new && new->flags & TCQ_F_INGRESS) ||
905                                   (old && old->flags & TCQ_F_INGRESS),
906         };
907
908         qdisc_offload_graft_helper(dev, NULL, new, old,
909                                    TC_SETUP_ROOT_QDISC, &graft_offload, extack);
910 }
911
912 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
913                          u32 portid, u32 seq, u16 flags, int event,
914                          struct netlink_ext_ack *extack)
915 {
916         struct gnet_stats_basic_sync __percpu *cpu_bstats = NULL;
917         struct gnet_stats_queue __percpu *cpu_qstats = NULL;
918         struct tcmsg *tcm;
919         struct nlmsghdr  *nlh;
920         unsigned char *b = skb_tail_pointer(skb);
921         struct gnet_dump d;
922         struct qdisc_size_table *stab;
923         u32 block_index;
924         __u32 qlen;
925
926         cond_resched();
927         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
928         if (!nlh)
929                 goto out_nlmsg_trim;
930         tcm = nlmsg_data(nlh);
931         tcm->tcm_family = AF_UNSPEC;
932         tcm->tcm__pad1 = 0;
933         tcm->tcm__pad2 = 0;
934         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
935         tcm->tcm_parent = clid;
936         tcm->tcm_handle = q->handle;
937         tcm->tcm_info = refcount_read(&q->refcnt);
938         if (nla_put_string(skb, TCA_KIND, q->ops->id))
939                 goto nla_put_failure;
940         if (q->ops->ingress_block_get) {
941                 block_index = q->ops->ingress_block_get(q);
942                 if (block_index &&
943                     nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
944                         goto nla_put_failure;
945         }
946         if (q->ops->egress_block_get) {
947                 block_index = q->ops->egress_block_get(q);
948                 if (block_index &&
949                     nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
950                         goto nla_put_failure;
951         }
952         if (q->ops->dump && q->ops->dump(q, skb) < 0)
953                 goto nla_put_failure;
954         if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
955                 goto nla_put_failure;
956         qlen = qdisc_qlen_sum(q);
957
958         stab = rtnl_dereference(q->stab);
959         if (stab && qdisc_dump_stab(skb, stab) < 0)
960                 goto nla_put_failure;
961
962         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
963                                          NULL, &d, TCA_PAD) < 0)
964                 goto nla_put_failure;
965
966         if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
967                 goto nla_put_failure;
968
969         if (qdisc_is_percpu_stats(q)) {
970                 cpu_bstats = q->cpu_bstats;
971                 cpu_qstats = q->cpu_qstats;
972         }
973
974         if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats, true) < 0 ||
975             gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
976             gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
977                 goto nla_put_failure;
978
979         if (gnet_stats_finish_copy(&d) < 0)
980                 goto nla_put_failure;
981
982         if (extack && extack->_msg &&
983             nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg))
984                 goto out_nlmsg_trim;
985
986         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
987
988         return skb->len;
989
990 out_nlmsg_trim:
991 nla_put_failure:
992         nlmsg_trim(skb, b);
993         return -1;
994 }
995
996 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
997 {
998         if (q->flags & TCQ_F_BUILTIN)
999                 return true;
1000         if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
1001                 return true;
1002
1003         return false;
1004 }
1005
1006 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
1007                         struct nlmsghdr *n, u32 clid,
1008                         struct Qdisc *old, struct Qdisc *new,
1009                         struct netlink_ext_ack *extack)
1010 {
1011         struct sk_buff *skb;
1012         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1013
1014         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1015         if (!skb)
1016                 return -ENOBUFS;
1017
1018         if (old && !tc_qdisc_dump_ignore(old, false)) {
1019                 if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
1020                                   0, RTM_DELQDISC, extack) < 0)
1021                         goto err_out;
1022         }
1023         if (new && !tc_qdisc_dump_ignore(new, false)) {
1024                 if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
1025                                   old ? NLM_F_REPLACE : 0, RTM_NEWQDISC, extack) < 0)
1026                         goto err_out;
1027         }
1028
1029         if (skb->len)
1030                 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1031                                       n->nlmsg_flags & NLM_F_ECHO);
1032
1033 err_out:
1034         kfree_skb(skb);
1035         return -EINVAL;
1036 }
1037
1038 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
1039                                struct nlmsghdr *n, u32 clid,
1040                                struct Qdisc *old, struct Qdisc *new,
1041                                struct netlink_ext_ack *extack)
1042 {
1043         if (new || old)
1044                 qdisc_notify(net, skb, n, clid, old, new, extack);
1045
1046         if (old)
1047                 qdisc_put(old);
1048 }
1049
1050 static void qdisc_clear_nolock(struct Qdisc *sch)
1051 {
1052         sch->flags &= ~TCQ_F_NOLOCK;
1053         if (!(sch->flags & TCQ_F_CPUSTATS))
1054                 return;
1055
1056         free_percpu(sch->cpu_bstats);
1057         free_percpu(sch->cpu_qstats);
1058         sch->cpu_bstats = NULL;
1059         sch->cpu_qstats = NULL;
1060         sch->flags &= ~TCQ_F_CPUSTATS;
1061 }
1062
1063 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
1064  * to device "dev".
1065  *
1066  * When appropriate send a netlink notification using 'skb'
1067  * and "n".
1068  *
1069  * On success, destroy old qdisc.
1070  */
1071
1072 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
1073                        struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
1074                        struct Qdisc *new, struct Qdisc *old,
1075                        struct netlink_ext_ack *extack)
1076 {
1077         struct Qdisc *q = old;
1078         struct net *net = dev_net(dev);
1079
1080         if (parent == NULL) {
1081                 unsigned int i, num_q, ingress;
1082                 struct netdev_queue *dev_queue;
1083
1084                 ingress = 0;
1085                 num_q = dev->num_tx_queues;
1086                 if ((q && q->flags & TCQ_F_INGRESS) ||
1087                     (new && new->flags & TCQ_F_INGRESS)) {
1088                         ingress = 1;
1089                         dev_queue = dev_ingress_queue(dev);
1090                         if (!dev_queue) {
1091                                 NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
1092                                 return -ENOENT;
1093                         }
1094
1095                         q = rtnl_dereference(dev_queue->qdisc_sleeping);
1096
1097                         /* This is the counterpart of that qdisc_refcount_inc_nz() call in
1098                          * __tcf_qdisc_find() for filter requests.
1099                          */
1100                         if (!qdisc_refcount_dec_if_one(q)) {
1101                                 NL_SET_ERR_MSG(extack,
1102                                                "Current ingress or clsact Qdisc has ongoing filter requests");
1103                                 return -EBUSY;
1104                         }
1105                 }
1106
1107                 if (dev->flags & IFF_UP)
1108                         dev_deactivate(dev);
1109
1110                 qdisc_offload_graft_root(dev, new, old, extack);
1111
1112                 if (new && new->ops->attach && !ingress)
1113                         goto skip;
1114
1115                 if (!ingress) {
1116                         for (i = 0; i < num_q; i++) {
1117                                 dev_queue = netdev_get_tx_queue(dev, i);
1118                                 old = dev_graft_qdisc(dev_queue, new);
1119
1120                                 if (new && i > 0)
1121                                         qdisc_refcount_inc(new);
1122                                 qdisc_put(old);
1123                         }
1124                 } else {
1125                         old = dev_graft_qdisc(dev_queue, NULL);
1126
1127                         /* {ingress,clsact}_destroy() @old before grafting @new to avoid
1128                          * unprotected concurrent accesses to net_device::miniq_{in,e}gress
1129                          * pointer(s) in mini_qdisc_pair_swap().
1130                          */
1131                         qdisc_notify(net, skb, n, classid, old, new, extack);
1132                         qdisc_destroy(old);
1133
1134                         dev_graft_qdisc(dev_queue, new);
1135                 }
1136
1137 skip:
1138                 if (!ingress) {
1139                         old = rtnl_dereference(dev->qdisc);
1140                         if (new && !new->ops->attach)
1141                                 qdisc_refcount_inc(new);
1142                         rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc);
1143
1144                         notify_and_destroy(net, skb, n, classid, old, new, extack);
1145
1146                         if (new && new->ops->attach)
1147                                 new->ops->attach(new);
1148                 }
1149
1150                 if (dev->flags & IFF_UP)
1151                         dev_activate(dev);
1152         } else {
1153                 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
1154                 unsigned long cl;
1155                 int err;
1156
1157                 /* Only support running class lockless if parent is lockless */
1158                 if (new && (new->flags & TCQ_F_NOLOCK) && !(parent->flags & TCQ_F_NOLOCK))
1159                         qdisc_clear_nolock(new);
1160
1161                 if (!cops || !cops->graft)
1162                         return -EOPNOTSUPP;
1163
1164                 cl = cops->find(parent, classid);
1165                 if (!cl) {
1166                         NL_SET_ERR_MSG(extack, "Specified class not found");
1167                         return -ENOENT;
1168                 }
1169
1170                 if (new && new->ops == &noqueue_qdisc_ops) {
1171                         NL_SET_ERR_MSG(extack, "Cannot assign noqueue to a class");
1172                         return -EINVAL;
1173                 }
1174
1175                 err = cops->graft(parent, cl, new, &old, extack);
1176                 if (err)
1177                         return err;
1178                 notify_and_destroy(net, skb, n, classid, old, new, extack);
1179         }
1180         return 0;
1181 }
1182
1183 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1184                                    struct netlink_ext_ack *extack)
1185 {
1186         u32 block_index;
1187
1188         if (tca[TCA_INGRESS_BLOCK]) {
1189                 block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1190
1191                 if (!block_index) {
1192                         NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1193                         return -EINVAL;
1194                 }
1195                 if (!sch->ops->ingress_block_set) {
1196                         NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1197                         return -EOPNOTSUPP;
1198                 }
1199                 sch->ops->ingress_block_set(sch, block_index);
1200         }
1201         if (tca[TCA_EGRESS_BLOCK]) {
1202                 block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1203
1204                 if (!block_index) {
1205                         NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1206                         return -EINVAL;
1207                 }
1208                 if (!sch->ops->egress_block_set) {
1209                         NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1210                         return -EOPNOTSUPP;
1211                 }
1212                 sch->ops->egress_block_set(sch, block_index);
1213         }
1214         return 0;
1215 }
1216
1217 /*
1218    Allocate and initialize new qdisc.
1219
1220    Parameters are passed via opt.
1221  */
1222
1223 static struct Qdisc *qdisc_create(struct net_device *dev,
1224                                   struct netdev_queue *dev_queue,
1225                                   u32 parent, u32 handle,
1226                                   struct nlattr **tca, int *errp,
1227                                   struct netlink_ext_ack *extack)
1228 {
1229         int err;
1230         struct nlattr *kind = tca[TCA_KIND];
1231         struct Qdisc *sch;
1232         struct Qdisc_ops *ops;
1233         struct qdisc_size_table *stab;
1234
1235         ops = qdisc_lookup_ops(kind);
1236 #ifdef CONFIG_MODULES
1237         if (ops == NULL && kind != NULL) {
1238                 char name[IFNAMSIZ];
1239                 if (nla_strscpy(name, kind, IFNAMSIZ) >= 0) {
1240                         /* We dropped the RTNL semaphore in order to
1241                          * perform the module load.  So, even if we
1242                          * succeeded in loading the module we have to
1243                          * tell the caller to replay the request.  We
1244                          * indicate this using -EAGAIN.
1245                          * We replay the request because the device may
1246                          * go away in the mean time.
1247                          */
1248                         rtnl_unlock();
1249                         request_module("sch_%s", name);
1250                         rtnl_lock();
1251                         ops = qdisc_lookup_ops(kind);
1252                         if (ops != NULL) {
1253                                 /* We will try again qdisc_lookup_ops,
1254                                  * so don't keep a reference.
1255                                  */
1256                                 module_put(ops->owner);
1257                                 err = -EAGAIN;
1258                                 goto err_out;
1259                         }
1260                 }
1261         }
1262 #endif
1263
1264         err = -ENOENT;
1265         if (!ops) {
1266                 NL_SET_ERR_MSG(extack, "Specified qdisc kind is unknown");
1267                 goto err_out;
1268         }
1269
1270         sch = qdisc_alloc(dev_queue, ops, extack);
1271         if (IS_ERR(sch)) {
1272                 err = PTR_ERR(sch);
1273                 goto err_out2;
1274         }
1275
1276         sch->parent = parent;
1277
1278         if (handle == TC_H_INGRESS) {
1279                 if (!(sch->flags & TCQ_F_INGRESS)) {
1280                         NL_SET_ERR_MSG(extack,
1281                                        "Specified parent ID is reserved for ingress and clsact Qdiscs");
1282                         err = -EINVAL;
1283                         goto err_out3;
1284                 }
1285                 handle = TC_H_MAKE(TC_H_INGRESS, 0);
1286         } else {
1287                 if (handle == 0) {
1288                         handle = qdisc_alloc_handle(dev);
1289                         if (handle == 0) {
1290                                 NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
1291                                 err = -ENOSPC;
1292                                 goto err_out3;
1293                         }
1294                 }
1295                 if (!netif_is_multiqueue(dev))
1296                         sch->flags |= TCQ_F_ONETXQUEUE;
1297         }
1298
1299         sch->handle = handle;
1300
1301         /* This exist to keep backward compatible with a userspace
1302          * loophole, what allowed userspace to get IFF_NO_QUEUE
1303          * facility on older kernels by setting tx_queue_len=0 (prior
1304          * to qdisc init), and then forgot to reinit tx_queue_len
1305          * before again attaching a qdisc.
1306          */
1307         if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
1308                 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
1309                 netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
1310         }
1311
1312         err = qdisc_block_indexes_set(sch, tca, extack);
1313         if (err)
1314                 goto err_out3;
1315
1316         if (tca[TCA_STAB]) {
1317                 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1318                 if (IS_ERR(stab)) {
1319                         err = PTR_ERR(stab);
1320                         goto err_out3;
1321                 }
1322                 rcu_assign_pointer(sch->stab, stab);
1323         }
1324
1325         if (ops->init) {
1326                 err = ops->init(sch, tca[TCA_OPTIONS], extack);
1327                 if (err != 0)
1328                         goto err_out4;
1329         }
1330
1331         if (tca[TCA_RATE]) {
1332                 err = -EOPNOTSUPP;
1333                 if (sch->flags & TCQ_F_MQROOT) {
1334                         NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
1335                         goto err_out4;
1336                 }
1337
1338                 err = gen_new_estimator(&sch->bstats,
1339                                         sch->cpu_bstats,
1340                                         &sch->rate_est,
1341                                         NULL,
1342                                         true,
1343                                         tca[TCA_RATE]);
1344                 if (err) {
1345                         NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
1346                         goto err_out4;
1347                 }
1348         }
1349
1350         qdisc_hash_add(sch, false);
1351         trace_qdisc_create(ops, dev, parent);
1352
1353         return sch;
1354
1355 err_out4:
1356         /* Even if ops->init() failed, we call ops->destroy()
1357          * like qdisc_create_dflt().
1358          */
1359         if (ops->destroy)
1360                 ops->destroy(sch);
1361         qdisc_put_stab(rtnl_dereference(sch->stab));
1362 err_out3:
1363         netdev_put(dev, &sch->dev_tracker);
1364         qdisc_free(sch);
1365 err_out2:
1366         module_put(ops->owner);
1367 err_out:
1368         *errp = err;
1369         return NULL;
1370 }
1371
1372 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
1373                         struct netlink_ext_ack *extack)
1374 {
1375         struct qdisc_size_table *ostab, *stab = NULL;
1376         int err = 0;
1377
1378         if (tca[TCA_OPTIONS]) {
1379                 if (!sch->ops->change) {
1380                         NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
1381                         return -EINVAL;
1382                 }
1383                 if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1384                         NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
1385                         return -EOPNOTSUPP;
1386                 }
1387                 err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
1388                 if (err)
1389                         return err;
1390         }
1391
1392         if (tca[TCA_STAB]) {
1393                 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1394                 if (IS_ERR(stab))
1395                         return PTR_ERR(stab);
1396         }
1397
1398         ostab = rtnl_dereference(sch->stab);
1399         rcu_assign_pointer(sch->stab, stab);
1400         qdisc_put_stab(ostab);
1401
1402         if (tca[TCA_RATE]) {
1403                 /* NB: ignores errors from replace_estimator
1404                    because change can't be undone. */
1405                 if (sch->flags & TCQ_F_MQROOT)
1406                         goto out;
1407                 gen_replace_estimator(&sch->bstats,
1408                                       sch->cpu_bstats,
1409                                       &sch->rate_est,
1410                                       NULL,
1411                                       true,
1412                                       tca[TCA_RATE]);
1413         }
1414 out:
1415         return 0;
1416 }
1417
1418 struct check_loop_arg {
1419         struct qdisc_walker     w;
1420         struct Qdisc            *p;
1421         int                     depth;
1422 };
1423
1424 static int check_loop_fn(struct Qdisc *q, unsigned long cl,
1425                          struct qdisc_walker *w);
1426
1427 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1428 {
1429         struct check_loop_arg   arg;
1430
1431         if (q->ops->cl_ops == NULL)
1432                 return 0;
1433
1434         arg.w.stop = arg.w.skip = arg.w.count = 0;
1435         arg.w.fn = check_loop_fn;
1436         arg.depth = depth;
1437         arg.p = p;
1438         q->ops->cl_ops->walk(q, &arg.w);
1439         return arg.w.stop ? -ELOOP : 0;
1440 }
1441
1442 static int
1443 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1444 {
1445         struct Qdisc *leaf;
1446         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1447         struct check_loop_arg *arg = (struct check_loop_arg *)w;
1448
1449         leaf = cops->leaf(q, cl);
1450         if (leaf) {
1451                 if (leaf == arg->p || arg->depth > 7)
1452                         return -ELOOP;
1453                 return check_loop(leaf, arg->p, arg->depth + 1);
1454         }
1455         return 0;
1456 }
1457
1458 const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
1459         [TCA_KIND]              = { .type = NLA_STRING },
1460         [TCA_RATE]              = { .type = NLA_BINARY,
1461                                     .len = sizeof(struct tc_estimator) },
1462         [TCA_STAB]              = { .type = NLA_NESTED },
1463         [TCA_DUMP_INVISIBLE]    = { .type = NLA_FLAG },
1464         [TCA_CHAIN]             = { .type = NLA_U32 },
1465         [TCA_INGRESS_BLOCK]     = { .type = NLA_U32 },
1466         [TCA_EGRESS_BLOCK]      = { .type = NLA_U32 },
1467 };
1468
1469 /*
1470  * Delete/get qdisc.
1471  */
1472
1473 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1474                         struct netlink_ext_ack *extack)
1475 {
1476         struct net *net = sock_net(skb->sk);
1477         struct tcmsg *tcm = nlmsg_data(n);
1478         struct nlattr *tca[TCA_MAX + 1];
1479         struct net_device *dev;
1480         u32 clid;
1481         struct Qdisc *q = NULL;
1482         struct Qdisc *p = NULL;
1483         int err;
1484
1485         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1486                                      rtm_tca_policy, extack);
1487         if (err < 0)
1488                 return err;
1489
1490         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1491         if (!dev)
1492                 return -ENODEV;
1493
1494         clid = tcm->tcm_parent;
1495         if (clid) {
1496                 if (clid != TC_H_ROOT) {
1497                         if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1498                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1499                                 if (!p) {
1500                                         NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
1501                                         return -ENOENT;
1502                                 }
1503                                 q = qdisc_leaf(p, clid);
1504                         } else if (dev_ingress_queue(dev)) {
1505                                 q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping);
1506                         }
1507                 } else {
1508                         q = rtnl_dereference(dev->qdisc);
1509                 }
1510                 if (!q) {
1511                         NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
1512                         return -ENOENT;
1513                 }
1514
1515                 if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
1516                         NL_SET_ERR_MSG(extack, "Invalid handle");
1517                         return -EINVAL;
1518                 }
1519         } else {
1520                 q = qdisc_lookup(dev, tcm->tcm_handle);
1521                 if (!q) {
1522                         NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
1523                         return -ENOENT;
1524                 }
1525         }
1526
1527         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1528                 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1529                 return -EINVAL;
1530         }
1531
1532         if (n->nlmsg_type == RTM_DELQDISC) {
1533                 if (!clid) {
1534                         NL_SET_ERR_MSG(extack, "Classid cannot be zero");
1535                         return -EINVAL;
1536                 }
1537                 if (q->handle == 0) {
1538                         NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
1539                         return -ENOENT;
1540                 }
1541                 err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
1542                 if (err != 0)
1543                         return err;
1544         } else {
1545                 qdisc_notify(net, skb, n, clid, NULL, q, NULL);
1546         }
1547         return 0;
1548 }
1549
1550 /*
1551  * Create/change qdisc.
1552  */
1553
1554 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1555                            struct netlink_ext_ack *extack)
1556 {
1557         struct net *net = sock_net(skb->sk);
1558         struct tcmsg *tcm;
1559         struct nlattr *tca[TCA_MAX + 1];
1560         struct net_device *dev;
1561         u32 clid;
1562         struct Qdisc *q, *p;
1563         int err;
1564
1565 replay:
1566         /* Reinit, just in case something touches this. */
1567         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1568                                      rtm_tca_policy, extack);
1569         if (err < 0)
1570                 return err;
1571
1572         tcm = nlmsg_data(n);
1573         clid = tcm->tcm_parent;
1574         q = p = NULL;
1575
1576         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1577         if (!dev)
1578                 return -ENODEV;
1579
1580
1581         if (clid) {
1582                 if (clid != TC_H_ROOT) {
1583                         if (clid != TC_H_INGRESS) {
1584                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1585                                 if (!p) {
1586                                         NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
1587                                         return -ENOENT;
1588                                 }
1589                                 q = qdisc_leaf(p, clid);
1590                         } else if (dev_ingress_queue_create(dev)) {
1591                                 q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping);
1592                         }
1593                 } else {
1594                         q = rtnl_dereference(dev->qdisc);
1595                 }
1596
1597                 /* It may be default qdisc, ignore it */
1598                 if (q && q->handle == 0)
1599                         q = NULL;
1600
1601                 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1602                         if (tcm->tcm_handle) {
1603                                 if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
1604                                         NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
1605                                         return -EEXIST;
1606                                 }
1607                                 if (TC_H_MIN(tcm->tcm_handle)) {
1608                                         NL_SET_ERR_MSG(extack, "Invalid minor handle");
1609                                         return -EINVAL;
1610                                 }
1611                                 q = qdisc_lookup(dev, tcm->tcm_handle);
1612                                 if (!q)
1613                                         goto create_n_graft;
1614                                 if (n->nlmsg_flags & NLM_F_EXCL) {
1615                                         NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
1616                                         return -EEXIST;
1617                                 }
1618                                 if (tca[TCA_KIND] &&
1619                                     nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1620                                         NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1621                                         return -EINVAL;
1622                                 }
1623                                 if (q->flags & TCQ_F_INGRESS) {
1624                                         NL_SET_ERR_MSG(extack,
1625                                                        "Cannot regraft ingress or clsact Qdiscs");
1626                                         return -EINVAL;
1627                                 }
1628                                 if (q == p ||
1629                                     (p && check_loop(q, p, 0))) {
1630                                         NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
1631                                         return -ELOOP;
1632                                 }
1633                                 if (clid == TC_H_INGRESS) {
1634                                         NL_SET_ERR_MSG(extack, "Ingress cannot graft directly");
1635                                         return -EINVAL;
1636                                 }
1637                                 qdisc_refcount_inc(q);
1638                                 goto graft;
1639                         } else {
1640                                 if (!q)
1641                                         goto create_n_graft;
1642
1643                                 /* This magic test requires explanation.
1644                                  *
1645                                  *   We know, that some child q is already
1646                                  *   attached to this parent and have choice:
1647                                  *   either to change it or to create/graft new one.
1648                                  *
1649                                  *   1. We are allowed to create/graft only
1650                                  *   if CREATE and REPLACE flags are set.
1651                                  *
1652                                  *   2. If EXCL is set, requestor wanted to say,
1653                                  *   that qdisc tcm_handle is not expected
1654                                  *   to exist, so that we choose create/graft too.
1655                                  *
1656                                  *   3. The last case is when no flags are set.
1657                                  *   Alas, it is sort of hole in API, we
1658                                  *   cannot decide what to do unambiguously.
1659                                  *   For now we select create/graft, if
1660                                  *   user gave KIND, which does not match existing.
1661                                  */
1662                                 if ((n->nlmsg_flags & NLM_F_CREATE) &&
1663                                     (n->nlmsg_flags & NLM_F_REPLACE) &&
1664                                     ((n->nlmsg_flags & NLM_F_EXCL) ||
1665                                      (tca[TCA_KIND] &&
1666                                       nla_strcmp(tca[TCA_KIND], q->ops->id))))
1667                                         goto create_n_graft;
1668                         }
1669                 }
1670         } else {
1671                 if (!tcm->tcm_handle) {
1672                         NL_SET_ERR_MSG(extack, "Handle cannot be zero");
1673                         return -EINVAL;
1674                 }
1675                 q = qdisc_lookup(dev, tcm->tcm_handle);
1676         }
1677
1678         /* Change qdisc parameters */
1679         if (!q) {
1680                 NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1681                 return -ENOENT;
1682         }
1683         if (n->nlmsg_flags & NLM_F_EXCL) {
1684                 NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
1685                 return -EEXIST;
1686         }
1687         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1688                 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1689                 return -EINVAL;
1690         }
1691         err = qdisc_change(q, tca, extack);
1692         if (err == 0)
1693                 qdisc_notify(net, skb, n, clid, NULL, q, extack);
1694         return err;
1695
1696 create_n_graft:
1697         if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1698                 NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
1699                 return -ENOENT;
1700         }
1701         if (clid == TC_H_INGRESS) {
1702                 if (dev_ingress_queue(dev)) {
1703                         q = qdisc_create(dev, dev_ingress_queue(dev),
1704                                          tcm->tcm_parent, tcm->tcm_parent,
1705                                          tca, &err, extack);
1706                 } else {
1707                         NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
1708                         err = -ENOENT;
1709                 }
1710         } else {
1711                 struct netdev_queue *dev_queue;
1712
1713                 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1714                         dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1715                 else if (p)
1716                         dev_queue = p->dev_queue;
1717                 else
1718                         dev_queue = netdev_get_tx_queue(dev, 0);
1719
1720                 q = qdisc_create(dev, dev_queue,
1721                                  tcm->tcm_parent, tcm->tcm_handle,
1722                                  tca, &err, extack);
1723         }
1724         if (q == NULL) {
1725                 if (err == -EAGAIN)
1726                         goto replay;
1727                 return err;
1728         }
1729
1730 graft:
1731         err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
1732         if (err) {
1733                 if (q)
1734                         qdisc_put(q);
1735                 return err;
1736         }
1737
1738         return 0;
1739 }
1740
1741 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1742                               struct netlink_callback *cb,
1743                               int *q_idx_p, int s_q_idx, bool recur,
1744                               bool dump_invisible)
1745 {
1746         int ret = 0, q_idx = *q_idx_p;
1747         struct Qdisc *q;
1748         int b;
1749
1750         if (!root)
1751                 return 0;
1752
1753         q = root;
1754         if (q_idx < s_q_idx) {
1755                 q_idx++;
1756         } else {
1757                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1758                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1759                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1760                                   RTM_NEWQDISC, NULL) <= 0)
1761                         goto done;
1762                 q_idx++;
1763         }
1764
1765         /* If dumping singletons, there is no qdisc_dev(root) and the singleton
1766          * itself has already been dumped.
1767          *
1768          * If we've already dumped the top-level (ingress) qdisc above and the global
1769          * qdisc hashtable, we don't want to hit it again
1770          */
1771         if (!qdisc_dev(root) || !recur)
1772                 goto out;
1773
1774         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1775                 if (q_idx < s_q_idx) {
1776                         q_idx++;
1777                         continue;
1778                 }
1779                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1780                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1781                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1782                                   RTM_NEWQDISC, NULL) <= 0)
1783                         goto done;
1784                 q_idx++;
1785         }
1786
1787 out:
1788         *q_idx_p = q_idx;
1789         return ret;
1790 done:
1791         ret = -1;
1792         goto out;
1793 }
1794
1795 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1796 {
1797         struct net *net = sock_net(skb->sk);
1798         int idx, q_idx;
1799         int s_idx, s_q_idx;
1800         struct net_device *dev;
1801         const struct nlmsghdr *nlh = cb->nlh;
1802         struct nlattr *tca[TCA_MAX + 1];
1803         int err;
1804
1805         s_idx = cb->args[0];
1806         s_q_idx = q_idx = cb->args[1];
1807
1808         idx = 0;
1809         ASSERT_RTNL();
1810
1811         err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
1812                                      rtm_tca_policy, cb->extack);
1813         if (err < 0)
1814                 return err;
1815
1816         for_each_netdev(net, dev) {
1817                 struct netdev_queue *dev_queue;
1818
1819                 if (idx < s_idx)
1820                         goto cont;
1821                 if (idx > s_idx)
1822                         s_q_idx = 0;
1823                 q_idx = 0;
1824
1825                 if (tc_dump_qdisc_root(rtnl_dereference(dev->qdisc),
1826                                        skb, cb, &q_idx, s_q_idx,
1827                                        true, tca[TCA_DUMP_INVISIBLE]) < 0)
1828                         goto done;
1829
1830                 dev_queue = dev_ingress_queue(dev);
1831                 if (dev_queue &&
1832                     tc_dump_qdisc_root(rtnl_dereference(dev_queue->qdisc_sleeping),
1833                                        skb, cb, &q_idx, s_q_idx, false,
1834                                        tca[TCA_DUMP_INVISIBLE]) < 0)
1835                         goto done;
1836
1837 cont:
1838                 idx++;
1839         }
1840
1841 done:
1842         cb->args[0] = idx;
1843         cb->args[1] = q_idx;
1844
1845         return skb->len;
1846 }
1847
1848
1849
1850 /************************************************
1851  *      Traffic classes manipulation.           *
1852  ************************************************/
1853
1854 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1855                           unsigned long cl, u32 portid, u32 seq, u16 flags,
1856                           int event, struct netlink_ext_ack *extack)
1857 {
1858         struct tcmsg *tcm;
1859         struct nlmsghdr  *nlh;
1860         unsigned char *b = skb_tail_pointer(skb);
1861         struct gnet_dump d;
1862         const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1863
1864         cond_resched();
1865         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1866         if (!nlh)
1867                 goto out_nlmsg_trim;
1868         tcm = nlmsg_data(nlh);
1869         tcm->tcm_family = AF_UNSPEC;
1870         tcm->tcm__pad1 = 0;
1871         tcm->tcm__pad2 = 0;
1872         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1873         tcm->tcm_parent = q->handle;
1874         tcm->tcm_handle = q->handle;
1875         tcm->tcm_info = 0;
1876         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1877                 goto nla_put_failure;
1878         if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1879                 goto nla_put_failure;
1880
1881         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1882                                          NULL, &d, TCA_PAD) < 0)
1883                 goto nla_put_failure;
1884
1885         if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1886                 goto nla_put_failure;
1887
1888         if (gnet_stats_finish_copy(&d) < 0)
1889                 goto nla_put_failure;
1890
1891         if (extack && extack->_msg &&
1892             nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg))
1893                 goto out_nlmsg_trim;
1894
1895         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1896
1897         return skb->len;
1898
1899 out_nlmsg_trim:
1900 nla_put_failure:
1901         nlmsg_trim(skb, b);
1902         return -1;
1903 }
1904
1905 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1906                          struct nlmsghdr *n, struct Qdisc *q,
1907                          unsigned long cl, int event, struct netlink_ext_ack *extack)
1908 {
1909         struct sk_buff *skb;
1910         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1911
1912         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1913         if (!skb)
1914                 return -ENOBUFS;
1915
1916         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event, extack) < 0) {
1917                 kfree_skb(skb);
1918                 return -EINVAL;
1919         }
1920
1921         return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1922                               n->nlmsg_flags & NLM_F_ECHO);
1923 }
1924
1925 static int tclass_del_notify(struct net *net,
1926                              const struct Qdisc_class_ops *cops,
1927                              struct sk_buff *oskb, struct nlmsghdr *n,
1928                              struct Qdisc *q, unsigned long cl,
1929                              struct netlink_ext_ack *extack)
1930 {
1931         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1932         struct sk_buff *skb;
1933         int err = 0;
1934
1935         if (!cops->delete)
1936                 return -EOPNOTSUPP;
1937
1938         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1939         if (!skb)
1940                 return -ENOBUFS;
1941
1942         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
1943                            RTM_DELTCLASS, extack) < 0) {
1944                 kfree_skb(skb);
1945                 return -EINVAL;
1946         }
1947
1948         err = cops->delete(q, cl, extack);
1949         if (err) {
1950                 kfree_skb(skb);
1951                 return err;
1952         }
1953
1954         err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1955                              n->nlmsg_flags & NLM_F_ECHO);
1956         return err;
1957 }
1958
1959 #ifdef CONFIG_NET_CLS
1960
1961 struct tcf_bind_args {
1962         struct tcf_walker w;
1963         unsigned long base;
1964         unsigned long cl;
1965         u32 classid;
1966 };
1967
1968 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
1969 {
1970         struct tcf_bind_args *a = (void *)arg;
1971
1972         if (n && tp->ops->bind_class) {
1973                 struct Qdisc *q = tcf_block_q(tp->chain->block);
1974
1975                 sch_tree_lock(q);
1976                 tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
1977                 sch_tree_unlock(q);
1978         }
1979         return 0;
1980 }
1981
1982 struct tc_bind_class_args {
1983         struct qdisc_walker w;
1984         unsigned long new_cl;
1985         u32 portid;
1986         u32 clid;
1987 };
1988
1989 static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl,
1990                                 struct qdisc_walker *w)
1991 {
1992         struct tc_bind_class_args *a = (struct tc_bind_class_args *)w;
1993         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1994         struct tcf_block *block;
1995         struct tcf_chain *chain;
1996
1997         block = cops->tcf_block(q, cl, NULL);
1998         if (!block)
1999                 return 0;
2000         for (chain = tcf_get_next_chain(block, NULL);
2001              chain;
2002              chain = tcf_get_next_chain(block, chain)) {
2003                 struct tcf_proto *tp;
2004
2005                 for (tp = tcf_get_next_proto(chain, NULL);
2006                      tp; tp = tcf_get_next_proto(chain, tp)) {
2007                         struct tcf_bind_args arg = {};
2008
2009                         arg.w.fn = tcf_node_bind;
2010                         arg.classid = a->clid;
2011                         arg.base = cl;
2012                         arg.cl = a->new_cl;
2013                         tp->ops->walk(tp, &arg.w, true);
2014                 }
2015         }
2016
2017         return 0;
2018 }
2019
2020 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
2021                            unsigned long new_cl)
2022 {
2023         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
2024         struct tc_bind_class_args args = {};
2025
2026         if (!cops->tcf_block)
2027                 return;
2028         args.portid = portid;
2029         args.clid = clid;
2030         args.new_cl = new_cl;
2031         args.w.fn = tc_bind_class_walker;
2032         q->ops->cl_ops->walk(q, &args.w);
2033 }
2034
2035 #else
2036
2037 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
2038                            unsigned long new_cl)
2039 {
2040 }
2041
2042 #endif
2043
2044 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
2045                          struct netlink_ext_ack *extack)
2046 {
2047         struct net *net = sock_net(skb->sk);
2048         struct tcmsg *tcm = nlmsg_data(n);
2049         struct nlattr *tca[TCA_MAX + 1];
2050         struct net_device *dev;
2051         struct Qdisc *q = NULL;
2052         const struct Qdisc_class_ops *cops;
2053         unsigned long cl = 0;
2054         unsigned long new_cl;
2055         u32 portid;
2056         u32 clid;
2057         u32 qid;
2058         int err;
2059
2060         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
2061                                      rtm_tca_policy, extack);
2062         if (err < 0)
2063                 return err;
2064
2065         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
2066         if (!dev)
2067                 return -ENODEV;
2068
2069         /*
2070            parent == TC_H_UNSPEC - unspecified parent.
2071            parent == TC_H_ROOT   - class is root, which has no parent.
2072            parent == X:0         - parent is root class.
2073            parent == X:Y         - parent is a node in hierarchy.
2074            parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
2075
2076            handle == 0:0         - generate handle from kernel pool.
2077            handle == 0:Y         - class is X:Y, where X:0 is qdisc.
2078            handle == X:Y         - clear.
2079            handle == X:0         - root class.
2080          */
2081
2082         /* Step 1. Determine qdisc handle X:0 */
2083
2084         portid = tcm->tcm_parent;
2085         clid = tcm->tcm_handle;
2086         qid = TC_H_MAJ(clid);
2087
2088         if (portid != TC_H_ROOT) {
2089                 u32 qid1 = TC_H_MAJ(portid);
2090
2091                 if (qid && qid1) {
2092                         /* If both majors are known, they must be identical. */
2093                         if (qid != qid1)
2094                                 return -EINVAL;
2095                 } else if (qid1) {
2096                         qid = qid1;
2097                 } else if (qid == 0)
2098                         qid = rtnl_dereference(dev->qdisc)->handle;
2099
2100                 /* Now qid is genuine qdisc handle consistent
2101                  * both with parent and child.
2102                  *
2103                  * TC_H_MAJ(portid) still may be unspecified, complete it now.
2104                  */
2105                 if (portid)
2106                         portid = TC_H_MAKE(qid, portid);
2107         } else {
2108                 if (qid == 0)
2109                         qid = rtnl_dereference(dev->qdisc)->handle;
2110         }
2111
2112         /* OK. Locate qdisc */
2113         q = qdisc_lookup(dev, qid);
2114         if (!q)
2115                 return -ENOENT;
2116
2117         /* An check that it supports classes */
2118         cops = q->ops->cl_ops;
2119         if (cops == NULL)
2120                 return -EINVAL;
2121
2122         /* Now try to get class */
2123         if (clid == 0) {
2124                 if (portid == TC_H_ROOT)
2125                         clid = qid;
2126         } else
2127                 clid = TC_H_MAKE(qid, clid);
2128
2129         if (clid)
2130                 cl = cops->find(q, clid);
2131
2132         if (cl == 0) {
2133                 err = -ENOENT;
2134                 if (n->nlmsg_type != RTM_NEWTCLASS ||
2135                     !(n->nlmsg_flags & NLM_F_CREATE))
2136                         goto out;
2137         } else {
2138                 switch (n->nlmsg_type) {
2139                 case RTM_NEWTCLASS:
2140                         err = -EEXIST;
2141                         if (n->nlmsg_flags & NLM_F_EXCL)
2142                                 goto out;
2143                         break;
2144                 case RTM_DELTCLASS:
2145                         err = tclass_del_notify(net, cops, skb, n, q, cl, extack);
2146                         /* Unbind the class with flilters with 0 */
2147                         tc_bind_tclass(q, portid, clid, 0);
2148                         goto out;
2149                 case RTM_GETTCLASS:
2150                         err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS, extack);
2151                         goto out;
2152                 default:
2153                         err = -EINVAL;
2154                         goto out;
2155                 }
2156         }
2157
2158         if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
2159                 NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
2160                 return -EOPNOTSUPP;
2161         }
2162
2163         new_cl = cl;
2164         err = -EOPNOTSUPP;
2165         if (cops->change)
2166                 err = cops->change(q, clid, portid, tca, &new_cl, extack);
2167         if (err == 0) {
2168                 tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS, extack);
2169                 /* We just create a new class, need to do reverse binding. */
2170                 if (cl != new_cl)
2171                         tc_bind_tclass(q, portid, clid, new_cl);
2172         }
2173 out:
2174         return err;
2175 }
2176
2177 struct qdisc_dump_args {
2178         struct qdisc_walker     w;
2179         struct sk_buff          *skb;
2180         struct netlink_callback *cb;
2181 };
2182
2183 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
2184                             struct qdisc_walker *arg)
2185 {
2186         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
2187
2188         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2189                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
2190                               RTM_NEWTCLASS, NULL);
2191 }
2192
2193 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
2194                                 struct tcmsg *tcm, struct netlink_callback *cb,
2195                                 int *t_p, int s_t)
2196 {
2197         struct qdisc_dump_args arg;
2198
2199         if (tc_qdisc_dump_ignore(q, false) ||
2200             *t_p < s_t || !q->ops->cl_ops ||
2201             (tcm->tcm_parent &&
2202              TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
2203                 (*t_p)++;
2204                 return 0;
2205         }
2206         if (*t_p > s_t)
2207                 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
2208         arg.w.fn = qdisc_class_dump;
2209         arg.skb = skb;
2210         arg.cb = cb;
2211         arg.w.stop  = 0;
2212         arg.w.skip = cb->args[1];
2213         arg.w.count = 0;
2214         q->ops->cl_ops->walk(q, &arg.w);
2215         cb->args[1] = arg.w.count;
2216         if (arg.w.stop)
2217                 return -1;
2218         (*t_p)++;
2219         return 0;
2220 }
2221
2222 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
2223                                struct tcmsg *tcm, struct netlink_callback *cb,
2224                                int *t_p, int s_t, bool recur)
2225 {
2226         struct Qdisc *q;
2227         int b;
2228
2229         if (!root)
2230                 return 0;
2231
2232         if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
2233                 return -1;
2234
2235         if (!qdisc_dev(root) || !recur)
2236                 return 0;
2237
2238         if (tcm->tcm_parent) {
2239                 q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
2240                 if (q && q != root &&
2241                     tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2242                         return -1;
2243                 return 0;
2244         }
2245         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
2246                 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2247                         return -1;
2248         }
2249
2250         return 0;
2251 }
2252
2253 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
2254 {
2255         struct tcmsg *tcm = nlmsg_data(cb->nlh);
2256         struct net *net = sock_net(skb->sk);
2257         struct netdev_queue *dev_queue;
2258         struct net_device *dev;
2259         int t, s_t;
2260
2261         if (nlmsg_len(cb->nlh) < sizeof(*tcm))
2262                 return 0;
2263         dev = dev_get_by_index(net, tcm->tcm_ifindex);
2264         if (!dev)
2265                 return 0;
2266
2267         s_t = cb->args[0];
2268         t = 0;
2269
2270         if (tc_dump_tclass_root(rtnl_dereference(dev->qdisc),
2271                                 skb, tcm, cb, &t, s_t, true) < 0)
2272                 goto done;
2273
2274         dev_queue = dev_ingress_queue(dev);
2275         if (dev_queue &&
2276             tc_dump_tclass_root(rtnl_dereference(dev_queue->qdisc_sleeping),
2277                                 skb, tcm, cb, &t, s_t, false) < 0)
2278                 goto done;
2279
2280 done:
2281         cb->args[0] = t;
2282
2283         dev_put(dev);
2284         return skb->len;
2285 }
2286
2287 #ifdef CONFIG_PROC_FS
2288 static int psched_show(struct seq_file *seq, void *v)
2289 {
2290         seq_printf(seq, "%08x %08x %08x %08x\n",
2291                    (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2292                    1000000,
2293                    (u32)NSEC_PER_SEC / hrtimer_resolution);
2294
2295         return 0;
2296 }
2297
2298 static int __net_init psched_net_init(struct net *net)
2299 {
2300         struct proc_dir_entry *e;
2301
2302         e = proc_create_single("psched", 0, net->proc_net, psched_show);
2303         if (e == NULL)
2304                 return -ENOMEM;
2305
2306         return 0;
2307 }
2308
2309 static void __net_exit psched_net_exit(struct net *net)
2310 {
2311         remove_proc_entry("psched", net->proc_net);
2312 }
2313 #else
2314 static int __net_init psched_net_init(struct net *net)
2315 {
2316         return 0;
2317 }
2318
2319 static void __net_exit psched_net_exit(struct net *net)
2320 {
2321 }
2322 #endif
2323
2324 static struct pernet_operations psched_net_ops = {
2325         .init = psched_net_init,
2326         .exit = psched_net_exit,
2327 };
2328
2329 #if IS_ENABLED(CONFIG_RETPOLINE)
2330 DEFINE_STATIC_KEY_FALSE(tc_skip_wrapper);
2331 #endif
2332
2333 static int __init pktsched_init(void)
2334 {
2335         int err;
2336
2337         err = register_pernet_subsys(&psched_net_ops);
2338         if (err) {
2339                 pr_err("pktsched_init: "
2340                        "cannot initialize per netns operations\n");
2341                 return err;
2342         }
2343
2344         register_qdisc(&pfifo_fast_ops);
2345         register_qdisc(&pfifo_qdisc_ops);
2346         register_qdisc(&bfifo_qdisc_ops);
2347         register_qdisc(&pfifo_head_drop_qdisc_ops);
2348         register_qdisc(&mq_qdisc_ops);
2349         register_qdisc(&noqueue_qdisc_ops);
2350
2351         rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
2352         rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
2353         rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
2354                       0);
2355         rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
2356         rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
2357         rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
2358                       0);
2359
2360         tc_wrapper_init();
2361
2362         return 0;
2363 }
2364
2365 subsys_initcall(pktsched_init);