/* net: sched: add rcu annotations around qdisc->qdisc_sleeping
 * [platform/kernel/linux-starfive.git] / net/sched/sch_api.c
 */
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_api.c  Packet scheduler API.
4  *
5  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
6  *
7  * Fixes:
8  *
9  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
10  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
11  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
12  */
13
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/string.h>
18 #include <linux/errno.h>
19 #include <linux/skbuff.h>
20 #include <linux/init.h>
21 #include <linux/proc_fs.h>
22 #include <linux/seq_file.h>
23 #include <linux/kmod.h>
24 #include <linux/list.h>
25 #include <linux/hrtimer.h>
26 #include <linux/slab.h>
27 #include <linux/hashtable.h>
28
29 #include <net/net_namespace.h>
30 #include <net/sock.h>
31 #include <net/netlink.h>
32 #include <net/pkt_sched.h>
33 #include <net/pkt_cls.h>
34
35 #include <trace/events/qdisc.h>
36
37 /*
38
39    Short review.
40    -------------
41
42    This file consists of two interrelated parts:
43
44    1. queueing disciplines manager frontend.
45    2. traffic classes manager frontend.
46
47    Generally, queueing discipline ("qdisc") is a black box,
48    which is able to enqueue packets and to dequeue them (when
49    device is ready to send something) in order and at times
50    determined by algorithm hidden in it.
51
52    qdisc's are divided to two categories:
53    - "queues", which have no internal structure visible from outside.
54    - "schedulers", which split all the packets to "traffic classes",
55      using "packet classifiers" (look at cls_api.c)
56
57    In turn, classes may have child qdiscs (as rule, queues)
58    attached to them etc. etc. etc.
59
60    The goal of the routines in this file is to translate
61    information supplied by user in the form of handles
62    to more intelligible for kernel form, to make some sanity
63    checks and part of work, which is common to all qdiscs
64    and to provide rtnetlink notifications.
65
66    All real intelligent work is done inside qdisc modules.
67
68
69
70    Every discipline has two major routines: enqueue and dequeue.
71
72    ---dequeue
73
74    dequeue usually returns a skb to send. It is allowed to return NULL,
75    but it does not mean that queue is empty, it just means that
76    discipline does not want to send anything this time.
77    Queue is really empty if q->q.qlen == 0.
78    For complicated disciplines with multiple queues q->q is not
79    real packet queue, but however q->q.qlen must be valid.
80
81    ---enqueue
82
83    enqueue returns 0, if packet was enqueued successfully.
84    If packet (this one or another one) was dropped, it returns
85    not zero error code.
86    NET_XMIT_DROP        - this packet dropped
87      Expected action: do not backoff, but wait until queue will clear.
88    NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
89      Expected action: backoff or ignore
90
91    Auxiliary routines:
92
93    ---peek
94
95    like dequeue but without removing a packet from the queue
96
97    ---reset
98
99    returns qdisc to initial state: purge all buffers, clear all
100    timers, counters (except for statistics) etc.
101
102    ---init
103
104    initializes newly created qdisc.
105
106    ---destroy
107
108    destroys resources allocated by init and during lifetime of qdisc.
109
110    ---change
111
112    changes qdisc parameters.
113  */
114
/* Protects qdisc_base (the list of registered TC modules) and the
 * default-qdisc selection. It is a pure SMP lock.
 */
static DEFINE_RWLOCK(qdisc_mod_lock);
117
118
119 /************************************************
120  *      Queueing disciplines manipulation.      *
121  ************************************************/
122
123
/* The list of all installed queueing disciplines.
 * Singly linked through Qdisc_ops::next; guarded by qdisc_mod_lock.
 */
static struct Qdisc_ops *qdisc_base;
127
128 /* Register/unregister queueing discipline */
129
/* Register a queueing discipline on the global list.
 *
 * Returns -EEXIST if ops with the same id are already registered and
 * -EINVAL if the ops are internally inconsistent (dequeue without peek,
 * or class ops missing mandatory callbacks). Missing enqueue/peek/dequeue
 * hooks are filled in from noop_qdisc_ops.
 */
int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	/* Reject duplicate ids; on fall-through qp points at the tail link. */
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->peek == NULL) {
		/* A real dequeue with no peek cannot be emulated. */
		if (qops->dequeue == NULL)
			qops->peek = noop_qdisc_ops.peek;
		else
			goto out_einval;
	}
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	if (qops->cl_ops) {
		const struct Qdisc_class_ops *cops = qops->cl_ops;

		/* Classful qdiscs must support lookup and traversal. */
		if (!(cops->find && cops->walk && cops->leaf))
			goto out_einval;

		/* Filter blocks require bind/unbind of classes. */
		if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
			goto out_einval;
	}

	qops->next = NULL;
	*qp = qops;	/* append at the tail */
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;

out_einval:
	rc = -EINVAL;
	goto out;
}
EXPORT_SYMBOL(register_qdisc);
173
174 void unregister_qdisc(struct Qdisc_ops *qops)
175 {
176         struct Qdisc_ops *q, **qp;
177         int err = -ENOENT;
178
179         write_lock(&qdisc_mod_lock);
180         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
181                 if (q == qops)
182                         break;
183         if (q) {
184                 *qp = q->next;
185                 q->next = NULL;
186                 err = 0;
187         }
188         write_unlock(&qdisc_mod_lock);
189
190         WARN(err, "unregister qdisc(%s) failed\n", qops->id);
191 }
192 EXPORT_SYMBOL(unregister_qdisc);
193
/* Get default qdisc if not otherwise specified */
void qdisc_get_default(char *name, size_t len)
{
	/* Snapshot the current default's id under the module lock so the
	 * ops pointer cannot be swapped mid-copy.
	 */
	read_lock(&qdisc_mod_lock);
	strscpy(name, default_qdisc_ops->id, len);
	read_unlock(&qdisc_mod_lock);
}
201
202 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
203 {
204         struct Qdisc_ops *q = NULL;
205
206         for (q = qdisc_base; q; q = q->next) {
207                 if (!strcmp(name, q->id)) {
208                         if (!try_module_get(q->owner))
209                                 q = NULL;
210                         break;
211                 }
212         }
213
214         return q;
215 }
216
/* Set new default qdisc to use */
/* Look up @name under qdisc_mod_lock; if absent, drop the lock, try to
 * load the sch_<name> module and look again. On success the previous
 * default's module reference is dropped and the new ops installed.
 * Requires CAP_NET_ADMIN. Returns 0, -EPERM or -ENOENT.
 */
int qdisc_set_default(const char *name)
{
	const struct Qdisc_ops *ops;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	write_lock(&qdisc_mod_lock);
	ops = qdisc_lookup_default(name);
	if (!ops) {
		/* Not found, drop lock and try to load module */
		write_unlock(&qdisc_mod_lock);
		request_module("sch_%s", name);
		write_lock(&qdisc_mod_lock);

		ops = qdisc_lookup_default(name);
	}

	if (ops) {
		/* Set new default */
		module_put(default_qdisc_ops->owner);
		default_qdisc_ops = ops;
	}
	write_unlock(&qdisc_mod_lock);

	return ops ? 0 : -ENOENT;
}
245
#ifdef CONFIG_NET_SCH_DEFAULT
/* Set default value from kernel config */
/* Runs at late_initcall time so all built-in qdiscs are registered. */
static int __init sch_default_qdisc(void)
{
	return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
}
late_initcall(sch_default_qdisc);
#endif
254
/* We know handle. Find qdisc among all qdisc's attached to device
 * (root qdisc, all its children, children of children etc.)
 * Note: caller either uses rtnl or rcu_read_lock()
 */

static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
{
	struct Qdisc *q;

	/* Qdiscs without a device are not hashed; only the root itself
	 * can match.
	 */
	if (!qdisc_dev(root))
		return (root->handle == handle ? root : NULL);

	if (!(root->flags & TCQ_F_BUILTIN) &&
	    root->handle == handle)
		return root;

	/* Walk the per-device handle hash; RCU-safe per the note above. */
	hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle,
				   lockdep_rtnl_is_held()) {
		if (q->handle == handle)
			return q;
	}
	return NULL;
}
278
/* Insert @q into its device's qdisc hash so handle lookups can find it.
 * Root and ingress qdiscs are not hashed. Caller must hold RTNL.
 */
void qdisc_hash_add(struct Qdisc *q, bool invisible)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
		ASSERT_RTNL();
		hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
		if (invisible)
			q->flags |= TCQ_F_INVISIBLE;	/* hidden from dumps */
	}
}
EXPORT_SYMBOL(qdisc_hash_add);
289
/* Remove @q from its device's qdisc hash; mirror of qdisc_hash_add().
 * Caller must hold RTNL.
 */
void qdisc_hash_del(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
		ASSERT_RTNL();
		hash_del_rcu(&q->hash);
	}
}
EXPORT_SYMBOL(qdisc_hash_del);
298
/* Resolve @handle to a qdisc on @dev: search the egress hierarchy first,
 * then the ingress queue. Uses rtnl_dereference(), so the caller must
 * hold RTNL. Returns NULL if not found or @handle is 0.
 */
struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	struct Qdisc *q;

	if (!handle)
		return NULL;
	q = qdisc_match_from_root(rtnl_dereference(dev->qdisc), handle);
	if (q)
		goto out;

	/* Not in the egress tree; try the ingress qdisc if one exists. */
	if (dev_ingress_queue(dev))
		q = qdisc_match_from_root(
			rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping),
			handle);
out:
	return q;
}
316
/* RCU variant of qdisc_lookup(): same search order, but only requires
 * rcu_read_lock() (uses rcu_dereference() throughout).
 */
struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
{
	struct netdev_queue *nq;
	struct Qdisc *q;

	if (!handle)
		return NULL;
	q = qdisc_match_from_root(rcu_dereference(dev->qdisc), handle);
	if (q)
		goto out;

	/* Fall back to the ingress qdisc when the egress tree misses. */
	nq = dev_ingress_queue_rcu(dev);
	if (nq)
		q = qdisc_match_from_root(rcu_dereference(nq->qdisc_sleeping),
					  handle);
out:
	return q;
}
335
336 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
337 {
338         unsigned long cl;
339         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
340
341         if (cops == NULL)
342                 return NULL;
343         cl = cops->find(p, classid);
344
345         if (cl == 0)
346                 return NULL;
347         return cops->leaf(p, cl);
348 }
349
350 /* Find queueing discipline by name */
351
352 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
353 {
354         struct Qdisc_ops *q = NULL;
355
356         if (kind) {
357                 read_lock(&qdisc_mod_lock);
358                 for (q = qdisc_base; q; q = q->next) {
359                         if (nla_strcmp(kind, q->id) == 0) {
360                                 if (!try_module_get(q->owner))
361                                         q = NULL;
362                                 break;
363                         }
364                 }
365                 read_unlock(&qdisc_mod_lock);
366         }
367         return q;
368 }
369
370 /* The linklayer setting were not transferred from iproute2, in older
371  * versions, and the rate tables lookup systems have been dropped in
372  * the kernel. To keep backward compatible with older iproute2 tc
373  * utils, we detect the linklayer setting by detecting if the rate
374  * table were modified.
375  *
376  * For linklayer ATM table entries, the rate table will be aligned to
377  * 48 bytes, thus some table entries will contain the same value.  The
378  * mpu (min packet unit) is also encoded into the old rate table, thus
379  * starting from the mpu, we find low and high table entries for
380  * mapping this cell.  If these entries contain the same value, when
381  * the rate tables have been modified for linklayer ATM.
382  *
383  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
384  * and then roundup to the next cell, calc the table entry one below,
385  * and compare.
386  */
387 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
388 {
389         int low       = roundup(r->mpu, 48);
390         int high      = roundup(low+1, 48);
391         int cell_low  = low >> r->cell_log;
392         int cell_high = (high >> r->cell_log) - 1;
393
394         /* rtab is too inaccurate at rates > 100Mbit/s */
395         if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
396                 pr_debug("TC linklayer: Giving up ATM detection\n");
397                 return TC_LINKLAYER_ETHERNET;
398         }
399
400         if ((cell_high > cell_low) && (cell_high < 256)
401             && (rtab[cell_low] == rtab[cell_high])) {
402                 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
403                          cell_low, cell_high, rtab[cell_high]);
404                 return TC_LINKLAYER_ATM;
405         }
406         return TC_LINKLAYER_ETHERNET;
407 }
408
409 static struct qdisc_rate_table *qdisc_rtab_list;
410
/* Look up or create a shared rate table for @r/@tab.
 *
 * Existing tables are refcounted and reused when both the ratespec and
 * the 1024-byte table body match. For a newly created table with an
 * UNAWARE linklayer, @r->linklayer is updated in place with the
 * detected type. Returns NULL on invalid input or allocation failure
 * (extack set in both cases).
 */
struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
					struct nlattr *tab,
					struct netlink_ext_ack *extack)
{
	struct qdisc_rate_table *rtab;

	if (tab == NULL || r->rate == 0 ||
	    r->cell_log == 0 || r->cell_log >= 32 ||
	    nla_len(tab) != TC_RTAB_SIZE) {
		NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
		return NULL;
	}

	/* Reuse an identical table if one is already cached. */
	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
		    !memcmp(&rtab->data, nla_data(tab), 1024)) {
			rtab->refcnt++;
			return rtab;
		}
	}

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, nla_data(tab), 1024);
		if (r->linklayer == TC_LINKLAYER_UNAWARE)
			r->linklayer = __detect_linklayer(r, rtab->data);
		/* Push onto the shared list head. */
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	} else {
		NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
	}
	return rtab;
}
EXPORT_SYMBOL(qdisc_get_rtab);
447
448 void qdisc_put_rtab(struct qdisc_rate_table *tab)
449 {
450         struct qdisc_rate_table *rtab, **rtabp;
451
452         if (!tab || --tab->refcnt)
453                 return;
454
455         for (rtabp = &qdisc_rtab_list;
456              (rtab = *rtabp) != NULL;
457              rtabp = &rtab->next) {
458                 if (rtab == tab) {
459                         *rtabp = rtab->next;
460                         kfree(rtab);
461                         return;
462                 }
463         }
464 }
465 EXPORT_SYMBOL(qdisc_put_rtab);
466
/* Shared, refcounted size tables; see qdisc_get_stab()/qdisc_put_stab(). */
static LIST_HEAD(qdisc_stab_list);

/* Netlink policy for TCA_STAB nested attributes. */
static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};
473
/* Parse a TCA_STAB attribute and return a refcounted size table.
 *
 * An existing table with identical sizespec and data is reused;
 * otherwise a new one is allocated and appended to qdisc_stab_list.
 * Returns ERR_PTR on parse/validation/allocation failure (extack set).
 */
static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
					       struct netlink_ext_ack *extack)
{
	struct nlattr *tb[TCA_STAB_MAX + 1];
	struct qdisc_size_table *stab;
	struct tc_sizespec *s;
	unsigned int tsize = 0;
	u16 *tab = NULL;
	int err;

	err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
					  extack);
	if (err < 0)
		return ERR_PTR(err);
	if (!tb[TCA_STAB_BASE]) {
		NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
		return ERR_PTR(-EINVAL);
	}

	s = nla_data(tb[TCA_STAB_BASE]);

	if (s->tsize > 0) {
		if (!tb[TCA_STAB_DATA]) {
			NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
			return ERR_PTR(-EINVAL);
		}
		tab = nla_data(tb[TCA_STAB_DATA]);
		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
	}

	/* The advertised size must match the data actually supplied. */
	if (tsize != s->tsize || (!tab && tsize > 0)) {
		NL_SET_ERR_MSG(extack, "Invalid size of size table");
		return ERR_PTR(-EINVAL);
	}

	/* Share an existing identical table when possible. */
	list_for_each_entry(stab, &qdisc_stab_list, list) {
		if (memcmp(&stab->szopts, s, sizeof(*s)))
			continue;
		if (tsize > 0 &&
		    memcmp(stab->data, tab, flex_array_size(stab, data, tsize)))
			continue;
		stab->refcnt++;
		return stab;
	}

	/* Bound the shift amounts used by __qdisc_calculate_pkt_len(). */
	if (s->size_log > STAB_SIZE_LOG_MAX ||
	    s->cell_log > STAB_SIZE_LOG_MAX) {
		NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table");
		return ERR_PTR(-EINVAL);
	}

	stab = kmalloc(struct_size(stab, data, tsize), GFP_KERNEL);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	stab->refcnt = 1;
	stab->szopts = *s;
	if (tsize > 0)
		memcpy(stab->data, tab, flex_array_size(stab, data, tsize));

	list_add_tail(&stab->list, &qdisc_stab_list);

	return stab;
}
538
539 void qdisc_put_stab(struct qdisc_size_table *tab)
540 {
541         if (!tab)
542                 return;
543
544         if (--tab->refcnt == 0) {
545                 list_del(&tab->list);
546                 kfree_rcu(tab, rcu);
547         }
548 }
549 EXPORT_SYMBOL(qdisc_put_stab);
550
551 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
552 {
553         struct nlattr *nest;
554
555         nest = nla_nest_start_noflag(skb, TCA_STAB);
556         if (nest == NULL)
557                 goto nla_put_failure;
558         if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
559                 goto nla_put_failure;
560         nla_nest_end(skb, nest);
561
562         return skb->len;
563
564 nla_put_failure:
565         return -1;
566 }
567
/* Recompute the packet's scheduling length from a size table: add the
 * per-packet overhead, map through the table (linearly extrapolating
 * past its end), scale by size_log and store the result (clamped to
 * >= 1) in qdisc_skb_cb(skb)->pkt_len.
 */
void __qdisc_calculate_pkt_len(struct sk_buff *skb,
			       const struct qdisc_size_table *stab)
{
	int pkt_len, slot;

	pkt_len = skb->len + stab->szopts.overhead;
	if (unlikely(!stab->szopts.tsize))
		goto out;

	/* Compute the table index: align, then divide by the cell size. */
	slot = pkt_len + stab->szopts.cell_align;
	if (unlikely(slot < 0))
		slot = 0;

	slot >>= stab->szopts.cell_log;
	if (likely(slot < stab->szopts.tsize))
		pkt_len = stab->data[slot];
	else
		/* Past the table end: extrapolate from the last entry. */
		pkt_len = stab->data[stab->szopts.tsize - 1] *
				(slot / stab->szopts.tsize) +
				stab->data[slot % stab->szopts.tsize];

	pkt_len <<= stab->szopts.size_log;
out:
	if (unlikely(pkt_len < 1))
		pkt_len = 1;
	qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
596
597 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
598 {
599         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
600                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
601                         txt, qdisc->ops->id, qdisc->handle >> 16);
602                 qdisc->flags |= TCQ_F_WARN_NONWC;
603         }
604 }
605 EXPORT_SYMBOL(qdisc_warn_nonwc);
606
/* hrtimer callback: kick the watchdog owner's root qdisc so the device
 * transmit path runs again. One-shot (HRTIMER_NORESTART).
 */
static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
						 timer);

	rcu_read_lock();
	__netif_schedule(qdisc_root(wd->qdisc));
	rcu_read_unlock();

	return HRTIMER_NORESTART;
}
618
/* Initialize a qdisc watchdog timer on @clockid, bound to @qdisc. */
void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
				 clockid_t clockid)
{
	hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
	wd->timer.function = qdisc_watchdog;
	wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
627
/* Convenience wrapper: watchdog on CLOCK_MONOTONIC. */
void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
	qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
}
EXPORT_SYMBOL(qdisc_watchdog_init);
633
/* Arm the watchdog to fire in [expires, expires + delta_ns].
 * No-op when the qdisc is deactivated, or when an already-queued timer
 * lands inside the requested window (avoids needless reprogramming).
 */
void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires,
				      u64 delta_ns)
{
	bool deactivated;

	rcu_read_lock();
	deactivated = test_bit(__QDISC_STATE_DEACTIVATED,
			       &qdisc_root_sleeping(wd->qdisc)->state);
	rcu_read_unlock();
	if (deactivated)
		return;

	if (hrtimer_is_queued(&wd->timer)) {
		/* If timer is already set in [expires, expires + delta_ns],
		 * do not reprogram it.
		 */
		if (wd->last_expires - expires <= delta_ns)
			return;
	}

	wd->last_expires = expires;
	hrtimer_start_range_ns(&wd->timer,
			       ns_to_ktime(expires),
			       delta_ns,
			       HRTIMER_MODE_ABS_PINNED);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns);
661
/* Cancel a pending watchdog timer, waiting for a running callback. */
void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
667
668 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
669 {
670         struct hlist_head *h;
671         unsigned int i;
672
673         h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
674
675         if (h != NULL) {
676                 for (i = 0; i < n; i++)
677                         INIT_HLIST_HEAD(&h[i]);
678         }
679         return h;
680 }
681
/* Double the class hash once the load factor exceeds 0.75.
 * The new bucket array is allocated up front; moving the entries and
 * swapping in the new hash/hashsize/hashmask happen under
 * sch_tree_lock(). Allocation failure simply leaves the old table.
 */
void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
	struct Qdisc_class_common *cl;
	struct hlist_node *next;
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;
	unsigned int i, h;

	/* Rehash when load factor exceeds 0.75 */
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
		return;
	nsize = clhash->hashsize * 2;
	nmask = nsize - 1;
	nhash = qdisc_class_hash_alloc(nsize);
	if (nhash == NULL)
		return;

	ohash = clhash->hash;
	osize = clhash->hashsize;

	sch_tree_lock(sch);
	for (i = 0; i < osize; i++) {
		/* _safe: hlist_add_head unlinks cl from the old chain. */
		hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);
		}
	}
	clhash->hash     = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

	kvfree(ohash);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);
717
718 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
719 {
720         unsigned int size = 4;
721
722         clhash->hash = qdisc_class_hash_alloc(size);
723         if (!clhash->hash)
724                 return -ENOMEM;
725         clhash->hashsize  = size;
726         clhash->hashmask  = size - 1;
727         clhash->hashelems = 0;
728         return 0;
729 }
730 EXPORT_SYMBOL(qdisc_class_hash_init);
731
/* Free the bucket array; entries themselves are owned by the qdisc. */
void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	kvfree(clhash->hash);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);
737
738 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
739                              struct Qdisc_class_common *cl)
740 {
741         unsigned int h;
742
743         INIT_HLIST_NODE(&cl->hnode);
744         h = qdisc_class_hash(cl->classid, clhash->hashmask);
745         hlist_add_head(&cl->hnode, &clhash->hash[h]);
746         clhash->hashelems++;
747 }
748 EXPORT_SYMBOL(qdisc_class_hash_insert);
749
/* Unlink class @cl from @clhash; mirror of qdisc_class_hash_insert(). */
void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	hlist_del(&cl->hnode);
	clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);
757
758 /* Allocate an unique handle from space managed by kernel
759  * Possible range is [8000-FFFF]:0000 (0x8000 values)
760  */
761 static u32 qdisc_alloc_handle(struct net_device *dev)
762 {
763         int i = 0x8000;
764         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
765
766         do {
767                 autohandle += TC_H_MAKE(0x10000U, 0);
768                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
769                         autohandle = TC_H_MAKE(0x80000000U, 0);
770                 if (!qdisc_lookup(dev, autohandle))
771                         return autohandle;
772                 cond_resched();
773         } while (--i > 0);
774
775         return 0;
776 }
777
/* Propagate a decrease of @n packets / @len bytes up the qdisc tree,
 * starting at @sch's parent. Each ancestor's qlen, backlog and drop
 * counters are adjusted; a parent class whose child just became empty
 * is informed via cl_ops->qlen_notify() so it can deactivate the class.
 * Walk stops at the root, at ingress, or at TCQ_F_NOPARENT qdiscs.
 */
void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
{
	bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;
	bool notify;
	int drops;

	if (n == 0 && len == 0)
		return;
	drops = max_t(int, n, 0);
	rcu_read_lock();
	while ((parentid = sch->parent)) {
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
			break;

		if (sch->flags & TCQ_F_NOPARENT)
			break;
		/* Notify parent qdisc only if child qdisc becomes empty.
		 *
		 * If child was empty even before update then backlog
		 * counter is screwed and we skip notification because
		 * parent class is already passive.
		 *
		 * If the original child was offloaded then it is allowed
		 * to be seem as empty, so the parent is notified anyway.
		 */
		notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
						       !qdisc_is_offloaded);
		/* TODO: perform the search on a per txq basis */
		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			WARN_ON_ONCE(parentid != TC_H_ROOT);
			break;
		}
		cops = sch->ops->cl_ops;
		if (notify && cops->qlen_notify) {
			cl = cops->find(sch, parentid);
			cops->qlen_notify(sch, cl);
		}
		sch->q.qlen -= n;
		sch->qstats.backlog -= len;
		__qdisc_qstats_drop(sch, drops);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
826
827 int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
828                               void *type_data)
829 {
830         struct net_device *dev = qdisc_dev(sch);
831         int err;
832
833         sch->flags &= ~TCQ_F_OFFLOADED;
834         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
835                 return 0;
836
837         err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
838         if (err == -EOPNOTSUPP)
839                 return 0;
840
841         if (!err)
842                 sch->flags |= TCQ_F_OFFLOADED;
843
844         return err;
845 }
846 EXPORT_SYMBOL(qdisc_offload_dump_helper);
847
/* Tell the driver about a qdisc graft via ndo_setup_tc. A driver error
 * is surfaced through extack only when the graft installs a real qdisc
 * and at least one of parent/old/new was offloaded; destroy-path grafts
 * stay silent.
 */
void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
				struct Qdisc *new, struct Qdisc *old,
				enum tc_setup_type type, void *type_data,
				struct netlink_ext_ack *extack)
{
	bool any_qdisc_is_offloaded;
	int err;

	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
		return;

	err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);

	/* Don't report error if the graft is part of destroy operation. */
	if (!err || !new || new == &noop_qdisc)
		return;

	/* Don't report error if the parent, the old child and the new
	 * one are not offloaded.
	 */
	any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
	any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
	any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;

	if (any_qdisc_is_offloaded)
		NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
}
EXPORT_SYMBOL(qdisc_offload_graft_helper);
876
/* Query driver offload capabilities for @type into @caps.
 * @caps is zeroed first, so drivers without ndo_setup_tc (or ones that
 * ignore TC_QUERY_CAPS) report no capabilities.
 */
void qdisc_offload_query_caps(struct net_device *dev,
			      enum tc_setup_type type,
			      void *caps, size_t caps_len)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	struct tc_query_caps_base base = {
		.type = type,
		.caps = caps,
	};

	memset(caps, 0, caps_len);

	if (ops->ndo_setup_tc)
		ops->ndo_setup_tc(dev, TC_QUERY_CAPS, &base);
}
EXPORT_SYMBOL(qdisc_offload_query_caps);
893
/* Notify the driver that the root qdisc of @dev is being replaced by
 * @new (or removed when @new is NULL). Ingress is flagged when either
 * side of the graft is an ingress qdisc.
 */
static void qdisc_offload_graft_root(struct net_device *dev,
				     struct Qdisc *new, struct Qdisc *old,
				     struct netlink_ext_ack *extack)
{
	struct tc_root_qopt_offload graft_offload = {
		.command	= TC_ROOT_GRAFT,
		.handle		= new ? new->handle : 0,
		.ingress	= (new && new->flags & TCQ_F_INGRESS) ||
				  (old && old->flags & TCQ_F_INGRESS),
	};

	qdisc_offload_graft_helper(dev, NULL, new, old,
				   TC_SETUP_ROOT_QDISC, &graft_offload, extack);
}
908
/* Build one netlink message (@event: RTM_NEWQDISC or RTM_DELQDISC)
 * describing qdisc @q under parent classid @clid into @skb.
 * Returns skb->len on success, -1 on failure; on failure the skb is
 * trimmed back to its state on entry, so partial attributes never leak.
 * Runs under RTNL (see rtnl_dereference of q->stab below).
 */
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 portid, u32 seq, u16 flags, int event)
{
	struct gnet_stats_basic_sync __percpu *cpu_bstats = NULL;
	struct gnet_stats_queue __percpu *cpu_qstats = NULL;
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);	/* rollback point for trim */
	struct gnet_dump d;
	struct qdisc_size_table *stab;
	u32 block_index;
	__u32 qlen;

	/* Dump callers may loop over many qdiscs; be scheduler friendly. */
	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	/* tcm_info carries the refcount for qdisc dumps (historic ABI). */
	tcm->tcm_info = refcount_read(&q->refcnt);
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	/* Shared-block indexes are only reported when non-zero. */
	if (q->ops->ingress_block_get) {
		block_index = q->ops->ingress_block_get(q);
		if (block_index &&
		    nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
			goto nla_put_failure;
	}
	if (q->ops->egress_block_get) {
		block_index = q->ops->egress_block_get(q);
		if (block_index &&
		    nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
			goto nla_put_failure;
	}
	/* Qdisc-specific options (TCA_OPTIONS) via the ops dump hook. */
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
		goto nla_put_failure;
	qlen = qdisc_qlen_sum(q);

	stab = rtnl_dereference(q->stab);
	if (stab && qdisc_dump_stab(skb, stab) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 NULL, &d, TCA_PAD) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	/* Lockless qdiscs keep their counters in per-CPU copies. */
	if (qdisc_is_percpu_stats(q)) {
		cpu_bstats = q->cpu_bstats;
		cpu_qstats = q->cpu_qstats;
	}

	if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats, true) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
986
987 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
988 {
989         if (q->flags & TCQ_F_BUILTIN)
990                 return true;
991         if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
992                 return true;
993
994         return false;
995 }
996
997 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
998                         struct nlmsghdr *n, u32 clid,
999                         struct Qdisc *old, struct Qdisc *new)
1000 {
1001         struct sk_buff *skb;
1002         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1003
1004         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1005         if (!skb)
1006                 return -ENOBUFS;
1007
1008         if (old && !tc_qdisc_dump_ignore(old, false)) {
1009                 if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
1010                                   0, RTM_DELQDISC) < 0)
1011                         goto err_out;
1012         }
1013         if (new && !tc_qdisc_dump_ignore(new, false)) {
1014                 if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
1015                                   old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1016                         goto err_out;
1017         }
1018
1019         if (skb->len)
1020                 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1021                                       n->nlmsg_flags & NLM_F_ECHO);
1022
1023 err_out:
1024         kfree_skb(skb);
1025         return -EINVAL;
1026 }
1027
1028 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
1029                                struct nlmsghdr *n, u32 clid,
1030                                struct Qdisc *old, struct Qdisc *new)
1031 {
1032         if (new || old)
1033                 qdisc_notify(net, skb, n, clid, old, new);
1034
1035         if (old)
1036                 qdisc_put(old);
1037 }
1038
1039 static void qdisc_clear_nolock(struct Qdisc *sch)
1040 {
1041         sch->flags &= ~TCQ_F_NOLOCK;
1042         if (!(sch->flags & TCQ_F_CPUSTATS))
1043                 return;
1044
1045         free_percpu(sch->cpu_bstats);
1046         free_percpu(sch->cpu_qstats);
1047         sch->cpu_bstats = NULL;
1048         sch->cpu_qstats = NULL;
1049         sch->flags &= ~TCQ_F_CPUSTATS;
1050 }
1051
1052 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
1053  * to device "dev".
1054  *
1055  * When appropriate send a netlink notification using 'skb'
1056  * and "n".
1057  *
1058  * On success, destroy old qdisc.
1059  */
1060
static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old,
		       struct netlink_ext_ack *extack)
{
	struct Qdisc *q = old;
	struct net *net = dev_net(dev);

	if (parent == NULL) {
		/* Root (or ingress) graft: replace the device-level qdisc. */
		unsigned int i, num_q, ingress;

		ingress = 0;
		num_q = dev->num_tx_queues;
		/* Ingress/clsact qdiscs live on the single ingress queue
		 * rather than on each TX queue.
		 */
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			num_q = 1;
			ingress = 1;
			if (!dev_ingress_queue(dev)) {
				NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
				return -ENOENT;
			}
		}

		/* Quiesce the datapath while qdiscs are being swapped. */
		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		qdisc_offload_graft_root(dev, new, old, extack);

		/* Qdiscs with ->attach (e.g. multi-queue roots) install
		 * themselves across TX queues later; skip the per-queue loop.
		 */
		if (new && new->ops->attach && !ingress)
			goto skip;

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = dev_ingress_queue(dev);

			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);

			old = dev_graft_qdisc(dev_queue, new);
			/* One extra reference per additional TX queue. */
			if (new && i > 0)
				qdisc_refcount_inc(new);

			if (!ingress)
				qdisc_put(old);
		}

skip:
		if (!ingress) {
			/* Publish the new root; noop_qdisc stands in for NULL. */
			old = rtnl_dereference(dev->qdisc);
			if (new && !new->ops->attach)
				qdisc_refcount_inc(new);
			rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc);

			notify_and_destroy(net, skb, n, classid, old, new);

			if (new && new->ops->attach)
				new->ops->attach(new);
		} else {
			notify_and_destroy(net, skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		/* Graft into a class of an existing parent qdisc. */
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
		unsigned long cl;
		int err;

		/* Only support running class lockless if parent is lockless */
		if (new && (new->flags & TCQ_F_NOLOCK) && !(parent->flags & TCQ_F_NOLOCK))
			qdisc_clear_nolock(new);

		if (!cops || !cops->graft)
			return -EOPNOTSUPP;

		cl = cops->find(parent, classid);
		if (!cl) {
			NL_SET_ERR_MSG(extack, "Specified class not found");
			return -ENOENT;
		}

		if (new && new->ops == &noqueue_qdisc_ops) {
			NL_SET_ERR_MSG(extack, "Cannot assign noqueue to a class");
			return -EINVAL;
		}

		/* The class hook swaps the child and returns the old one
		 * in @old for notify_and_destroy() to release.
		 */
		err = cops->graft(parent, cl, new, &old, extack);
		if (err)
			return err;
		notify_and_destroy(net, skb, n, classid, old, new);
	}
	return 0;
}
1153
1154 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1155                                    struct netlink_ext_ack *extack)
1156 {
1157         u32 block_index;
1158
1159         if (tca[TCA_INGRESS_BLOCK]) {
1160                 block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1161
1162                 if (!block_index) {
1163                         NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1164                         return -EINVAL;
1165                 }
1166                 if (!sch->ops->ingress_block_set) {
1167                         NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1168                         return -EOPNOTSUPP;
1169                 }
1170                 sch->ops->ingress_block_set(sch, block_index);
1171         }
1172         if (tca[TCA_EGRESS_BLOCK]) {
1173                 block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1174
1175                 if (!block_index) {
1176                         NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1177                         return -EINVAL;
1178                 }
1179                 if (!sch->ops->egress_block_set) {
1180                         NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1181                         return -EOPNOTSUPP;
1182                 }
1183                 sch->ops->egress_block_set(sch, block_index);
1184         }
1185         return 0;
1186 }
1187
1188 /*
1189    Allocate and initialize new qdisc.
1190
1191    Parameters are passed via opt.
1192  */
1193
static struct Qdisc *qdisc_create(struct net_device *dev,
				  struct netdev_queue *dev_queue,
				  u32 parent, u32 handle,
				  struct nlattr **tca, int *errp,
				  struct netlink_ext_ack *extack)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	/* Look up the qdisc ops by kind name; on success this takes a
	 * module reference that is dropped on the error paths below.
	 */
	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (nla_strscpy(name, kind, IFNAMSIZ) >= 0) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load.  So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request.  We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the mean time.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try again qdisc_lookup_ops,
				 * so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (!ops) {
		NL_SET_ERR_MSG(extack, "Specified qdisc kind is unknown");
		goto err_out;
	}

	sch = qdisc_alloc(dev_queue, ops, extack);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	/* Handle assignment: ingress/clsact use the reserved ingress
	 * handle; otherwise auto-allocate when the user passed zero.
	 */
	if (handle == TC_H_INGRESS) {
		if (!(sch->flags & TCQ_F_INGRESS)) {
			NL_SET_ERR_MSG(extack,
				       "Specified parent ID is reserved for ingress and clsact Qdiscs");
			err = -EINVAL;
			goto err_out3;
		}
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
	} else {
		if (handle == 0) {
			handle = qdisc_alloc_handle(dev);
			if (handle == 0) {
				NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
				err = -ENOSPC;
				goto err_out3;
			}
		}
		if (!netif_is_multiqueue(dev))
			sch->flags |= TCQ_F_ONETXQUEUE;
	}

	sch->handle = handle;

	/* This exist to keep backward compatible with a userspace
	 * loophole, what allowed userspace to get IFF_NO_QUEUE
	 * facility on older kernels by setting tx_queue_len=0 (prior
	 * to qdisc init), and then forgot to reinit tx_queue_len
	 * before again attaching a qdisc.
	 */
	if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
		netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
	}

	err = qdisc_block_indexes_set(sch, tca, extack);
	if (err)
		goto err_out3;

	if (ops->init) {
		err = ops->init(sch, tca[TCA_OPTIONS], extack);
		if (err != 0)
			goto err_out5;
	}

	/* Optional size table (TCA_STAB) and rate estimator (TCA_RATE);
	 * failures past this point unwind through err_out4, which also
	 * calls ops->destroy since init already succeeded.
	 */
	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB], extack);
		if (IS_ERR(stab)) {
			err = PTR_ERR(stab);
			goto err_out4;
		}
		rcu_assign_pointer(sch->stab, stab);
	}
	if (tca[TCA_RATE]) {
		err = -EOPNOTSUPP;
		if (sch->flags & TCQ_F_MQROOT) {
			NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
			goto err_out4;
		}

		err = gen_new_estimator(&sch->bstats,
					sch->cpu_bstats,
					&sch->rate_est,
					NULL,
					true,
					tca[TCA_RATE]);
		if (err) {
			NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
			goto err_out4;
		}
	}

	qdisc_hash_add(sch, false);
	trace_qdisc_create(ops, dev, parent);

	return sch;

err_out5:
	/* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
	if (ops->destroy)
		ops->destroy(sch);
err_out3:
	netdev_put(dev, &sch->dev_tracker);
	qdisc_free(sch);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;

err_out4:
	/*
	 * Any broken qdiscs that would require a ops->reset() here?
	 * The qdisc was never in action so it shouldn't be necessary.
	 */
	qdisc_put_stab(rtnl_dereference(sch->stab));
	if (ops->destroy)
		ops->destroy(sch);
	goto err_out3;
}
1348
1349 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
1350                         struct netlink_ext_ack *extack)
1351 {
1352         struct qdisc_size_table *ostab, *stab = NULL;
1353         int err = 0;
1354
1355         if (tca[TCA_OPTIONS]) {
1356                 if (!sch->ops->change) {
1357                         NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
1358                         return -EINVAL;
1359                 }
1360                 if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1361                         NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
1362                         return -EOPNOTSUPP;
1363                 }
1364                 err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
1365                 if (err)
1366                         return err;
1367         }
1368
1369         if (tca[TCA_STAB]) {
1370                 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1371                 if (IS_ERR(stab))
1372                         return PTR_ERR(stab);
1373         }
1374
1375         ostab = rtnl_dereference(sch->stab);
1376         rcu_assign_pointer(sch->stab, stab);
1377         qdisc_put_stab(ostab);
1378
1379         if (tca[TCA_RATE]) {
1380                 /* NB: ignores errors from replace_estimator
1381                    because change can't be undone. */
1382                 if (sch->flags & TCQ_F_MQROOT)
1383                         goto out;
1384                 gen_replace_estimator(&sch->bstats,
1385                                       sch->cpu_bstats,
1386                                       &sch->rate_est,
1387                                       NULL,
1388                                       true,
1389                                       tca[TCA_RATE]);
1390         }
1391 out:
1392         return 0;
1393 }
1394
/* Walker state for check_loop(): detects whether grafting would make
 * qdisc "p" its own descendant.
 */
struct check_loop_arg {
	struct qdisc_walker	w;	/* generic class walker; fn = check_loop_fn */
	struct Qdisc		*p;	/* prospective parent we must not re-encounter */
	int			depth;	/* current recursion depth (bounded in check_loop_fn) */
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl,
			 struct qdisc_walker *w);
1403
1404 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1405 {
1406         struct check_loop_arg   arg;
1407
1408         if (q->ops->cl_ops == NULL)
1409                 return 0;
1410
1411         arg.w.stop = arg.w.skip = arg.w.count = 0;
1412         arg.w.fn = check_loop_fn;
1413         arg.depth = depth;
1414         arg.p = p;
1415         q->ops->cl_ops->walk(q, &arg.w);
1416         return arg.w.stop ? -ELOOP : 0;
1417 }
1418
1419 static int
1420 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1421 {
1422         struct Qdisc *leaf;
1423         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1424         struct check_loop_arg *arg = (struct check_loop_arg *)w;
1425
1426         leaf = cops->leaf(q, cl);
1427         if (leaf) {
1428                 if (leaf == arg->p || arg->depth > 7)
1429                         return -ELOOP;
1430                 return check_loop(leaf, arg->p, arg->depth + 1);
1431         }
1432         return 0;
1433 }
1434
/* Netlink attribute policy shared by the RTM_*QDISC and RTM_*TCLASS
 * request handlers in this file.
 */
const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
	[TCA_KIND]		= { .type = NLA_STRING },
	[TCA_RATE]		= { .type = NLA_BINARY,
				    .len = sizeof(struct tc_estimator) },
	[TCA_STAB]		= { .type = NLA_NESTED },
	[TCA_DUMP_INVISIBLE]	= { .type = NLA_FLAG },
	[TCA_CHAIN]		= { .type = NLA_U32 },
	[TCA_INGRESS_BLOCK]	= { .type = NLA_U32 },
	[TCA_EGRESS_BLOCK]	= { .type = NLA_U32 },
};
1445
1446 /*
1447  * Delete/get qdisc.
1448  */
1449
static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
			struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
				     rtm_tca_policy, extack);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	/* Resolve the target qdisc either through its parent classid
	 * (tcm_parent), through the ingress queue, through the device
	 * root (TC_H_ROOT), or directly by handle when no parent given.
	 */
	clid = tcm->tcm_parent;
	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p) {
					NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
					return -ENOENT;
				}
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue(dev)) {
				q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping);
			}
		} else {
			q = rtnl_dereference(dev->qdisc);
		}
		if (!q) {
			NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
			return -ENOENT;
		}

		/* If a handle was also given it must match the one found. */
		if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
			NL_SET_ERR_MSG(extack, "Invalid handle");
			return -EINVAL;
		}
	} else {
		q = qdisc_lookup(dev, tcm->tcm_handle);
		if (!q) {
			NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
			return -ENOENT;
		}
	}

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
		return -EINVAL;
	}

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid) {
			NL_SET_ERR_MSG(extack, "Classid cannot be zero");
			return -EINVAL;
		}
		if (q->handle == 0) {
			NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
			return -ENOENT;
		}
		/* Delete: graft NULL in place of q; this notifies and
		 * destroys the old qdisc on success.
		 */
		err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
		if (err != 0)
			return err;
	} else {
		/* RTM_GETQDISC: just echo the qdisc back to the caller. */
		qdisc_notify(net, skb, n, clid, NULL, q);
	}
	return 0;
}
1526
1527 /*
1528  * Create/change qdisc.
1529  */
1530
static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
			   struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

replay:
	/* Reinit, just in case something touches this. */
	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
				     rtm_tca_policy, extack);
	if (err < 0)
		return err;

	tcm = nlmsg_data(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;


	/* Locate the existing qdisc (if any) at the requested position:
	 * under a parent classid, on the ingress queue, or at the root.
	 */
	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p) {
					NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
					return -ENOENT;
				}
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue_create(dev)) {
				q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping);
			}
		} else {
			q = rtnl_dereference(dev->qdisc);
		}

		/* It may be default qdisc, ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				/* Caller named an explicit handle: either
				 * re-graft that existing qdisc here, or fall
				 * through to create it.
				 */
				if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
					NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
					return -EEXIST;
				}
				if (TC_H_MIN(tcm->tcm_handle)) {
					NL_SET_ERR_MSG(extack, "Invalid minor handle");
					return -EINVAL;
				}
				q = qdisc_lookup(dev, tcm->tcm_handle);
				if (!q)
					goto create_n_graft;
				if (n->nlmsg_flags & NLM_F_EXCL) {
					NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
					return -EEXIST;
				}
				if (tca[TCA_KIND] &&
				    nla_strcmp(tca[TCA_KIND], q->ops->id)) {
					NL_SET_ERR_MSG(extack, "Invalid qdisc name");
					return -EINVAL;
				}
				if (q->flags & TCQ_F_INGRESS) {
					NL_SET_ERR_MSG(extack,
						       "Cannot regraft ingress or clsact Qdiscs");
					return -EINVAL;
				}
				if (q == p ||
				    (p && check_loop(q, p, 0))) {
					NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
					return -ELOOP;
				}
				if (clid == TC_H_INGRESS) {
					NL_SET_ERR_MSG(extack, "Ingress cannot graft directly");
					return -EINVAL;
				}
				qdisc_refcount_inc(q);
				goto graft;
			} else {
				if (!q)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know, that some child q is already
				 *   attached to this parent and have choice:
				 *   either to change it or to create/graft new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if CREATE and REPLACE flags are set.
				 *
				 *   2. If EXCL is set, requestor wanted to say,
				 *   that qdisc tcm_handle is not expected
				 *   to exist, so that we choose create/graft too.
				 *
				 *   3. The last case is when no flags are set.
				 *   Alas, it is sort of hole in API, we
				 *   cannot decide what to do unambiguously.
				 *   For now we select create/graft, if
				 *   user gave KIND, which does not match existing.
				 */
				if ((n->nlmsg_flags & NLM_F_CREATE) &&
				    (n->nlmsg_flags & NLM_F_REPLACE) &&
				    ((n->nlmsg_flags & NLM_F_EXCL) ||
				     (tca[TCA_KIND] &&
				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle) {
			NL_SET_ERR_MSG(extack, "Handle cannot be zero");
			return -EINVAL;
		}
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (!q) {
		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
		return -ENOENT;
	}
	if (n->nlmsg_flags & NLM_F_EXCL) {
		NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
		return -EEXIST;
	}
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
		return -EINVAL;
	}
	err = qdisc_change(q, tca, extack);
	if (err == 0)
		qdisc_notify(net, skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags & NLM_F_CREATE)) {
		NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
		return -ENOENT;
	}
	/* Pick the netdev queue the new qdisc will hang off, then create it. */
	if (clid == TC_H_INGRESS) {
		if (dev_ingress_queue(dev)) {
			q = qdisc_create(dev, dev_ingress_queue(dev),
					 tcm->tcm_parent, tcm->tcm_parent,
					 tca, &err, extack);
		} else {
			NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
			err = -ENOENT;
		}
	} else {
		struct netdev_queue *dev_queue;

		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
		else if (p)
			dev_queue = p->dev_queue;
		else
			dev_queue = netdev_get_tx_queue(dev, 0);

		q = qdisc_create(dev, dev_queue,
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err, extack);
	}
	if (q == NULL) {
		/* -EAGAIN means a module was loaded with RTNL dropped;
		 * replay the whole request from scratch.
		 */
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
	if (err) {
		if (q)
			qdisc_put(q);
		return err;
	}

	return 0;
}
1717
/* Dump @root (and optionally the device-wide qdisc hash behind it) into
 * a netlink dump reply.
 *
 * @q_idx_p: in/out running index of qdiscs for this device; entries below
 *	@s_q_idx were emitted by a previous dump pass and are skipped.
 * @recur: after @root, also walk qdisc_dev(root)->qdisc_hash (callers pass
 *	false for the ingress root, which the hash walk already covered).
 * @dump_invisible: include qdiscs normally hidden from dumps
 *	(TCA_DUMP_INVISIBLE).
 *
 * Returns 0 when done and -1 when tc_fill_qdisc() could not fit another
 * entry into @skb; in both cases *q_idx_p records where to resume.
 */
static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
			      struct netlink_callback *cb,
			      int *q_idx_p, int s_q_idx, bool recur,
			      bool dump_invisible)
{
	int ret = 0, q_idx = *q_idx_p;
	struct Qdisc *q;
	int b;

	if (!root)
		return 0;

	q = root;
	if (q_idx < s_q_idx) {
		q_idx++;
	} else {
		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
				  RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

	/* If dumping singletons, there is no qdisc_dev(root) and the singleton
	 * itself has already been dumped.
	 *
	 * If we've already dumped the top-level (ingress) qdisc above and the global
	 * qdisc hashtable, we don't want to hit it again
	 */
	if (!qdisc_dev(root) || !recur)
		goto out;

	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
		if (q_idx < s_q_idx) {
			q_idx++;
			continue;
		}
		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
				  RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

out:
	*q_idx_p = q_idx;
	return ret;
done:
	ret = -1;
	goto out;
}
1771
/* RTM_GETQDISC dump handler: walk every netdev in the namespace and emit
 * its root qdisc tree, then its ingress qdisc, resuming from the device
 * and qdisc indices stashed in cb->args[0]/cb->args[1] by a previous
 * pass. Runs under RTNL (ASSERT_RTNL), hence the rtnl_dereference()s.
 */
static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;
	const struct nlmsghdr *nlh = cb->nlh;
	struct nlattr *tca[TCA_MAX + 1];
	int err;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];

	idx = 0;
	ASSERT_RTNL();

	err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
				     rtm_tca_policy, cb->extack);
	if (err < 0)
		return err;

	for_each_netdev(net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;	/* device finished in an earlier pass */
		if (idx > s_idx)
			s_q_idx = 0;	/* fresh device: restart qdisc index */
		q_idx = 0;

		if (tc_dump_qdisc_root(rtnl_dereference(dev->qdisc),
				       skb, cb, &q_idx, s_q_idx,
				       true, tca[TCA_DUMP_INVISIBLE]) < 0)
			goto done;

		dev_queue = dev_ingress_queue(dev);
		if (dev_queue &&
		    tc_dump_qdisc_root(rtnl_dereference(dev_queue->qdisc_sleeping),
				       skb, cb, &q_idx, s_q_idx, false,
				       tca[TCA_DUMP_INVISIBLE]) < 0)
			goto done;

cont:
		idx++;
	}

done:
	/* Save resume state for the next dump invocation. */
	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}
1824
1825
1826
1827 /************************************************
1828  *      Traffic classes manipulation.           *
1829  ************************************************/
1830
/* Fill one traffic-class netlink message (tcmsg header, TCA_KIND, class
 * attributes and statistics) for class @cl of qdisc @q into @skb.
 *
 * Returns skb->len on success; on any failure everything added here is
 * trimmed off again and -1 is returned (the caller decides whether that
 * means "dump buffer full" or a hard error).
 */
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 portid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);	/* rollback point */
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	/* Defaults; the class ->dump() callback below normally overwrites
	 * tcm_parent/tcm_handle with the real class handles.
	 */
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 NULL, &d, TCA_PAD) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
1876
1877 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1878                          struct nlmsghdr *n, struct Qdisc *q,
1879                          unsigned long cl, int event)
1880 {
1881         struct sk_buff *skb;
1882         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1883
1884         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1885         if (!skb)
1886                 return -ENOBUFS;
1887
1888         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1889                 kfree_skb(skb);
1890                 return -EINVAL;
1891         }
1892
1893         return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1894                               n->nlmsg_flags & NLM_F_ECHO);
1895 }
1896
/* Delete class @cl from @q and, on success, notify RTNLGRP_TC listeners
 * with an RTM_DELTCLASS message.
 *
 * The notification skb is filled *before* ->delete() runs, while the
 * class state can still be dumped; it is freed unsent if the delete
 * fails. Returns 0 or a negative errno.
 */
static int tclass_del_notify(struct net *net,
			     const struct Qdisc_class_ops *cops,
			     struct sk_buff *oskb, struct nlmsghdr *n,
			     struct Qdisc *q, unsigned long cl,
			     struct netlink_ext_ack *extack)
{
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
	struct sk_buff *skb;
	int err = 0;

	if (!cops->delete)
		return -EOPNOTSUPP;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
			   RTM_DELTCLASS) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	err = cops->delete(q, cl, extack);
	if (err) {
		kfree_skb(skb);
		return err;
	}

	err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
			     n->nlmsg_flags & NLM_F_ECHO);
	return err;
}
1930
1931 #ifdef CONFIG_NET_CLS
1932
/* Walker state for tcf_node_bind(), carried through tcf_proto ->walk(). */
struct tcf_bind_args {
	struct tcf_walker w;	/* must stay first: tcf_node_bind() casts back */
	unsigned long base;	/* base handle passed through to ->bind_class() */
	unsigned long cl;	/* class cookie to (re)bind to; 0 unbinds */
	u32 classid;		/* classid whose filters are being rebound */
};
1939
1940 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
1941 {
1942         struct tcf_bind_args *a = (void *)arg;
1943
1944         if (n && tp->ops->bind_class) {
1945                 struct Qdisc *q = tcf_block_q(tp->chain->block);
1946
1947                 sch_tree_lock(q);
1948                 tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
1949                 sch_tree_unlock(q);
1950         }
1951         return 0;
1952 }
1953
/* Walker state for tc_bind_class_walker(), embedded in a qdisc_walker. */
struct tc_bind_class_args {
	struct qdisc_walker w;	/* must stay first: the callback casts back */
	unsigned long new_cl;	/* class cookie to rebind matching filters to */
	u32 portid;		/* requester portid (not read by the walker) */
	u32 clid;		/* classid being deleted or replaced */
};
1960
/* qdisc_walker callback: for each class @cl of @q, walk every filter in
 * every chain of the class's tcf_block and rebind filters referencing
 * a->clid to the new class cookie a->new_cl (0 when the class is being
 * deleted). Always returns 0 so the class walk continues.
 */
static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl,
				struct qdisc_walker *w)
{
	struct tc_bind_class_args *a = (struct tc_bind_class_args *)w;
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct tcf_block *block;
	struct tcf_chain *chain;

	block = cops->tcf_block(q, cl, NULL);
	if (!block)
		return 0;
	for (chain = tcf_get_next_chain(block, NULL);
	     chain;
	     chain = tcf_get_next_chain(block, chain)) {
		struct tcf_proto *tp;

		for (tp = tcf_get_next_proto(chain, NULL);
		     tp; tp = tcf_get_next_proto(chain, tp)) {
			struct tcf_bind_args arg = {};

			arg.w.fn = tcf_node_bind;	/* per-filter callback */
			arg.classid = a->clid;
			arg.base = cl;
			arg.cl = a->new_cl;
			tp->ops->walk(tp, &arg.w, true);
		}
	}

	return 0;
}
1991
1992 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1993                            unsigned long new_cl)
1994 {
1995         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1996         struct tc_bind_class_args args = {};
1997
1998         if (!cops->tcf_block)
1999                 return;
2000         args.portid = portid;
2001         args.clid = clid;
2002         args.new_cl = new_cl;
2003         args.w.fn = tc_bind_class_walker;
2004         q->ops->cl_ops->walk(q, &args.w);
2005 }
2006
2007 #else
2008
/* Without CONFIG_NET_CLS there are no classifiers to rebind: no-op. */
static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
			   unsigned long new_cl)
{
}
2013
2014 #endif
2015
/* RTM_{NEW,DEL,GET}TCLASS handler: resolve the owning qdisc from the
 * tcmsg parent/handle fields, look up the target class, then create,
 * modify, delete or report it via the qdisc's class ops. Runs under
 * RTNL (rtnetlink doit callback).
 */
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
			 struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 portid;
	u32 clid;
	u32 qid;
	int err;

	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
				     rtm_tca_policy, extack);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0         - parent is root class.
	   parent == X:Y         - parent is a node in hierarchy.
	   parent == 0:Y         - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0         - generate handle from kernel pool.
	   handle == 0:Y         - class is X:Y, where X:0 is qdisc.
	   handle == X:Y         - clear.
	   handle == X:0         - root class.
	 */

	/* Step 1. Determine qdisc handle X:0 */

	/* NB: despite the name, 'portid' holds the parent class handle
	 * here, not a netlink portid.
	 */
	portid = tcm->tcm_parent;
	clid = tcm->tcm_handle;
	qid = TC_H_MAJ(clid);

	if (portid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(portid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = rtnl_dereference(dev->qdisc)->handle;

		/* Now qid is genuine qdisc handle consistent
		 * both with parent and child.
		 *
		 * TC_H_MAJ(portid) still may be unspecified, complete it now.
		 */
		if (portid)
			portid = TC_H_MAKE(qid, portid);
	} else {
		if (qid == 0)
			qid = rtnl_dereference(dev->qdisc)->handle;
	}

	/* OK. Locate qdisc */
	q = qdisc_lookup(dev, qid);
	if (!q)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (portid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->find(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS ||
		    !(n->nlmsg_flags & NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags & NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = tclass_del_notify(net, cops, skb, n, q, cl, extack);
			/* Unbind the class's filters by rebinding them to 0 */
			tc_bind_tclass(q, portid, clid, 0);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
		NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
		return -EOPNOTSUPP;
	}

	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, portid, tca, &new_cl, extack);
	if (err == 0) {
		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
		/* We just create a new class, need to do reverse binding. */
		if (cl != new_cl)
			tc_bind_tclass(q, portid, clid, new_cl);
	}
out:
	return err;
}
2148
/* Walker state for qdisc_class_dump(), embedded in a qdisc_walker. */
struct qdisc_dump_args {
	struct qdisc_walker	w;	/* must stay first: callback casts back */
	struct sk_buff		*skb;	/* dump reply being filled */
	struct netlink_callback	*cb;	/* netlink dump state (seq, portid) */
};
2154
2155 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
2156                             struct qdisc_walker *arg)
2157 {
2158         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
2159
2160         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2161                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
2162                               RTM_NEWTCLASS);
2163 }
2164
/* Dump all classes of one qdisc @q into a class-dump reply.
 *
 * *t_p counts qdiscs visited; qdiscs below @s_t were completed by a
 * previous dump pass. cb->args[1] carries the per-qdisc class resume
 * index. Returns 0 to proceed to the next qdisc, -1 when @skb is full.
 */
static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	/* Skip hidden qdiscs, already-dumped ones, classless qdiscs, and
	 * qdiscs that don't match a parent filter in the request.
	 */
	if (tc_qdisc_dump_ignore(q, false) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	/* Fresh qdisc: clear per-qdisc resume state (args[1..]). */
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop  = 0;
	arg.w.skip = cb->args[1];
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	cb->args[1] = arg.w.count;
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}
2193
/* Dump classes of @root and, when @recur, of every qdisc hashed on its
 * device. When the request pinned a parent major (tcm->tcm_parent),
 * only that one additional qdisc is visited instead of the full hash.
 * Returns 0 on completion or -1 when @skb is full.
 */
static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
			       struct tcmsg *tcm, struct netlink_callback *cb,
			       int *t_p, int s_t, bool recur)
{
	struct Qdisc *q;
	int b;

	if (!root)
		return 0;

	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
		return -1;

	/* Singleton roots have no device hash; ingress roots were already
	 * covered by the device-wide walk.
	 */
	if (!qdisc_dev(root) || !recur)
		return 0;

	if (tcm->tcm_parent) {
		q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
		if (q && q != root &&
		    tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
		return 0;
	}
	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
	}

	return 0;
}
2224
/* RTM_GETTCLASS dump handler: dump classes of the device's root qdisc
 * tree, then of its ingress qdisc, resuming from the qdisc index saved
 * in cb->args[0]. Takes and releases a reference on the device.
 */
static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = nlmsg_data(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
		return 0;
	dev = dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return 0;

	s_t = cb->args[0];
	t = 0;

	if (tc_dump_tclass_root(rtnl_dereference(dev->qdisc),
				skb, tcm, cb, &t, s_t, true) < 0)
		goto done;

	dev_queue = dev_ingress_queue(dev);
	if (dev_queue &&
	    tc_dump_tclass_root(rtnl_dereference(dev_queue->qdisc_sleeping),
				skb, tcm, cb, &t, s_t, false) < 0)
		goto done;

done:
	cb->args[0] = t;	/* resume point for the next pass */

	dev_put(dev);
	return skb->len;
}
2258
2259 #ifdef CONFIG_PROC_FS
2260 static int psched_show(struct seq_file *seq, void *v)
2261 {
2262         seq_printf(seq, "%08x %08x %08x %08x\n",
2263                    (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2264                    1000000,
2265                    (u32)NSEC_PER_SEC / hrtimer_resolution);
2266
2267         return 0;
2268 }
2269
2270 static int __net_init psched_net_init(struct net *net)
2271 {
2272         struct proc_dir_entry *e;
2273
2274         e = proc_create_single("psched", 0, net->proc_net, psched_show);
2275         if (e == NULL)
2276                 return -ENOMEM;
2277
2278         return 0;
2279 }
2280
/* Remove the per-namespace /proc/net/psched entry. */
static void __net_exit psched_net_exit(struct net *net)
{
	remove_proc_entry("psched", net->proc_net);
}
2285 #else
/* CONFIG_PROC_FS disabled: nothing to create. */
static int __net_init psched_net_init(struct net *net)
{
	return 0;
}
2290
/* CONFIG_PROC_FS disabled: nothing to remove. */
static void __net_exit psched_net_exit(struct net *net)
{
}
2294 #endif
2295
/* Per-network-namespace setup/teardown of /proc/net/psched. */
static struct pernet_operations psched_net_ops = {
	.init = psched_net_init,
	.exit = psched_net_exit,
};
2300
/* Packet scheduler core init: register pernet /proc support, the
 * built-in qdiscs, and the rtnetlink handlers for qdisc and class
 * manipulation. Returns 0 or the pernet registration error.
 */
static int __init pktsched_init(void)
{
	int err;

	err = register_pernet_subsys(&psched_net_ops);
	if (err) {
		pr_err("pktsched_init: "
		       "cannot initialize per netns operations\n");
		return err;
	}

	/* Register the always-built-in qdiscs. */
	register_qdisc(&pfifo_fast_ops);
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	register_qdisc(&pfifo_head_drop_qdisc_ops);
	register_qdisc(&mq_qdisc_ops);
	register_qdisc(&noqueue_qdisc_ops);

	/* rtnetlink message handlers for qdisc and class operations. */
	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
		      0);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
		      0);

	return 0;
}
2330
/* Run during subsystem initialization, earlier than module_init. */
subsys_initcall(pktsched_init);