0d99df1e764db812f5dfc78a9c54832c0f676f70
[platform/kernel/linux-rpi.git] / net / sched / sch_api.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_api.c  Packet scheduler API.
4  *
5  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
6  *
7  * Fixes:
8  *
9  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
10  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
11  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
12  */
13
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/string.h>
18 #include <linux/errno.h>
19 #include <linux/skbuff.h>
20 #include <linux/init.h>
21 #include <linux/proc_fs.h>
22 #include <linux/seq_file.h>
23 #include <linux/kmod.h>
24 #include <linux/list.h>
25 #include <linux/hrtimer.h>
26 #include <linux/slab.h>
27 #include <linux/hashtable.h>
28
29 #include <net/net_namespace.h>
30 #include <net/sock.h>
31 #include <net/netlink.h>
32 #include <net/pkt_sched.h>
33 #include <net/pkt_cls.h>
34
35 /*
36
37    Short review.
38    -------------
39
40    This file consists of two interrelated parts:
41
42    1. queueing disciplines manager frontend.
43    2. traffic classes manager frontend.
44
45    Generally, queueing discipline ("qdisc") is a black box,
46    which is able to enqueue packets and to dequeue them (when
47    device is ready to send something) in order and at times
48    determined by algorithm hidden in it.
49
   qdiscs are divided into two categories:
51    - "queues", which have no internal structure visible from outside.
52    - "schedulers", which split all the packets to "traffic classes",
53      using "packet classifiers" (look at cls_api.c)
54
55    In turn, classes may have child qdiscs (as rule, queues)
56    attached to them etc. etc. etc.
57
   The goal of the routines in this file is to translate
   information supplied by the user in the form of handles
   into a form more intelligible to the kernel, to perform sanity
   checks and the part of the work that is common to all qdiscs,
   and to provide rtnetlink notifications.
63
64    All real intelligent work is done inside qdisc modules.
65
66
67
68    Every discipline has two major routines: enqueue and dequeue.
69
70    ---dequeue
71
72    dequeue usually returns a skb to send. It is allowed to return NULL,
73    but it does not mean that queue is empty, it just means that
74    discipline does not want to send anything this time.
75    Queue is really empty if q->q.qlen == 0.
   For complicated disciplines with multiple queues, q->q is not
   a real packet queue; nevertheless q->q.qlen must be valid.
78
79    ---enqueue
80
   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code.
84    NET_XMIT_DROP        - this packet dropped
85      Expected action: do not backoff, but wait until queue will clear.
86    NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
87      Expected action: backoff or ignore
88
89    Auxiliary routines:
90
91    ---peek
92
93    like dequeue but without removing a packet from the queue
94
95    ---reset
96
97    returns qdisc to initial state: purge all buffers, clear all
98    timers, counters (except for statistics) etc.
99
100    ---init
101
102    initializes newly created qdisc.
103
104    ---destroy
105
106    destroys resources allocated by init and during lifetime of qdisc.
107
108    ---change
109
110    changes qdisc parameters.
111  */
112
/* Protects list of registered TC modules. It is pure SMP lock.
 *
 * In this file it is taken for writing while the qdisc_base list or
 * default_qdisc_ops is modified, and for reading while the list is
 * searched or the default id is copied out.
 */
static DEFINE_RWLOCK(qdisc_mod_lock);
115
116
117 /************************************************
118  *      Queueing disciplines manipulation.      *
119  ************************************************/
120
121
/* The list of all installed queueing disciplines: a singly linked list
 * chained through Qdisc_ops::next, NULL when empty.
 */

static struct Qdisc_ops *qdisc_base;
125
126 /* Register/unregister queueing discipline */
127
128 int register_qdisc(struct Qdisc_ops *qops)
129 {
130         struct Qdisc_ops *q, **qp;
131         int rc = -EEXIST;
132
133         write_lock(&qdisc_mod_lock);
134         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
135                 if (!strcmp(qops->id, q->id))
136                         goto out;
137
138         if (qops->enqueue == NULL)
139                 qops->enqueue = noop_qdisc_ops.enqueue;
140         if (qops->peek == NULL) {
141                 if (qops->dequeue == NULL)
142                         qops->peek = noop_qdisc_ops.peek;
143                 else
144                         goto out_einval;
145         }
146         if (qops->dequeue == NULL)
147                 qops->dequeue = noop_qdisc_ops.dequeue;
148
149         if (qops->cl_ops) {
150                 const struct Qdisc_class_ops *cops = qops->cl_ops;
151
152                 if (!(cops->find && cops->walk && cops->leaf))
153                         goto out_einval;
154
155                 if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
156                         goto out_einval;
157         }
158
159         qops->next = NULL;
160         *qp = qops;
161         rc = 0;
162 out:
163         write_unlock(&qdisc_mod_lock);
164         return rc;
165
166 out_einval:
167         rc = -EINVAL;
168         goto out;
169 }
170 EXPORT_SYMBOL(register_qdisc);
171
172 int unregister_qdisc(struct Qdisc_ops *qops)
173 {
174         struct Qdisc_ops *q, **qp;
175         int err = -ENOENT;
176
177         write_lock(&qdisc_mod_lock);
178         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
179                 if (q == qops)
180                         break;
181         if (q) {
182                 *qp = q->next;
183                 q->next = NULL;
184                 err = 0;
185         }
186         write_unlock(&qdisc_mod_lock);
187         return err;
188 }
189 EXPORT_SYMBOL(unregister_qdisc);
190
191 /* Get default qdisc if not otherwise specified */
192 void qdisc_get_default(char *name, size_t len)
193 {
194         read_lock(&qdisc_mod_lock);
195         strlcpy(name, default_qdisc_ops->id, len);
196         read_unlock(&qdisc_mod_lock);
197 }
198
199 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
200 {
201         struct Qdisc_ops *q = NULL;
202
203         for (q = qdisc_base; q; q = q->next) {
204                 if (!strcmp(name, q->id)) {
205                         if (!try_module_get(q->owner))
206                                 q = NULL;
207                         break;
208                 }
209         }
210
211         return q;
212 }
213
214 /* Set new default qdisc to use */
215 int qdisc_set_default(const char *name)
216 {
217         const struct Qdisc_ops *ops;
218
219         if (!capable(CAP_NET_ADMIN))
220                 return -EPERM;
221
222         write_lock(&qdisc_mod_lock);
223         ops = qdisc_lookup_default(name);
224         if (!ops) {
225                 /* Not found, drop lock and try to load module */
226                 write_unlock(&qdisc_mod_lock);
227                 request_module("sch_%s", name);
228                 write_lock(&qdisc_mod_lock);
229
230                 ops = qdisc_lookup_default(name);
231         }
232
233         if (ops) {
234                 /* Set new default */
235                 module_put(default_qdisc_ops->owner);
236                 default_qdisc_ops = ops;
237         }
238         write_unlock(&qdisc_mod_lock);
239
240         return ops ? 0 : -ENOENT;
241 }
242
#ifdef CONFIG_NET_SCH_DEFAULT
/* Set default value from kernel config.
 *
 * Registered as a late initcall; passes CONFIG_DEFAULT_NET_SCH to
 * qdisc_set_default() and returns that call's result.
 */
static int __init sch_default_qdisc(void)
{
        return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
}
late_initcall(sch_default_qdisc);
#endif
251
252 /* We know handle. Find qdisc among all qdisc's attached to device
253  * (root qdisc, all its children, children of children etc.)
254  * Note: caller either uses rtnl or rcu_read_lock()
255  */
256
257 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
258 {
259         struct Qdisc *q;
260
261         if (!qdisc_dev(root))
262                 return (root->handle == handle ? root : NULL);
263
264         if (!(root->flags & TCQ_F_BUILTIN) &&
265             root->handle == handle)
266                 return root;
267
268         hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) {
269                 if (q->handle == handle)
270                         return q;
271         }
272         return NULL;
273 }
274
275 void qdisc_hash_add(struct Qdisc *q, bool invisible)
276 {
277         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
278                 ASSERT_RTNL();
279                 hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
280                 if (invisible)
281                         q->flags |= TCQ_F_INVISIBLE;
282         }
283 }
284 EXPORT_SYMBOL(qdisc_hash_add);
285
286 void qdisc_hash_del(struct Qdisc *q)
287 {
288         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
289                 ASSERT_RTNL();
290                 hash_del_rcu(&q->hash);
291         }
292 }
293 EXPORT_SYMBOL(qdisc_hash_del);
294
295 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
296 {
297         struct Qdisc *q;
298
299         if (!handle)
300                 return NULL;
301         q = qdisc_match_from_root(dev->qdisc, handle);
302         if (q)
303                 goto out;
304
305         if (dev_ingress_queue(dev))
306                 q = qdisc_match_from_root(
307                         dev_ingress_queue(dev)->qdisc_sleeping,
308                         handle);
309 out:
310         return q;
311 }
312
313 struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
314 {
315         struct netdev_queue *nq;
316         struct Qdisc *q;
317
318         if (!handle)
319                 return NULL;
320         q = qdisc_match_from_root(dev->qdisc, handle);
321         if (q)
322                 goto out;
323
324         nq = dev_ingress_queue_rcu(dev);
325         if (nq)
326                 q = qdisc_match_from_root(nq->qdisc_sleeping, handle);
327 out:
328         return q;
329 }
330
331 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
332 {
333         unsigned long cl;
334         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
335
336         if (cops == NULL)
337                 return NULL;
338         cl = cops->find(p, classid);
339
340         if (cl == 0)
341                 return NULL;
342         return cops->leaf(p, cl);
343 }
344
345 /* Find queueing discipline by name */
346
347 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
348 {
349         struct Qdisc_ops *q = NULL;
350
351         if (kind) {
352                 read_lock(&qdisc_mod_lock);
353                 for (q = qdisc_base; q; q = q->next) {
354                         if (nla_strcmp(kind, q->id) == 0) {
355                                 if (!try_module_get(q->owner))
356                                         q = NULL;
357                                 break;
358                         }
359                 }
360                 read_unlock(&qdisc_mod_lock);
361         }
362         return q;
363 }
364
365 /* The linklayer setting were not transferred from iproute2, in older
366  * versions, and the rate tables lookup systems have been dropped in
367  * the kernel. To keep backward compatible with older iproute2 tc
368  * utils, we detect the linklayer setting by detecting if the rate
369  * table were modified.
370  *
371  * For linklayer ATM table entries, the rate table will be aligned to
372  * 48 bytes, thus some table entries will contain the same value.  The
373  * mpu (min packet unit) is also encoded into the old rate table, thus
374  * starting from the mpu, we find low and high table entries for
375  * mapping this cell.  If these entries contain the same value, when
376  * the rate tables have been modified for linklayer ATM.
377  *
378  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
379  * and then roundup to the next cell, calc the table entry one below,
380  * and compare.
381  */
382 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
383 {
384         int low       = roundup(r->mpu, 48);
385         int high      = roundup(low+1, 48);
386         int cell_low  = low >> r->cell_log;
387         int cell_high = (high >> r->cell_log) - 1;
388
389         /* rtab is too inaccurate at rates > 100Mbit/s */
390         if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
391                 pr_debug("TC linklayer: Giving up ATM detection\n");
392                 return TC_LINKLAYER_ETHERNET;
393         }
394
395         if ((cell_high > cell_low) && (cell_high < 256)
396             && (rtab[cell_low] == rtab[cell_high])) {
397                 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
398                          cell_low, cell_high, rtab[cell_high]);
399                 return TC_LINKLAYER_ATM;
400         }
401         return TC_LINKLAYER_ETHERNET;
402 }
403
/* Head of the singly linked list of shared, reference-counted rate tables;
 * entries are added by qdisc_get_rtab() and removed by qdisc_put_rtab().
 */
static struct qdisc_rate_table *qdisc_rtab_list;
405
406 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
407                                         struct nlattr *tab,
408                                         struct netlink_ext_ack *extack)
409 {
410         struct qdisc_rate_table *rtab;
411
412         if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
413             nla_len(tab) != TC_RTAB_SIZE) {
414                 NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
415                 return NULL;
416         }
417
418         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
419                 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
420                     !memcmp(&rtab->data, nla_data(tab), 1024)) {
421                         rtab->refcnt++;
422                         return rtab;
423                 }
424         }
425
426         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
427         if (rtab) {
428                 rtab->rate = *r;
429                 rtab->refcnt = 1;
430                 memcpy(rtab->data, nla_data(tab), 1024);
431                 if (r->linklayer == TC_LINKLAYER_UNAWARE)
432                         r->linklayer = __detect_linklayer(r, rtab->data);
433                 rtab->next = qdisc_rtab_list;
434                 qdisc_rtab_list = rtab;
435         } else {
436                 NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
437         }
438         return rtab;
439 }
440 EXPORT_SYMBOL(qdisc_get_rtab);
441
442 void qdisc_put_rtab(struct qdisc_rate_table *tab)
443 {
444         struct qdisc_rate_table *rtab, **rtabp;
445
446         if (!tab || --tab->refcnt)
447                 return;
448
449         for (rtabp = &qdisc_rtab_list;
450              (rtab = *rtabp) != NULL;
451              rtabp = &rtab->next) {
452                 if (rtab == tab) {
453                         *rtabp = rtab->next;
454                         kfree(rtab);
455                         return;
456                 }
457         }
458 }
459 EXPORT_SYMBOL(qdisc_put_rtab);
460
/* List of shared, reference-counted size tables; entries are added by
 * qdisc_get_stab() and removed by qdisc_put_stab().
 */
static LIST_HEAD(qdisc_stab_list);
462
/* Netlink policy used by qdisc_get_stab() to parse the nested size table
 * attributes: TCA_STAB_BASE carries a struct tc_sizespec, TCA_STAB_DATA
 * the binary table contents.
 */
static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
        [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
        [TCA_STAB_DATA] = { .type = NLA_BINARY },
};
467
468 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
469                                                struct netlink_ext_ack *extack)
470 {
471         struct nlattr *tb[TCA_STAB_MAX + 1];
472         struct qdisc_size_table *stab;
473         struct tc_sizespec *s;
474         unsigned int tsize = 0;
475         u16 *tab = NULL;
476         int err;
477
478         err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
479                                           extack);
480         if (err < 0)
481                 return ERR_PTR(err);
482         if (!tb[TCA_STAB_BASE]) {
483                 NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
484                 return ERR_PTR(-EINVAL);
485         }
486
487         s = nla_data(tb[TCA_STAB_BASE]);
488
489         if (s->tsize > 0) {
490                 if (!tb[TCA_STAB_DATA]) {
491                         NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
492                         return ERR_PTR(-EINVAL);
493                 }
494                 tab = nla_data(tb[TCA_STAB_DATA]);
495                 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
496         }
497
498         if (tsize != s->tsize || (!tab && tsize > 0)) {
499                 NL_SET_ERR_MSG(extack, "Invalid size of size table");
500                 return ERR_PTR(-EINVAL);
501         }
502
503         list_for_each_entry(stab, &qdisc_stab_list, list) {
504                 if (memcmp(&stab->szopts, s, sizeof(*s)))
505                         continue;
506                 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
507                         continue;
508                 stab->refcnt++;
509                 return stab;
510         }
511
512         stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
513         if (!stab)
514                 return ERR_PTR(-ENOMEM);
515
516         stab->refcnt = 1;
517         stab->szopts = *s;
518         if (tsize > 0)
519                 memcpy(stab->data, tab, tsize * sizeof(u16));
520
521         list_add_tail(&stab->list, &qdisc_stab_list);
522
523         return stab;
524 }
525
526 void qdisc_put_stab(struct qdisc_size_table *tab)
527 {
528         if (!tab)
529                 return;
530
531         if (--tab->refcnt == 0) {
532                 list_del(&tab->list);
533                 kfree_rcu(tab, rcu);
534         }
535 }
536 EXPORT_SYMBOL(qdisc_put_stab);
537
538 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
539 {
540         struct nlattr *nest;
541
542         nest = nla_nest_start_noflag(skb, TCA_STAB);
543         if (nest == NULL)
544                 goto nla_put_failure;
545         if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
546                 goto nla_put_failure;
547         nla_nest_end(skb, nest);
548
549         return skb->len;
550
551 nla_put_failure:
552         return -1;
553 }
554
555 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
556                                const struct qdisc_size_table *stab)
557 {
558         int pkt_len, slot;
559
560         pkt_len = skb->len + stab->szopts.overhead;
561         if (unlikely(!stab->szopts.tsize))
562                 goto out;
563
564         slot = pkt_len + stab->szopts.cell_align;
565         if (unlikely(slot < 0))
566                 slot = 0;
567
568         slot >>= stab->szopts.cell_log;
569         if (likely(slot < stab->szopts.tsize))
570                 pkt_len = stab->data[slot];
571         else
572                 pkt_len = stab->data[stab->szopts.tsize - 1] *
573                                 (slot / stab->szopts.tsize) +
574                                 stab->data[slot % stab->szopts.tsize];
575
576         pkt_len <<= stab->szopts.size_log;
577 out:
578         if (unlikely(pkt_len < 1))
579                 pkt_len = 1;
580         qdisc_skb_cb(skb)->pkt_len = pkt_len;
581 }
582 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
583
584 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
585 {
586         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
587                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
588                         txt, qdisc->ops->id, qdisc->handle >> 16);
589                 qdisc->flags |= TCQ_F_WARN_NONWC;
590         }
591 }
592 EXPORT_SYMBOL(qdisc_warn_nonwc);
593
594 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
595 {
596         struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
597                                                  timer);
598
599         rcu_read_lock();
600         __netif_schedule(qdisc_root(wd->qdisc));
601         rcu_read_unlock();
602
603         return HRTIMER_NORESTART;
604 }
605
606 void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
607                                  clockid_t clockid)
608 {
609         hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
610         wd->timer.function = qdisc_watchdog;
611         wd->qdisc = qdisc;
612 }
613 EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
614
/* Initialise watchdog @wd for @qdisc on CLOCK_MONOTONIC; thin wrapper around
 * qdisc_watchdog_init_clockid().
 */
void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
        qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
}
EXPORT_SYMBOL(qdisc_watchdog_init);
620
621 void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires,
622                                       u64 delta_ns)
623 {
624         if (test_bit(__QDISC_STATE_DEACTIVATED,
625                      &qdisc_root_sleeping(wd->qdisc)->state))
626                 return;
627
628         if (hrtimer_is_queued(&wd->timer)) {
629                 /* If timer is already set in [expires, expires + delta_ns],
630                  * do not reprogram it.
631                  */
632                 if (wd->last_expires - expires <= delta_ns)
633                         return;
634         }
635
636         wd->last_expires = expires;
637         hrtimer_start_range_ns(&wd->timer,
638                                ns_to_ktime(expires),
639                                delta_ns,
640                                HRTIMER_MODE_ABS_PINNED);
641 }
642 EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns);
643
/* Cancel the hrtimer backing watchdog @wd. */
void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
        hrtimer_cancel(&wd->timer);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
649
650 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
651 {
652         struct hlist_head *h;
653         unsigned int i;
654
655         h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
656
657         if (h != NULL) {
658                 for (i = 0; i < n; i++)
659                         INIT_HLIST_HEAD(&h[i]);
660         }
661         return h;
662 }
663
664 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
665 {
666         struct Qdisc_class_common *cl;
667         struct hlist_node *next;
668         struct hlist_head *nhash, *ohash;
669         unsigned int nsize, nmask, osize;
670         unsigned int i, h;
671
672         /* Rehash when load factor exceeds 0.75 */
673         if (clhash->hashelems * 4 <= clhash->hashsize * 3)
674                 return;
675         nsize = clhash->hashsize * 2;
676         nmask = nsize - 1;
677         nhash = qdisc_class_hash_alloc(nsize);
678         if (nhash == NULL)
679                 return;
680
681         ohash = clhash->hash;
682         osize = clhash->hashsize;
683
684         sch_tree_lock(sch);
685         for (i = 0; i < osize; i++) {
686                 hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
687                         h = qdisc_class_hash(cl->classid, nmask);
688                         hlist_add_head(&cl->hnode, &nhash[h]);
689                 }
690         }
691         clhash->hash     = nhash;
692         clhash->hashsize = nsize;
693         clhash->hashmask = nmask;
694         sch_tree_unlock(sch);
695
696         kvfree(ohash);
697 }
698 EXPORT_SYMBOL(qdisc_class_hash_grow);
699
700 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
701 {
702         unsigned int size = 4;
703
704         clhash->hash = qdisc_class_hash_alloc(size);
705         if (!clhash->hash)
706                 return -ENOMEM;
707         clhash->hashsize  = size;
708         clhash->hashmask  = size - 1;
709         clhash->hashelems = 0;
710         return 0;
711 }
712 EXPORT_SYMBOL(qdisc_class_hash_init);
713
/* Free the bucket array of @clhash with kvfree(). The other fields of
 * @clhash are left untouched.
 */
void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
        kvfree(clhash->hash);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);
719
720 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
721                              struct Qdisc_class_common *cl)
722 {
723         unsigned int h;
724
725         INIT_HLIST_NODE(&cl->hnode);
726         h = qdisc_class_hash(cl->classid, clhash->hashmask);
727         hlist_add_head(&cl->hnode, &clhash->hash[h]);
728         clhash->hashelems++;
729 }
730 EXPORT_SYMBOL(qdisc_class_hash_insert);
731
732 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
733                              struct Qdisc_class_common *cl)
734 {
735         hlist_del(&cl->hnode);
736         clhash->hashelems--;
737 }
738 EXPORT_SYMBOL(qdisc_class_hash_remove);
739
740 /* Allocate an unique handle from space managed by kernel
741  * Possible range is [8000-FFFF]:0000 (0x8000 values)
742  */
743 static u32 qdisc_alloc_handle(struct net_device *dev)
744 {
745         int i = 0x8000;
746         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
747
748         do {
749                 autohandle += TC_H_MAKE(0x10000U, 0);
750                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
751                         autohandle = TC_H_MAKE(0x80000000U, 0);
752                 if (!qdisc_lookup(dev, autohandle))
753                         return autohandle;
754                 cond_resched();
755         } while (--i > 0);
756
757         return 0;
758 }
759
/**
 * qdisc_tree_reduce_backlog - propagate a queue shrink to all ancestors
 * @sch: qdisc whose queue shrank
 * @n: packet count subtracted from each ancestor's q.qlen; max(n, 0) is
 *     also passed to __qdisc_qstats_drop() for each ancestor
 * @len: byte count subtracted from each ancestor's qstats.backlog
 *
 * Walks up the parent chain under rcu_read_lock(). The walk ends when the
 * current qdisc has no parent, the parent is the ingress handle, the qdisc
 * is flagged TCQ_F_NOPARENT, or the parent cannot be looked up.
 */
void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
{
        bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
        const struct Qdisc_class_ops *cops;
        unsigned long cl;
        u32 parentid;
        bool notify;
        int drops;

        if (n == 0 && len == 0)
                return;
        drops = max_t(int, n, 0);
        rcu_read_lock();
        while ((parentid = sch->parent)) {
                if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
                        break;

                if (sch->flags & TCQ_F_NOPARENT)
                        break;
                /* Notify parent qdisc only if child qdisc becomes empty.
                 *
                 * If child was empty even before update then backlog
                 * counter is screwed and we skip notification because
                 * parent class is already passive.
                 *
                 * If the original child was offloaded then it is allowed
                 * to be seen as empty, so the parent is notified anyway.
                 */
                notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
                                                       !qdisc_is_offloaded);
                /* TODO: perform the search on a per txq basis */
                sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
                if (sch == NULL) {
                        /* Only a root parent is expected to be missing. */
                        WARN_ON_ONCE(parentid != TC_H_ROOT);
                        break;
                }
                /* From here on, sch refers to the parent qdisc. */
                cops = sch->ops->cl_ops;
                if (notify && cops->qlen_notify) {
                        cl = cops->find(sch, parentid);
                        cops->qlen_notify(sch, cl);
                }
                sch->q.qlen -= n;
                sch->qstats.backlog -= len;
                __qdisc_qstats_drop(sch, drops);
        }
        rcu_read_unlock();
}
EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
808
809 int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
810                               void *type_data)
811 {
812         struct net_device *dev = qdisc_dev(sch);
813         int err;
814
815         sch->flags &= ~TCQ_F_OFFLOADED;
816         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
817                 return 0;
818
819         err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
820         if (err == -EOPNOTSUPP)
821                 return 0;
822
823         if (!err)
824                 sch->flags |= TCQ_F_OFFLOADED;
825
826         return err;
827 }
828 EXPORT_SYMBOL(qdisc_offload_dump_helper);
829
830 void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
831                                 struct Qdisc *new, struct Qdisc *old,
832                                 enum tc_setup_type type, void *type_data,
833                                 struct netlink_ext_ack *extack)
834 {
835         bool any_qdisc_is_offloaded;
836         int err;
837
838         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
839                 return;
840
841         err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
842
843         /* Don't report error if the graft is part of destroy operation. */
844         if (!err || !new || new == &noop_qdisc)
845                 return;
846
847         /* Don't report error if the parent, the old child and the new
848          * one are not offloaded.
849          */
850         any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
851         any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
852         any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;
853
854         if (any_qdisc_is_offloaded)
855                 NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
856 }
857 EXPORT_SYMBOL(qdisc_offload_graft_helper);
858
859 static void qdisc_offload_graft_root(struct net_device *dev,
860                                      struct Qdisc *new, struct Qdisc *old,
861                                      struct netlink_ext_ack *extack)
862 {
863         struct tc_root_qopt_offload graft_offload = {
864                 .command        = TC_ROOT_GRAFT,
865                 .handle         = new ? new->handle : 0,
866                 .ingress        = (new && new->flags & TCQ_F_INGRESS) ||
867                                   (old && old->flags & TCQ_F_INGRESS),
868         };
869
870         qdisc_offload_graft_helper(dev, NULL, new, old,
871                                    TC_SETUP_ROOT_QDISC, &graft_offload, extack);
872 }
873
/* Append one netlink message describing qdisc @q to @skb.
 *
 * @clid is reported as tcm_parent; @portid, @seq, @flags and @event go into
 * the netlink header. The message carries the kind, optional ingress/egress
 * block indices, the qdisc's own dump, the hardware offload flag, the size
 * table (if any) and the statistics.
 *
 * Returns skb->len on success. On any failure the partially written message
 * is trimmed from @skb and -1 is returned.
 */
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
                         u32 portid, u32 seq, u16 flags, int event)
{
        struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
        struct gnet_stats_queue __percpu *cpu_qstats = NULL;
        struct tcmsg *tcm;
        struct nlmsghdr  *nlh;
        /* Remember where this message starts so it can be trimmed or sized. */
        unsigned char *b = skb_tail_pointer(skb);
        struct gnet_dump d;
        struct qdisc_size_table *stab;
        u32 block_index;
        __u32 qlen;

        cond_resched();
        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
        if (!nlh)
                goto out_nlmsg_trim;
        tcm = nlmsg_data(nlh);
        tcm->tcm_family = AF_UNSPEC;
        tcm->tcm__pad1 = 0;
        tcm->tcm__pad2 = 0;
        tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
        tcm->tcm_parent = clid;
        tcm->tcm_handle = q->handle;
        /* tcm_info carries the current reference count of the qdisc. */
        tcm->tcm_info = refcount_read(&q->refcnt);
        if (nla_put_string(skb, TCA_KIND, q->ops->id))
                goto nla_put_failure;
        /* Block indices are only emitted when non-zero. */
        if (q->ops->ingress_block_get) {
                block_index = q->ops->ingress_block_get(q);
                if (block_index &&
                    nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
                        goto nla_put_failure;
        }
        if (q->ops->egress_block_get) {
                block_index = q->ops->egress_block_get(q);
                if (block_index &&
                    nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
                        goto nla_put_failure;
        }
        if (q->ops->dump && q->ops->dump(q, skb) < 0)
                goto nla_put_failure;
        if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
                goto nla_put_failure;
        qlen = qdisc_qlen_sum(q);

        stab = rtnl_dereference(q->stab);
        if (stab && qdisc_dump_stab(skb, stab) < 0)
                goto nla_put_failure;

        if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
                                         NULL, &d, TCA_PAD) < 0)
                goto nla_put_failure;

        if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
                goto nla_put_failure;

        /* Per-cpu counters are passed only for qdiscs that keep them. */
        if (qdisc_is_percpu_stats(q)) {
                cpu_bstats = q->cpu_bstats;
                cpu_qstats = q->cpu_qstats;
        }

        if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
                                  &d, cpu_bstats, &q->bstats) < 0 ||
            gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
            gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
                goto nla_put_failure;

        if (gnet_stats_finish_copy(&d) < 0)
                goto nla_put_failure;

        /* Final message length: everything written since b. */
        nlh->nlmsg_len = skb_tail_pointer(skb) - b;
        return skb->len;

out_nlmsg_trim:
nla_put_failure:
        nlmsg_trim(skb, b);
        return -1;
}
952
953 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
954 {
955         if (q->flags & TCQ_F_BUILTIN)
956                 return true;
957         if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
958                 return true;
959
960         return false;
961 }
962
963 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
964                         struct nlmsghdr *n, u32 clid,
965                         struct Qdisc *old, struct Qdisc *new)
966 {
967         struct sk_buff *skb;
968         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
969
970         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
971         if (!skb)
972                 return -ENOBUFS;
973
974         if (old && !tc_qdisc_dump_ignore(old, false)) {
975                 if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
976                                   0, RTM_DELQDISC) < 0)
977                         goto err_out;
978         }
979         if (new && !tc_qdisc_dump_ignore(new, false)) {
980                 if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
981                                   old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
982                         goto err_out;
983         }
984
985         if (skb->len)
986                 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
987                                       n->nlmsg_flags & NLM_F_ECHO);
988
989 err_out:
990         kfree_skb(skb);
991         return -EINVAL;
992 }
993
994 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
995                                struct nlmsghdr *n, u32 clid,
996                                struct Qdisc *old, struct Qdisc *new)
997 {
998         if (new || old)
999                 qdisc_notify(net, skb, n, clid, old, new);
1000
1001         if (old)
1002                 qdisc_put(old);
1003 }
1004
1005 static void qdisc_clear_nolock(struct Qdisc *sch)
1006 {
1007         sch->flags &= ~TCQ_F_NOLOCK;
1008         if (!(sch->flags & TCQ_F_CPUSTATS))
1009                 return;
1010
1011         free_percpu(sch->cpu_bstats);
1012         free_percpu(sch->cpu_qstats);
1013         sch->cpu_bstats = NULL;
1014         sch->cpu_qstats = NULL;
1015         sch->flags &= ~TCQ_F_CPUSTATS;
1016 }
1017
1018 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
1019  * to device "dev".
1020  *
1021  * When appropriate send a netlink notification using 'skb'
1022  * and "n".
1023  *
1024  * On success, destroy old qdisc.
1025  */
1026
1027 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
1028                        struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
1029                        struct Qdisc *new, struct Qdisc *old,
1030                        struct netlink_ext_ack *extack)
1031 {
1032         struct Qdisc *q = old;
1033         struct net *net = dev_net(dev);
1034
1035         if (parent == NULL) {
1036                 unsigned int i, num_q, ingress;
1037
1038                 ingress = 0;
1039                 num_q = dev->num_tx_queues;
1040                 if ((q && q->flags & TCQ_F_INGRESS) ||
1041                     (new && new->flags & TCQ_F_INGRESS)) {
1042                         num_q = 1;
1043                         ingress = 1;
1044                         if (!dev_ingress_queue(dev)) {
1045                                 NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
1046                                 return -ENOENT;
1047                         }
1048                 }
1049
1050                 if (dev->flags & IFF_UP)
1051                         dev_deactivate(dev);
1052
1053                 qdisc_offload_graft_root(dev, new, old, extack);
1054
1055                 if (new && new->ops->attach)
1056                         goto skip;
1057
1058                 for (i = 0; i < num_q; i++) {
1059                         struct netdev_queue *dev_queue = dev_ingress_queue(dev);
1060
1061                         if (!ingress)
1062                                 dev_queue = netdev_get_tx_queue(dev, i);
1063
1064                         old = dev_graft_qdisc(dev_queue, new);
1065                         if (new && i > 0)
1066                                 qdisc_refcount_inc(new);
1067
1068                         if (!ingress)
1069                                 qdisc_put(old);
1070                 }
1071
1072 skip:
1073                 if (!ingress) {
1074                         notify_and_destroy(net, skb, n, classid,
1075                                            dev->qdisc, new);
1076                         if (new && !new->ops->attach)
1077                                 qdisc_refcount_inc(new);
1078                         dev->qdisc = new ? : &noop_qdisc;
1079
1080                         if (new && new->ops->attach)
1081                                 new->ops->attach(new);
1082                 } else {
1083                         notify_and_destroy(net, skb, n, classid, old, new);
1084                 }
1085
1086                 if (dev->flags & IFF_UP)
1087                         dev_activate(dev);
1088         } else {
1089                 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
1090                 unsigned long cl;
1091                 int err;
1092
1093                 /* Only support running class lockless if parent is lockless */
1094                 if (new && (new->flags & TCQ_F_NOLOCK) &&
1095                     parent && !(parent->flags & TCQ_F_NOLOCK))
1096                         qdisc_clear_nolock(new);
1097
1098                 if (!cops || !cops->graft)
1099                         return -EOPNOTSUPP;
1100
1101                 cl = cops->find(parent, classid);
1102                 if (!cl) {
1103                         NL_SET_ERR_MSG(extack, "Specified class not found");
1104                         return -ENOENT;
1105                 }
1106
1107                 err = cops->graft(parent, cl, new, &old, extack);
1108                 if (err)
1109                         return err;
1110                 notify_and_destroy(net, skb, n, classid, old, new);
1111         }
1112         return 0;
1113 }
1114
1115 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1116                                    struct netlink_ext_ack *extack)
1117 {
1118         u32 block_index;
1119
1120         if (tca[TCA_INGRESS_BLOCK]) {
1121                 block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1122
1123                 if (!block_index) {
1124                         NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1125                         return -EINVAL;
1126                 }
1127                 if (!sch->ops->ingress_block_set) {
1128                         NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1129                         return -EOPNOTSUPP;
1130                 }
1131                 sch->ops->ingress_block_set(sch, block_index);
1132         }
1133         if (tca[TCA_EGRESS_BLOCK]) {
1134                 block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1135
1136                 if (!block_index) {
1137                         NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1138                         return -EINVAL;
1139                 }
1140                 if (!sch->ops->egress_block_set) {
1141                         NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1142                         return -EOPNOTSUPP;
1143                 }
1144                 sch->ops->egress_block_set(sch, block_index);
1145         }
1146         return 0;
1147 }
1148
/*
 * Allocate and initialize new qdisc.
 *
 * The qdisc kind is taken from tca[TCA_KIND]; kind-specific parameters are
 * passed to ops->init() via tca[TCA_OPTIONS].  Optional TCA_STAB and
 * TCA_RATE attributes attach a size table and a rate estimator.
 *
 * Returns the new qdisc (already added to the qdisc hash) on success.
 * On failure returns NULL and stores a negative errno in *errp.  -EAGAIN
 * means a module providing the kind was loaded while RTNL was dropped and
 * the caller has to replay the request.
 */

static struct Qdisc *qdisc_create(struct net_device *dev,
				  struct netdev_queue *dev_queue,
				  struct Qdisc *p, u32 parent, u32 handle,
				  struct nlattr **tca, int *errp,
				  struct netlink_ext_ack *extack)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		/* Only try a module load if the kind fits into name[]. */
		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load.  So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request.  We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the mean time.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try again qdisc_lookup_ops,
				 * so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (!ops) {
		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
		goto err_out;
	}

	sch = qdisc_alloc(dev_queue, ops, extack);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	/* Pick the handle: fixed for ingress, caller-supplied or freshly
	 * allocated otherwise.
	 */
	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
	} else {
		if (handle == 0) {
			handle = qdisc_alloc_handle(dev);
			if (handle == 0) {
				NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
				err = -ENOSPC;
				goto err_out3;
			}
		}
		if (!netif_is_multiqueue(dev))
			sch->flags |= TCQ_F_ONETXQUEUE;
	}

	sch->handle = handle;

	/* This exist to keep backward compatible with a userspace
	 * loophole, what allowed userspace to get IFF_NO_QUEUE
	 * facility on older kernels by setting tx_queue_len=0 (prior
	 * to qdisc init), and then forgot to reinit tx_queue_len
	 * before again attaching a qdisc.
	 */
	if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
		netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
	}

	/* ->init() has not run yet, so a failure here skips ->destroy(). */
	err = qdisc_block_indexes_set(sch, tca, extack);
	if (err)
		goto err_out3;

	if (ops->init) {
		err = ops->init(sch, tca[TCA_OPTIONS], extack);
		if (err != 0)
			goto err_out5;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB], extack);
		if (IS_ERR(stab)) {
			err = PTR_ERR(stab);
			goto err_out4;
		}
		rcu_assign_pointer(sch->stab, stab);
	}
	if (tca[TCA_RATE]) {
		seqcount_t *running;

		err = -EOPNOTSUPP;
		if (sch->flags & TCQ_F_MQROOT) {
			NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
			goto err_out4;
		}

		/* Use the root's running seqcount for non-root, non-ingress
		 * qdiscs whose parent is not a multi-queue root; otherwise
		 * the qdisc's own.
		 */
		if (sch->parent != TC_H_ROOT &&
		    !(sch->flags & TCQ_F_INGRESS) &&
		    (!p || !(p->flags & TCQ_F_MQROOT)))
			running = qdisc_root_sleeping_running(sch);
		else
			running = &sch->running;

		err = gen_new_estimator(&sch->bstats,
					sch->cpu_bstats,
					&sch->rate_est,
					NULL,
					running,
					tca[TCA_RATE]);
		if (err) {
			NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
			goto err_out4;
		}
	}

	qdisc_hash_add(sch, false);

	return sch;

	/* Unwind labels: each one falls through to the next. */
err_out5:
	/* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
	if (ops->destroy)
		ops->destroy(sch);
err_out3:
	dev_put(dev);
	qdisc_free(sch);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;

	/* Failure after a successful ->init(): also release the size table. */
err_out4:
	/*
	 * Any broken qdiscs that would require a ops->reset() here?
	 * The qdisc was never in action so it shouldn't be necessary.
	 */
	qdisc_put_stab(rtnl_dereference(sch->stab));
	if (ops->destroy)
		ops->destroy(sch);
	goto err_out3;
}
1312
1313 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
1314                         struct netlink_ext_ack *extack)
1315 {
1316         struct qdisc_size_table *ostab, *stab = NULL;
1317         int err = 0;
1318
1319         if (tca[TCA_OPTIONS]) {
1320                 if (!sch->ops->change) {
1321                         NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
1322                         return -EINVAL;
1323                 }
1324                 if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1325                         NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
1326                         return -EOPNOTSUPP;
1327                 }
1328                 err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
1329                 if (err)
1330                         return err;
1331         }
1332
1333         if (tca[TCA_STAB]) {
1334                 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1335                 if (IS_ERR(stab))
1336                         return PTR_ERR(stab);
1337         }
1338
1339         ostab = rtnl_dereference(sch->stab);
1340         rcu_assign_pointer(sch->stab, stab);
1341         qdisc_put_stab(ostab);
1342
1343         if (tca[TCA_RATE]) {
1344                 /* NB: ignores errors from replace_estimator
1345                    because change can't be undone. */
1346                 if (sch->flags & TCQ_F_MQROOT)
1347                         goto out;
1348                 gen_replace_estimator(&sch->bstats,
1349                                       sch->cpu_bstats,
1350                                       &sch->rate_est,
1351                                       NULL,
1352                                       qdisc_root_sleeping_running(sch),
1353                                       tca[TCA_RATE]);
1354         }
1355 out:
1356         return 0;
1357 }
1358
1359 struct check_loop_arg {
1360         struct qdisc_walker     w;
1361         struct Qdisc            *p;
1362         int                     depth;
1363 };
1364
1365 static int check_loop_fn(struct Qdisc *q, unsigned long cl,
1366                          struct qdisc_walker *w);
1367
1368 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1369 {
1370         struct check_loop_arg   arg;
1371
1372         if (q->ops->cl_ops == NULL)
1373                 return 0;
1374
1375         arg.w.stop = arg.w.skip = arg.w.count = 0;
1376         arg.w.fn = check_loop_fn;
1377         arg.depth = depth;
1378         arg.p = p;
1379         q->ops->cl_ops->walk(q, &arg.w);
1380         return arg.w.stop ? -ELOOP : 0;
1381 }
1382
1383 static int
1384 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1385 {
1386         struct Qdisc *leaf;
1387         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1388         struct check_loop_arg *arg = (struct check_loop_arg *)w;
1389
1390         leaf = cops->leaf(q, cl);
1391         if (leaf) {
1392                 if (leaf == arg->p || arg->depth > 7)
1393                         return -ELOOP;
1394                 return check_loop(leaf, arg->p, arg->depth + 1);
1395         }
1396         return 0;
1397 }
1398
1399 const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
1400         [TCA_KIND]              = { .type = NLA_STRING },
1401         [TCA_RATE]              = { .type = NLA_BINARY,
1402                                     .len = sizeof(struct tc_estimator) },
1403         [TCA_STAB]              = { .type = NLA_NESTED },
1404         [TCA_DUMP_INVISIBLE]    = { .type = NLA_FLAG },
1405         [TCA_CHAIN]             = { .type = NLA_U32 },
1406         [TCA_INGRESS_BLOCK]     = { .type = NLA_U32 },
1407         [TCA_EGRESS_BLOCK]      = { .type = NLA_U32 },
1408 };
1409
1410 /*
1411  * Delete/get qdisc.
1412  */
1413
1414 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1415                         struct netlink_ext_ack *extack)
1416 {
1417         struct net *net = sock_net(skb->sk);
1418         struct tcmsg *tcm = nlmsg_data(n);
1419         struct nlattr *tca[TCA_MAX + 1];
1420         struct net_device *dev;
1421         u32 clid;
1422         struct Qdisc *q = NULL;
1423         struct Qdisc *p = NULL;
1424         int err;
1425
1426         if ((n->nlmsg_type != RTM_GETQDISC) &&
1427             !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1428                 return -EPERM;
1429
1430         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1431                                      rtm_tca_policy, extack);
1432         if (err < 0)
1433                 return err;
1434
1435         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1436         if (!dev)
1437                 return -ENODEV;
1438
1439         clid = tcm->tcm_parent;
1440         if (clid) {
1441                 if (clid != TC_H_ROOT) {
1442                         if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1443                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1444                                 if (!p) {
1445                                         NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
1446                                         return -ENOENT;
1447                                 }
1448                                 q = qdisc_leaf(p, clid);
1449                         } else if (dev_ingress_queue(dev)) {
1450                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1451                         }
1452                 } else {
1453                         q = dev->qdisc;
1454                 }
1455                 if (!q) {
1456                         NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
1457                         return -ENOENT;
1458                 }
1459
1460                 if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
1461                         NL_SET_ERR_MSG(extack, "Invalid handle");
1462                         return -EINVAL;
1463                 }
1464         } else {
1465                 q = qdisc_lookup(dev, tcm->tcm_handle);
1466                 if (!q) {
1467                         NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
1468                         return -ENOENT;
1469                 }
1470         }
1471
1472         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1473                 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1474                 return -EINVAL;
1475         }
1476
1477         if (n->nlmsg_type == RTM_DELQDISC) {
1478                 if (!clid) {
1479                         NL_SET_ERR_MSG(extack, "Classid cannot be zero");
1480                         return -EINVAL;
1481                 }
1482                 if (q->handle == 0) {
1483                         NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
1484                         return -ENOENT;
1485                 }
1486                 err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
1487                 if (err != 0)
1488                         return err;
1489         } else {
1490                 qdisc_notify(net, skb, n, clid, NULL, q);
1491         }
1492         return 0;
1493 }
1494
/*
 * Create/change qdisc.
 *
 * Requires CAP_NET_ADMIN.  Depending on tcm_parent, tcm_handle and the
 * NLM_F_* flags of the request this either
 *   - changes the parameters of an existing qdisc (qdisc_change()),
 *   - grafts an already existing qdisc at the requested parent, or
 *   - creates a new qdisc and grafts it.
 *
 * Returns 0 on success or a negative errno.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
			   struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	/* Re-entered from below when qdisc_create() reports -EAGAIN. */
replay:
	/* Reinit, just in case something touches this. */
	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
				     rtm_tca_policy, extack);
	if (err < 0)
		return err;

	tcm = nlmsg_data(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;


	if (clid) {
		/* Find the qdisc currently attached at the requested parent. */
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p) {
					NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
					return -ENOENT;
				}
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue_create(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}

		/* It may be default qdisc, ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
					NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
					return -EEXIST;
				}
				if (TC_H_MIN(tcm->tcm_handle)) {
					NL_SET_ERR_MSG(extack, "Invalid minor handle");
					return -EINVAL;
				}
				/* Does a qdisc with the requested handle exist
				 * elsewhere on this device?
				 */
				q = qdisc_lookup(dev, tcm->tcm_handle);
				if (!q)
					goto create_n_graft;
				if (n->nlmsg_flags & NLM_F_EXCL) {
					NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
					return -EEXIST;
				}
				if (tca[TCA_KIND] &&
				    nla_strcmp(tca[TCA_KIND], q->ops->id)) {
					NL_SET_ERR_MSG(extack, "Invalid qdisc name");
					return -EINVAL;
				}
				if (q == p ||
				    (p && check_loop(q, p, 0))) {
					NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
					return -ELOOP;
				}
				/* Reference for the new attachment point; it is
				 * dropped again below if the graft fails.
				 */
				qdisc_refcount_inc(q);
				goto graft;
			} else {
				if (!q)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know, that some child q is already
				 *   attached to this parent and have choice:
				 *   either to change it or to create/graft new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if CREATE and REPLACE flags are set.
				 *
				 *   2. If EXCL is set, requestor wanted to say,
				 *   that qdisc tcm_handle is not expected
				 *   to exist, so that we choose create/graft too.
				 *
				 *   3. The last case is when no flags are set.
				 *   Alas, it is sort of hole in API, we
				 *   cannot decide what to do unambiguously.
				 *   For now we select create/graft, if
				 *   user gave KIND, which does not match existing.
				 */
				if ((n->nlmsg_flags & NLM_F_CREATE) &&
				    (n->nlmsg_flags & NLM_F_REPLACE) &&
				    ((n->nlmsg_flags & NLM_F_EXCL) ||
				     (tca[TCA_KIND] &&
				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		/* No parent given: the qdisc must be identified by handle. */
		if (!tcm->tcm_handle) {
			NL_SET_ERR_MSG(extack, "Handle cannot be zero");
			return -EINVAL;
		}
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (!q) {
		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
		return -ENOENT;
	}
	if (n->nlmsg_flags & NLM_F_EXCL) {
		NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
		return -EEXIST;
	}
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
		return -EINVAL;
	}
	err = qdisc_change(q, tca, extack);
	if (err == 0)
		qdisc_notify(net, skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags & NLM_F_CREATE)) {
		NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
		return -ENOENT;
	}
	if (clid == TC_H_INGRESS) {
		if (dev_ingress_queue(dev)) {
			/* For ingress the parent value doubles as the handle. */
			q = qdisc_create(dev, dev_ingress_queue(dev), p,
					 tcm->tcm_parent, tcm->tcm_parent,
					 tca, &err, extack);
		} else {
			NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
			err = -ENOENT;
		}
	} else {
		struct netdev_queue *dev_queue;

		/* Choose the TX queue the new qdisc is allocated against:
		 * the parent's ->select_queue() if it has one, else the
		 * parent's own queue, else TX queue 0.
		 */
		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
		else if (p)
			dev_queue = p->dev_queue;
		else
			dev_queue = netdev_get_tx_queue(dev, 0);

		q = qdisc_create(dev, dev_queue, p,
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err, extack);
	}
	if (q == NULL) {
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
	if (err) {
		/* Drop the reference held on q for this graft attempt. */
		if (q)
			qdisc_put(q);
		return err;
	}

	return 0;
}
1679
1680 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1681                               struct netlink_callback *cb,
1682                               int *q_idx_p, int s_q_idx, bool recur,
1683                               bool dump_invisible)
1684 {
1685         int ret = 0, q_idx = *q_idx_p;
1686         struct Qdisc *q;
1687         int b;
1688
1689         if (!root)
1690                 return 0;
1691
1692         q = root;
1693         if (q_idx < s_q_idx) {
1694                 q_idx++;
1695         } else {
1696                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1697                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1698                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1699                                   RTM_NEWQDISC) <= 0)
1700                         goto done;
1701                 q_idx++;
1702         }
1703
1704         /* If dumping singletons, there is no qdisc_dev(root) and the singleton
1705          * itself has already been dumped.
1706          *
1707          * If we've already dumped the top-level (ingress) qdisc above and the global
1708          * qdisc hashtable, we don't want to hit it again
1709          */
1710         if (!qdisc_dev(root) || !recur)
1711                 goto out;
1712
1713         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1714                 if (q_idx < s_q_idx) {
1715                         q_idx++;
1716                         continue;
1717                 }
1718                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1719                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1720                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1721                                   RTM_NEWQDISC) <= 0)
1722                         goto done;
1723                 q_idx++;
1724         }
1725
1726 out:
1727         *q_idx_p = q_idx;
1728         return ret;
1729 done:
1730         ret = -1;
1731         goto out;
1732 }
1733
1734 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1735 {
1736         struct net *net = sock_net(skb->sk);
1737         int idx, q_idx;
1738         int s_idx, s_q_idx;
1739         struct net_device *dev;
1740         const struct nlmsghdr *nlh = cb->nlh;
1741         struct nlattr *tca[TCA_MAX + 1];
1742         int err;
1743
1744         s_idx = cb->args[0];
1745         s_q_idx = q_idx = cb->args[1];
1746
1747         idx = 0;
1748         ASSERT_RTNL();
1749
1750         err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
1751                                      rtm_tca_policy, cb->extack);
1752         if (err < 0)
1753                 return err;
1754
1755         for_each_netdev(net, dev) {
1756                 struct netdev_queue *dev_queue;
1757
1758                 if (idx < s_idx)
1759                         goto cont;
1760                 if (idx > s_idx)
1761                         s_q_idx = 0;
1762                 q_idx = 0;
1763
1764                 if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx,
1765                                        true, tca[TCA_DUMP_INVISIBLE]) < 0)
1766                         goto done;
1767
1768                 dev_queue = dev_ingress_queue(dev);
1769                 if (dev_queue &&
1770                     tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1771                                        &q_idx, s_q_idx, false,
1772                                        tca[TCA_DUMP_INVISIBLE]) < 0)
1773                         goto done;
1774
1775 cont:
1776                 idx++;
1777         }
1778
1779 done:
1780         cb->args[0] = idx;
1781         cb->args[1] = q_idx;
1782
1783         return skb->len;
1784 }
1785
1786
1787
1788 /************************************************
1789  *      Traffic classes manipulation.           *
1790  ************************************************/
1791
1792 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1793                           unsigned long cl,
1794                           u32 portid, u32 seq, u16 flags, int event)
1795 {
1796         struct tcmsg *tcm;
1797         struct nlmsghdr  *nlh;
1798         unsigned char *b = skb_tail_pointer(skb);
1799         struct gnet_dump d;
1800         const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1801
1802         cond_resched();
1803         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1804         if (!nlh)
1805                 goto out_nlmsg_trim;
1806         tcm = nlmsg_data(nlh);
1807         tcm->tcm_family = AF_UNSPEC;
1808         tcm->tcm__pad1 = 0;
1809         tcm->tcm__pad2 = 0;
1810         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1811         tcm->tcm_parent = q->handle;
1812         tcm->tcm_handle = q->handle;
1813         tcm->tcm_info = 0;
1814         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1815                 goto nla_put_failure;
1816         if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1817                 goto nla_put_failure;
1818
1819         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1820                                          NULL, &d, TCA_PAD) < 0)
1821                 goto nla_put_failure;
1822
1823         if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1824                 goto nla_put_failure;
1825
1826         if (gnet_stats_finish_copy(&d) < 0)
1827                 goto nla_put_failure;
1828
1829         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1830         return skb->len;
1831
1832 out_nlmsg_trim:
1833 nla_put_failure:
1834         nlmsg_trim(skb, b);
1835         return -1;
1836 }
1837
1838 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1839                          struct nlmsghdr *n, struct Qdisc *q,
1840                          unsigned long cl, int event)
1841 {
1842         struct sk_buff *skb;
1843         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1844         int err = 0;
1845
1846         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1847         if (!skb)
1848                 return -ENOBUFS;
1849
1850         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1851                 kfree_skb(skb);
1852                 return -EINVAL;
1853         }
1854
1855         err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1856                              n->nlmsg_flags & NLM_F_ECHO);
1857         if (err > 0)
1858                 err = 0;
1859         return err;
1860 }
1861
1862 static int tclass_del_notify(struct net *net,
1863                              const struct Qdisc_class_ops *cops,
1864                              struct sk_buff *oskb, struct nlmsghdr *n,
1865                              struct Qdisc *q, unsigned long cl)
1866 {
1867         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1868         struct sk_buff *skb;
1869         int err = 0;
1870
1871         if (!cops->delete)
1872                 return -EOPNOTSUPP;
1873
1874         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1875         if (!skb)
1876                 return -ENOBUFS;
1877
1878         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
1879                            RTM_DELTCLASS) < 0) {
1880                 kfree_skb(skb);
1881                 return -EINVAL;
1882         }
1883
1884         err = cops->delete(q, cl);
1885         if (err) {
1886                 kfree_skb(skb);
1887                 return err;
1888         }
1889
1890         err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1891                              n->nlmsg_flags & NLM_F_ECHO);
1892         if (err > 0)
1893                 err = 0;
1894         return err;
1895 }
1896
1897 #ifdef CONFIG_NET_CLS
1898
1899 struct tcf_bind_args {
1900         struct tcf_walker w;
1901         unsigned long base;
1902         unsigned long cl;
1903         u32 classid;
1904 };
1905
1906 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
1907 {
1908         struct tcf_bind_args *a = (void *)arg;
1909
1910         if (tp->ops->bind_class) {
1911                 struct Qdisc *q = tcf_block_q(tp->chain->block);
1912
1913                 sch_tree_lock(q);
1914                 tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
1915                 sch_tree_unlock(q);
1916         }
1917         return 0;
1918 }
1919
1920 struct tc_bind_class_args {
1921         struct qdisc_walker w;
1922         unsigned long new_cl;
1923         u32 portid;
1924         u32 clid;
1925 };
1926
1927 static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl,
1928                                 struct qdisc_walker *w)
1929 {
1930         struct tc_bind_class_args *a = (struct tc_bind_class_args *)w;
1931         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1932         struct tcf_block *block;
1933         struct tcf_chain *chain;
1934
1935         block = cops->tcf_block(q, cl, NULL);
1936         if (!block)
1937                 return 0;
1938         for (chain = tcf_get_next_chain(block, NULL);
1939              chain;
1940              chain = tcf_get_next_chain(block, chain)) {
1941                 struct tcf_proto *tp;
1942
1943                 for (tp = tcf_get_next_proto(chain, NULL, true);
1944                      tp; tp = tcf_get_next_proto(chain, tp, true)) {
1945                         struct tcf_bind_args arg = {};
1946
1947                         arg.w.fn = tcf_node_bind;
1948                         arg.classid = a->clid;
1949                         arg.base = cl;
1950                         arg.cl = a->new_cl;
1951                         tp->ops->walk(tp, &arg.w, true);
1952                 }
1953         }
1954
1955         return 0;
1956 }
1957
1958 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1959                            unsigned long new_cl)
1960 {
1961         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1962         struct tc_bind_class_args args = {};
1963
1964         if (!cops->tcf_block)
1965                 return;
1966         args.portid = portid;
1967         args.clid = clid;
1968         args.new_cl = new_cl;
1969         args.w.fn = tc_bind_class_walker;
1970         q->ops->cl_ops->walk(q, &args.w);
1971 }
1972
1973 #else
1974
1975 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1976                            unsigned long new_cl)
1977 {
1978 }
1979
1980 #endif
1981
1982 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
1983                          struct netlink_ext_ack *extack)
1984 {
1985         struct net *net = sock_net(skb->sk);
1986         struct tcmsg *tcm = nlmsg_data(n);
1987         struct nlattr *tca[TCA_MAX + 1];
1988         struct net_device *dev;
1989         struct Qdisc *q = NULL;
1990         const struct Qdisc_class_ops *cops;
1991         unsigned long cl = 0;
1992         unsigned long new_cl;
1993         u32 portid;
1994         u32 clid;
1995         u32 qid;
1996         int err;
1997
1998         if ((n->nlmsg_type != RTM_GETTCLASS) &&
1999             !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
2000                 return -EPERM;
2001
2002         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
2003                                      rtm_tca_policy, extack);
2004         if (err < 0)
2005                 return err;
2006
2007         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
2008         if (!dev)
2009                 return -ENODEV;
2010
2011         /*
2012            parent == TC_H_UNSPEC - unspecified parent.
2013            parent == TC_H_ROOT   - class is root, which has no parent.
2014            parent == X:0         - parent is root class.
2015            parent == X:Y         - parent is a node in hierarchy.
2016            parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
2017
2018            handle == 0:0         - generate handle from kernel pool.
2019            handle == 0:Y         - class is X:Y, where X:0 is qdisc.
2020            handle == X:Y         - clear.
2021            handle == X:0         - root class.
2022          */
2023
2024         /* Step 1. Determine qdisc handle X:0 */
2025
2026         portid = tcm->tcm_parent;
2027         clid = tcm->tcm_handle;
2028         qid = TC_H_MAJ(clid);
2029
2030         if (portid != TC_H_ROOT) {
2031                 u32 qid1 = TC_H_MAJ(portid);
2032
2033                 if (qid && qid1) {
2034                         /* If both majors are known, they must be identical. */
2035                         if (qid != qid1)
2036                                 return -EINVAL;
2037                 } else if (qid1) {
2038                         qid = qid1;
2039                 } else if (qid == 0)
2040                         qid = dev->qdisc->handle;
2041
2042                 /* Now qid is genuine qdisc handle consistent
2043                  * both with parent and child.
2044                  *
2045                  * TC_H_MAJ(portid) still may be unspecified, complete it now.
2046                  */
2047                 if (portid)
2048                         portid = TC_H_MAKE(qid, portid);
2049         } else {
2050                 if (qid == 0)
2051                         qid = dev->qdisc->handle;
2052         }
2053
2054         /* OK. Locate qdisc */
2055         q = qdisc_lookup(dev, qid);
2056         if (!q)
2057                 return -ENOENT;
2058
2059         /* An check that it supports classes */
2060         cops = q->ops->cl_ops;
2061         if (cops == NULL)
2062                 return -EINVAL;
2063
2064         /* Now try to get class */
2065         if (clid == 0) {
2066                 if (portid == TC_H_ROOT)
2067                         clid = qid;
2068         } else
2069                 clid = TC_H_MAKE(qid, clid);
2070
2071         if (clid)
2072                 cl = cops->find(q, clid);
2073
2074         if (cl == 0) {
2075                 err = -ENOENT;
2076                 if (n->nlmsg_type != RTM_NEWTCLASS ||
2077                     !(n->nlmsg_flags & NLM_F_CREATE))
2078                         goto out;
2079         } else {
2080                 switch (n->nlmsg_type) {
2081                 case RTM_NEWTCLASS:
2082                         err = -EEXIST;
2083                         if (n->nlmsg_flags & NLM_F_EXCL)
2084                                 goto out;
2085                         break;
2086                 case RTM_DELTCLASS:
2087                         err = tclass_del_notify(net, cops, skb, n, q, cl);
2088                         /* Unbind the class with flilters with 0 */
2089                         tc_bind_tclass(q, portid, clid, 0);
2090                         goto out;
2091                 case RTM_GETTCLASS:
2092                         err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
2093                         goto out;
2094                 default:
2095                         err = -EINVAL;
2096                         goto out;
2097                 }
2098         }
2099
2100         if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
2101                 NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
2102                 return -EOPNOTSUPP;
2103         }
2104
2105         new_cl = cl;
2106         err = -EOPNOTSUPP;
2107         if (cops->change)
2108                 err = cops->change(q, clid, portid, tca, &new_cl, extack);
2109         if (err == 0) {
2110                 tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
2111                 /* We just create a new class, need to do reverse binding. */
2112                 if (cl != new_cl)
2113                         tc_bind_tclass(q, portid, clid, new_cl);
2114         }
2115 out:
2116         return err;
2117 }
2118
2119 struct qdisc_dump_args {
2120         struct qdisc_walker     w;
2121         struct sk_buff          *skb;
2122         struct netlink_callback *cb;
2123 };
2124
2125 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
2126                             struct qdisc_walker *arg)
2127 {
2128         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
2129
2130         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2131                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
2132                               RTM_NEWTCLASS);
2133 }
2134
2135 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
2136                                 struct tcmsg *tcm, struct netlink_callback *cb,
2137                                 int *t_p, int s_t)
2138 {
2139         struct qdisc_dump_args arg;
2140
2141         if (tc_qdisc_dump_ignore(q, false) ||
2142             *t_p < s_t || !q->ops->cl_ops ||
2143             (tcm->tcm_parent &&
2144              TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
2145                 (*t_p)++;
2146                 return 0;
2147         }
2148         if (*t_p > s_t)
2149                 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
2150         arg.w.fn = qdisc_class_dump;
2151         arg.skb = skb;
2152         arg.cb = cb;
2153         arg.w.stop  = 0;
2154         arg.w.skip = cb->args[1];
2155         arg.w.count = 0;
2156         q->ops->cl_ops->walk(q, &arg.w);
2157         cb->args[1] = arg.w.count;
2158         if (arg.w.stop)
2159                 return -1;
2160         (*t_p)++;
2161         return 0;
2162 }
2163
2164 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
2165                                struct tcmsg *tcm, struct netlink_callback *cb,
2166                                int *t_p, int s_t)
2167 {
2168         struct Qdisc *q;
2169         int b;
2170
2171         if (!root)
2172                 return 0;
2173
2174         if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
2175                 return -1;
2176
2177         if (!qdisc_dev(root))
2178                 return 0;
2179
2180         if (tcm->tcm_parent) {
2181                 q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
2182                 if (q && q != root &&
2183                     tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2184                         return -1;
2185                 return 0;
2186         }
2187         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
2188                 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2189                         return -1;
2190         }
2191
2192         return 0;
2193 }
2194
2195 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
2196 {
2197         struct tcmsg *tcm = nlmsg_data(cb->nlh);
2198         struct net *net = sock_net(skb->sk);
2199         struct netdev_queue *dev_queue;
2200         struct net_device *dev;
2201         int t, s_t;
2202
2203         if (nlmsg_len(cb->nlh) < sizeof(*tcm))
2204                 return 0;
2205         dev = dev_get_by_index(net, tcm->tcm_ifindex);
2206         if (!dev)
2207                 return 0;
2208
2209         s_t = cb->args[0];
2210         t = 0;
2211
2212         if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
2213                 goto done;
2214
2215         dev_queue = dev_ingress_queue(dev);
2216         if (dev_queue &&
2217             tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
2218                                 &t, s_t) < 0)
2219                 goto done;
2220
2221 done:
2222         cb->args[0] = t;
2223
2224         dev_put(dev);
2225         return skb->len;
2226 }
2227
2228 #ifdef CONFIG_PROC_FS
2229 static int psched_show(struct seq_file *seq, void *v)
2230 {
2231         seq_printf(seq, "%08x %08x %08x %08x\n",
2232                    (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2233                    1000000,
2234                    (u32)NSEC_PER_SEC / hrtimer_resolution);
2235
2236         return 0;
2237 }
2238
2239 static int __net_init psched_net_init(struct net *net)
2240 {
2241         struct proc_dir_entry *e;
2242
2243         e = proc_create_single("psched", 0, net->proc_net, psched_show);
2244         if (e == NULL)
2245                 return -ENOMEM;
2246
2247         return 0;
2248 }
2249
2250 static void __net_exit psched_net_exit(struct net *net)
2251 {
2252         remove_proc_entry("psched", net->proc_net);
2253 }
2254 #else
2255 static int __net_init psched_net_init(struct net *net)
2256 {
2257         return 0;
2258 }
2259
2260 static void __net_exit psched_net_exit(struct net *net)
2261 {
2262 }
2263 #endif
2264
2265 static struct pernet_operations psched_net_ops = {
2266         .init = psched_net_init,
2267         .exit = psched_net_exit,
2268 };
2269
2270 static int __init pktsched_init(void)
2271 {
2272         int err;
2273
2274         err = register_pernet_subsys(&psched_net_ops);
2275         if (err) {
2276                 pr_err("pktsched_init: "
2277                        "cannot initialize per netns operations\n");
2278                 return err;
2279         }
2280
2281         register_qdisc(&pfifo_fast_ops);
2282         register_qdisc(&pfifo_qdisc_ops);
2283         register_qdisc(&bfifo_qdisc_ops);
2284         register_qdisc(&pfifo_head_drop_qdisc_ops);
2285         register_qdisc(&mq_qdisc_ops);
2286         register_qdisc(&noqueue_qdisc_ops);
2287
2288         rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
2289         rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
2290         rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
2291                       0);
2292         rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
2293         rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
2294         rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
2295                       0);
2296
2297         return 0;
2298 }
2299
2300 subsys_initcall(pktsched_init);