Merge tag 'arm-soc-fixes-5.15-3' of git://git.kernel.org/pub/scm/linux/kernel/git...
[platform/kernel/linux-starfive.git] / net / sched / sch_api.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_api.c  Packet scheduler API.
4  *
5  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
6  *
7  * Fixes:
8  *
9  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
10  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
11  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
12  */
13
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/string.h>
18 #include <linux/errno.h>
19 #include <linux/skbuff.h>
20 #include <linux/init.h>
21 #include <linux/proc_fs.h>
22 #include <linux/seq_file.h>
23 #include <linux/kmod.h>
24 #include <linux/list.h>
25 #include <linux/hrtimer.h>
26 #include <linux/slab.h>
27 #include <linux/hashtable.h>
28
29 #include <net/net_namespace.h>
30 #include <net/sock.h>
31 #include <net/netlink.h>
32 #include <net/pkt_sched.h>
33 #include <net/pkt_cls.h>
34
35 #include <trace/events/qdisc.h>
36
37 /*
38
39    Short review.
40    -------------
41
42    This file consists of two interrelated parts:
43
44    1. queueing disciplines manager frontend.
45    2. traffic classes manager frontend.
46
47    Generally, queueing discipline ("qdisc") is a black box,
48    which is able to enqueue packets and to dequeue them (when
49    device is ready to send something) in order and at times
50    determined by algorithm hidden in it.
51
52    qdiscs are divided into two categories:
53    - "queues", which have no internal structure visible from outside.
54    - "schedulers", which split all the packets to "traffic classes",
55      using "packet classifiers" (look at cls_api.c)
56
57    In turn, classes may have child qdiscs (as rule, queues)
58    attached to them etc. etc. etc.
59
60    The goal of the routines in this file is to translate
61    information supplied by user in the form of handles
62    to more intelligible for kernel form, to make some sanity
63    checks and part of work, which is common to all qdiscs
64    and to provide rtnetlink notifications.
65
66    All real intelligent work is done inside qdisc modules.
67
68
69
70    Every discipline has two major routines: enqueue and dequeue.
71
72    ---dequeue
73
74    dequeue usually returns a skb to send. It is allowed to return NULL,
75    but it does not mean that queue is empty, it just means that
76    discipline does not want to send anything this time.
77    Queue is really empty if q->q.qlen == 0.
78    For complicated disciplines with multiple queues, q->q is not
79    a real packet queue; however, q->q.qlen must still be valid.
80
81    ---enqueue
82
83    enqueue returns 0 if the packet was enqueued successfully.
84    If a packet (this one or another one) was dropped, it returns
85    a non-zero error code.
86    NET_XMIT_DROP        - this packet dropped
87      Expected action: do not backoff, but wait until queue will clear.
88    NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
89      Expected action: backoff or ignore
90
91    Auxiliary routines:
92
93    ---peek
94
95    like dequeue but without removing a packet from the queue
96
97    ---reset
98
99    returns qdisc to initial state: purge all buffers, clear all
100    timers, counters (except for statistics) etc.
101
102    ---init
103
104    initializes newly created qdisc.
105
106    ---destroy
107
108    destroys resources allocated by init and during lifetime of qdisc.
109
110    ---change
111
112    changes qdisc parameters.
113  */
114
115 /* Protects the list of registered TC modules. It is a pure SMP lock. */
116 static DEFINE_RWLOCK(qdisc_mod_lock);
117
118
119 /************************************************
120  *      Queueing disciplines manipulation.      *
121  ************************************************/
122
123
124 /* The list of all installed queueing disciplines. */
125
126 static struct Qdisc_ops *qdisc_base;
127
128 /* Register/unregister queueing discipline */
129
130 int register_qdisc(struct Qdisc_ops *qops)
131 {
132         struct Qdisc_ops *q, **qp;
133         int rc = -EEXIST;
134
135         write_lock(&qdisc_mod_lock);
136         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
137                 if (!strcmp(qops->id, q->id))
138                         goto out;
139
140         if (qops->enqueue == NULL)
141                 qops->enqueue = noop_qdisc_ops.enqueue;
142         if (qops->peek == NULL) {
143                 if (qops->dequeue == NULL)
144                         qops->peek = noop_qdisc_ops.peek;
145                 else
146                         goto out_einval;
147         }
148         if (qops->dequeue == NULL)
149                 qops->dequeue = noop_qdisc_ops.dequeue;
150
151         if (qops->cl_ops) {
152                 const struct Qdisc_class_ops *cops = qops->cl_ops;
153
154                 if (!(cops->find && cops->walk && cops->leaf))
155                         goto out_einval;
156
157                 if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
158                         goto out_einval;
159         }
160
161         qops->next = NULL;
162         *qp = qops;
163         rc = 0;
164 out:
165         write_unlock(&qdisc_mod_lock);
166         return rc;
167
168 out_einval:
169         rc = -EINVAL;
170         goto out;
171 }
172 EXPORT_SYMBOL(register_qdisc);
173
174 int unregister_qdisc(struct Qdisc_ops *qops)
175 {
176         struct Qdisc_ops *q, **qp;
177         int err = -ENOENT;
178
179         write_lock(&qdisc_mod_lock);
180         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
181                 if (q == qops)
182                         break;
183         if (q) {
184                 *qp = q->next;
185                 q->next = NULL;
186                 err = 0;
187         }
188         write_unlock(&qdisc_mod_lock);
189         return err;
190 }
191 EXPORT_SYMBOL(unregister_qdisc);
192
193 /* Get default qdisc if not otherwise specified */
194 void qdisc_get_default(char *name, size_t len)
195 {
196         read_lock(&qdisc_mod_lock);
197         strlcpy(name, default_qdisc_ops->id, len);
198         read_unlock(&qdisc_mod_lock);
199 }
200
201 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
202 {
203         struct Qdisc_ops *q = NULL;
204
205         for (q = qdisc_base; q; q = q->next) {
206                 if (!strcmp(name, q->id)) {
207                         if (!try_module_get(q->owner))
208                                 q = NULL;
209                         break;
210                 }
211         }
212
213         return q;
214 }
215
216 /* Set new default qdisc to use */
217 int qdisc_set_default(const char *name)
218 {
219         const struct Qdisc_ops *ops;
220
221         if (!capable(CAP_NET_ADMIN))
222                 return -EPERM;
223
224         write_lock(&qdisc_mod_lock);
225         ops = qdisc_lookup_default(name);
226         if (!ops) {
227                 /* Not found, drop lock and try to load module */
228                 write_unlock(&qdisc_mod_lock);
229                 request_module("sch_%s", name);
230                 write_lock(&qdisc_mod_lock);
231
232                 ops = qdisc_lookup_default(name);
233         }
234
235         if (ops) {
236                 /* Set new default */
237                 module_put(default_qdisc_ops->owner);
238                 default_qdisc_ops = ops;
239         }
240         write_unlock(&qdisc_mod_lock);
241
242         return ops ? 0 : -ENOENT;
243 }
244
#ifdef CONFIG_NET_SCH_DEFAULT
/* Late initcall: install the qdisc named by CONFIG_DEFAULT_NET_SCH as the
 * default and return the status reported by qdisc_set_default().
 */
static int __init sch_default_qdisc(void)
{
	int err = qdisc_set_default(CONFIG_DEFAULT_NET_SCH);

	return err;
}
late_initcall(sch_default_qdisc);
#endif
253
254 /* We know handle. Find qdisc among all qdisc's attached to device
255  * (root qdisc, all its children, children of children etc.)
256  * Note: caller either uses rtnl or rcu_read_lock()
257  */
258
259 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
260 {
261         struct Qdisc *q;
262
263         if (!qdisc_dev(root))
264                 return (root->handle == handle ? root : NULL);
265
266         if (!(root->flags & TCQ_F_BUILTIN) &&
267             root->handle == handle)
268                 return root;
269
270         hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle,
271                                    lockdep_rtnl_is_held()) {
272                 if (q->handle == handle)
273                         return q;
274         }
275         return NULL;
276 }
277
278 void qdisc_hash_add(struct Qdisc *q, bool invisible)
279 {
280         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
281                 ASSERT_RTNL();
282                 hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
283                 if (invisible)
284                         q->flags |= TCQ_F_INVISIBLE;
285         }
286 }
287 EXPORT_SYMBOL(qdisc_hash_add);
288
289 void qdisc_hash_del(struct Qdisc *q)
290 {
291         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
292                 ASSERT_RTNL();
293                 hash_del_rcu(&q->hash);
294         }
295 }
296 EXPORT_SYMBOL(qdisc_hash_del);
297
298 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
299 {
300         struct Qdisc *q;
301
302         if (!handle)
303                 return NULL;
304         q = qdisc_match_from_root(dev->qdisc, handle);
305         if (q)
306                 goto out;
307
308         if (dev_ingress_queue(dev))
309                 q = qdisc_match_from_root(
310                         dev_ingress_queue(dev)->qdisc_sleeping,
311                         handle);
312 out:
313         return q;
314 }
315
316 struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
317 {
318         struct netdev_queue *nq;
319         struct Qdisc *q;
320
321         if (!handle)
322                 return NULL;
323         q = qdisc_match_from_root(dev->qdisc, handle);
324         if (q)
325                 goto out;
326
327         nq = dev_ingress_queue_rcu(dev);
328         if (nq)
329                 q = qdisc_match_from_root(nq->qdisc_sleeping, handle);
330 out:
331         return q;
332 }
333
334 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
335 {
336         unsigned long cl;
337         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
338
339         if (cops == NULL)
340                 return NULL;
341         cl = cops->find(p, classid);
342
343         if (cl == 0)
344                 return NULL;
345         return cops->leaf(p, cl);
346 }
347
348 /* Find queueing discipline by name */
349
350 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
351 {
352         struct Qdisc_ops *q = NULL;
353
354         if (kind) {
355                 read_lock(&qdisc_mod_lock);
356                 for (q = qdisc_base; q; q = q->next) {
357                         if (nla_strcmp(kind, q->id) == 0) {
358                                 if (!try_module_get(q->owner))
359                                         q = NULL;
360                                 break;
361                         }
362                 }
363                 read_unlock(&qdisc_mod_lock);
364         }
365         return q;
366 }
367
368 /* The linklayer setting were not transferred from iproute2, in older
369  * versions, and the rate tables lookup systems have been dropped in
370  * the kernel. To keep backward compatible with older iproute2 tc
371  * utils, we detect the linklayer setting by detecting if the rate
372  * table were modified.
373  *
374  * For linklayer ATM table entries, the rate table will be aligned to
375  * 48 bytes, thus some table entries will contain the same value.  The
376  * mpu (min packet unit) is also encoded into the old rate table, thus
377  * starting from the mpu, we find low and high table entries for
378  * mapping this cell.  If these entries contain the same value, when
379  * the rate tables have been modified for linklayer ATM.
380  *
381  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
382  * and then roundup to the next cell, calc the table entry one below,
383  * and compare.
384  */
385 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
386 {
387         int low       = roundup(r->mpu, 48);
388         int high      = roundup(low+1, 48);
389         int cell_low  = low >> r->cell_log;
390         int cell_high = (high >> r->cell_log) - 1;
391
392         /* rtab is too inaccurate at rates > 100Mbit/s */
393         if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
394                 pr_debug("TC linklayer: Giving up ATM detection\n");
395                 return TC_LINKLAYER_ETHERNET;
396         }
397
398         if ((cell_high > cell_low) && (cell_high < 256)
399             && (rtab[cell_low] == rtab[cell_high])) {
400                 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
401                          cell_low, cell_high, rtab[cell_high]);
402                 return TC_LINKLAYER_ATM;
403         }
404         return TC_LINKLAYER_ETHERNET;
405 }
406
407 static struct qdisc_rate_table *qdisc_rtab_list;
408
409 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
410                                         struct nlattr *tab,
411                                         struct netlink_ext_ack *extack)
412 {
413         struct qdisc_rate_table *rtab;
414
415         if (tab == NULL || r->rate == 0 ||
416             r->cell_log == 0 || r->cell_log >= 32 ||
417             nla_len(tab) != TC_RTAB_SIZE) {
418                 NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
419                 return NULL;
420         }
421
422         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
423                 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
424                     !memcmp(&rtab->data, nla_data(tab), 1024)) {
425                         rtab->refcnt++;
426                         return rtab;
427                 }
428         }
429
430         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
431         if (rtab) {
432                 rtab->rate = *r;
433                 rtab->refcnt = 1;
434                 memcpy(rtab->data, nla_data(tab), 1024);
435                 if (r->linklayer == TC_LINKLAYER_UNAWARE)
436                         r->linklayer = __detect_linklayer(r, rtab->data);
437                 rtab->next = qdisc_rtab_list;
438                 qdisc_rtab_list = rtab;
439         } else {
440                 NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
441         }
442         return rtab;
443 }
444 EXPORT_SYMBOL(qdisc_get_rtab);
445
446 void qdisc_put_rtab(struct qdisc_rate_table *tab)
447 {
448         struct qdisc_rate_table *rtab, **rtabp;
449
450         if (!tab || --tab->refcnt)
451                 return;
452
453         for (rtabp = &qdisc_rtab_list;
454              (rtab = *rtabp) != NULL;
455              rtabp = &rtab->next) {
456                 if (rtab == tab) {
457                         *rtabp = rtab->next;
458                         kfree(rtab);
459                         return;
460                 }
461         }
462 }
463 EXPORT_SYMBOL(qdisc_put_rtab);
464
465 static LIST_HEAD(qdisc_stab_list);
466
467 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
468         [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
469         [TCA_STAB_DATA] = { .type = NLA_BINARY },
470 };
471
472 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
473                                                struct netlink_ext_ack *extack)
474 {
475         struct nlattr *tb[TCA_STAB_MAX + 1];
476         struct qdisc_size_table *stab;
477         struct tc_sizespec *s;
478         unsigned int tsize = 0;
479         u16 *tab = NULL;
480         int err;
481
482         err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
483                                           extack);
484         if (err < 0)
485                 return ERR_PTR(err);
486         if (!tb[TCA_STAB_BASE]) {
487                 NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
488                 return ERR_PTR(-EINVAL);
489         }
490
491         s = nla_data(tb[TCA_STAB_BASE]);
492
493         if (s->tsize > 0) {
494                 if (!tb[TCA_STAB_DATA]) {
495                         NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
496                         return ERR_PTR(-EINVAL);
497                 }
498                 tab = nla_data(tb[TCA_STAB_DATA]);
499                 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
500         }
501
502         if (tsize != s->tsize || (!tab && tsize > 0)) {
503                 NL_SET_ERR_MSG(extack, "Invalid size of size table");
504                 return ERR_PTR(-EINVAL);
505         }
506
507         list_for_each_entry(stab, &qdisc_stab_list, list) {
508                 if (memcmp(&stab->szopts, s, sizeof(*s)))
509                         continue;
510                 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
511                         continue;
512                 stab->refcnt++;
513                 return stab;
514         }
515
516         if (s->size_log > STAB_SIZE_LOG_MAX ||
517             s->cell_log > STAB_SIZE_LOG_MAX) {
518                 NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table");
519                 return ERR_PTR(-EINVAL);
520         }
521
522         stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
523         if (!stab)
524                 return ERR_PTR(-ENOMEM);
525
526         stab->refcnt = 1;
527         stab->szopts = *s;
528         if (tsize > 0)
529                 memcpy(stab->data, tab, tsize * sizeof(u16));
530
531         list_add_tail(&stab->list, &qdisc_stab_list);
532
533         return stab;
534 }
535
536 void qdisc_put_stab(struct qdisc_size_table *tab)
537 {
538         if (!tab)
539                 return;
540
541         if (--tab->refcnt == 0) {
542                 list_del(&tab->list);
543                 kfree_rcu(tab, rcu);
544         }
545 }
546 EXPORT_SYMBOL(qdisc_put_stab);
547
548 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
549 {
550         struct nlattr *nest;
551
552         nest = nla_nest_start_noflag(skb, TCA_STAB);
553         if (nest == NULL)
554                 goto nla_put_failure;
555         if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
556                 goto nla_put_failure;
557         nla_nest_end(skb, nest);
558
559         return skb->len;
560
561 nla_put_failure:
562         return -1;
563 }
564
565 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
566                                const struct qdisc_size_table *stab)
567 {
568         int pkt_len, slot;
569
570         pkt_len = skb->len + stab->szopts.overhead;
571         if (unlikely(!stab->szopts.tsize))
572                 goto out;
573
574         slot = pkt_len + stab->szopts.cell_align;
575         if (unlikely(slot < 0))
576                 slot = 0;
577
578         slot >>= stab->szopts.cell_log;
579         if (likely(slot < stab->szopts.tsize))
580                 pkt_len = stab->data[slot];
581         else
582                 pkt_len = stab->data[stab->szopts.tsize - 1] *
583                                 (slot / stab->szopts.tsize) +
584                                 stab->data[slot % stab->szopts.tsize];
585
586         pkt_len <<= stab->szopts.size_log;
587 out:
588         if (unlikely(pkt_len < 1))
589                 pkt_len = 1;
590         qdisc_skb_cb(skb)->pkt_len = pkt_len;
591 }
592 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
593
594 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
595 {
596         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
597                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
598                         txt, qdisc->ops->id, qdisc->handle >> 16);
599                 qdisc->flags |= TCQ_F_WARN_NONWC;
600         }
601 }
602 EXPORT_SYMBOL(qdisc_warn_nonwc);
603
604 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
605 {
606         struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
607                                                  timer);
608
609         rcu_read_lock();
610         __netif_schedule(qdisc_root(wd->qdisc));
611         rcu_read_unlock();
612
613         return HRTIMER_NORESTART;
614 }
615
616 void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
617                                  clockid_t clockid)
618 {
619         hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
620         wd->timer.function = qdisc_watchdog;
621         wd->qdisc = qdisc;
622 }
623 EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
624
625 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
626 {
627         qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
628 }
629 EXPORT_SYMBOL(qdisc_watchdog_init);
630
631 void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires,
632                                       u64 delta_ns)
633 {
634         if (test_bit(__QDISC_STATE_DEACTIVATED,
635                      &qdisc_root_sleeping(wd->qdisc)->state))
636                 return;
637
638         if (hrtimer_is_queued(&wd->timer)) {
639                 /* If timer is already set in [expires, expires + delta_ns],
640                  * do not reprogram it.
641                  */
642                 if (wd->last_expires - expires <= delta_ns)
643                         return;
644         }
645
646         wd->last_expires = expires;
647         hrtimer_start_range_ns(&wd->timer,
648                                ns_to_ktime(expires),
649                                delta_ns,
650                                HRTIMER_MODE_ABS_PINNED);
651 }
652 EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns);
653
654 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
655 {
656         hrtimer_cancel(&wd->timer);
657 }
658 EXPORT_SYMBOL(qdisc_watchdog_cancel);
659
660 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
661 {
662         struct hlist_head *h;
663         unsigned int i;
664
665         h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
666
667         if (h != NULL) {
668                 for (i = 0; i < n; i++)
669                         INIT_HLIST_HEAD(&h[i]);
670         }
671         return h;
672 }
673
674 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
675 {
676         struct Qdisc_class_common *cl;
677         struct hlist_node *next;
678         struct hlist_head *nhash, *ohash;
679         unsigned int nsize, nmask, osize;
680         unsigned int i, h;
681
682         /* Rehash when load factor exceeds 0.75 */
683         if (clhash->hashelems * 4 <= clhash->hashsize * 3)
684                 return;
685         nsize = clhash->hashsize * 2;
686         nmask = nsize - 1;
687         nhash = qdisc_class_hash_alloc(nsize);
688         if (nhash == NULL)
689                 return;
690
691         ohash = clhash->hash;
692         osize = clhash->hashsize;
693
694         sch_tree_lock(sch);
695         for (i = 0; i < osize; i++) {
696                 hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
697                         h = qdisc_class_hash(cl->classid, nmask);
698                         hlist_add_head(&cl->hnode, &nhash[h]);
699                 }
700         }
701         clhash->hash     = nhash;
702         clhash->hashsize = nsize;
703         clhash->hashmask = nmask;
704         sch_tree_unlock(sch);
705
706         kvfree(ohash);
707 }
708 EXPORT_SYMBOL(qdisc_class_hash_grow);
709
710 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
711 {
712         unsigned int size = 4;
713
714         clhash->hash = qdisc_class_hash_alloc(size);
715         if (!clhash->hash)
716                 return -ENOMEM;
717         clhash->hashsize  = size;
718         clhash->hashmask  = size - 1;
719         clhash->hashelems = 0;
720         return 0;
721 }
722 EXPORT_SYMBOL(qdisc_class_hash_init);
723
724 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
725 {
726         kvfree(clhash->hash);
727 }
728 EXPORT_SYMBOL(qdisc_class_hash_destroy);
729
730 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
731                              struct Qdisc_class_common *cl)
732 {
733         unsigned int h;
734
735         INIT_HLIST_NODE(&cl->hnode);
736         h = qdisc_class_hash(cl->classid, clhash->hashmask);
737         hlist_add_head(&cl->hnode, &clhash->hash[h]);
738         clhash->hashelems++;
739 }
740 EXPORT_SYMBOL(qdisc_class_hash_insert);
741
742 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
743                              struct Qdisc_class_common *cl)
744 {
745         hlist_del(&cl->hnode);
746         clhash->hashelems--;
747 }
748 EXPORT_SYMBOL(qdisc_class_hash_remove);
749
750 /* Allocate an unique handle from space managed by kernel
751  * Possible range is [8000-FFFF]:0000 (0x8000 values)
752  */
753 static u32 qdisc_alloc_handle(struct net_device *dev)
754 {
755         int i = 0x8000;
756         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
757
758         do {
759                 autohandle += TC_H_MAKE(0x10000U, 0);
760                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
761                         autohandle = TC_H_MAKE(0x80000000U, 0);
762                 if (!qdisc_lookup(dev, autohandle))
763                         return autohandle;
764                 cond_resched();
765         } while (--i > 0);
766
767         return 0;
768 }
769
770 void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
771 {
772         bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
773         const struct Qdisc_class_ops *cops;
774         unsigned long cl;
775         u32 parentid;
776         bool notify;
777         int drops;
778
779         if (n == 0 && len == 0)
780                 return;
781         drops = max_t(int, n, 0);
782         rcu_read_lock();
783         while ((parentid = sch->parent)) {
784                 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
785                         break;
786
787                 if (sch->flags & TCQ_F_NOPARENT)
788                         break;
789                 /* Notify parent qdisc only if child qdisc becomes empty.
790                  *
791                  * If child was empty even before update then backlog
792                  * counter is screwed and we skip notification because
793                  * parent class is already passive.
794                  *
795                  * If the original child was offloaded then it is allowed
796                  * to be seem as empty, so the parent is notified anyway.
797                  */
798                 notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
799                                                        !qdisc_is_offloaded);
800                 /* TODO: perform the search on a per txq basis */
801                 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
802                 if (sch == NULL) {
803                         WARN_ON_ONCE(parentid != TC_H_ROOT);
804                         break;
805                 }
806                 cops = sch->ops->cl_ops;
807                 if (notify && cops->qlen_notify) {
808                         cl = cops->find(sch, parentid);
809                         cops->qlen_notify(sch, cl);
810                 }
811                 sch->q.qlen -= n;
812                 sch->qstats.backlog -= len;
813                 __qdisc_qstats_drop(sch, drops);
814         }
815         rcu_read_unlock();
816 }
817 EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
818
819 int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
820                               void *type_data)
821 {
822         struct net_device *dev = qdisc_dev(sch);
823         int err;
824
825         sch->flags &= ~TCQ_F_OFFLOADED;
826         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
827                 return 0;
828
829         err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
830         if (err == -EOPNOTSUPP)
831                 return 0;
832
833         if (!err)
834                 sch->flags |= TCQ_F_OFFLOADED;
835
836         return err;
837 }
838 EXPORT_SYMBOL(qdisc_offload_dump_helper);
839
840 void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
841                                 struct Qdisc *new, struct Qdisc *old,
842                                 enum tc_setup_type type, void *type_data,
843                                 struct netlink_ext_ack *extack)
844 {
845         bool any_qdisc_is_offloaded;
846         int err;
847
848         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
849                 return;
850
851         err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
852
853         /* Don't report error if the graft is part of destroy operation. */
854         if (!err || !new || new == &noop_qdisc)
855                 return;
856
857         /* Don't report error if the parent, the old child and the new
858          * one are not offloaded.
859          */
860         any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
861         any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
862         any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;
863
864         if (any_qdisc_is_offloaded)
865                 NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
866 }
867 EXPORT_SYMBOL(qdisc_offload_graft_helper);
868
869 static void qdisc_offload_graft_root(struct net_device *dev,
870                                      struct Qdisc *new, struct Qdisc *old,
871                                      struct netlink_ext_ack *extack)
872 {
873         struct tc_root_qopt_offload graft_offload = {
874                 .command        = TC_ROOT_GRAFT,
875                 .handle         = new ? new->handle : 0,
876                 .ingress        = (new && new->flags & TCQ_F_INGRESS) ||
877                                   (old && old->flags & TCQ_F_INGRESS),
878         };
879
880         qdisc_offload_graft_helper(dev, NULL, new, old,
881                                    TC_SETUP_ROOT_QDISC, &graft_offload, extack);
882 }
883
884 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
885                          u32 portid, u32 seq, u16 flags, int event)
886 {
887         struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
888         struct gnet_stats_queue __percpu *cpu_qstats = NULL;
889         struct tcmsg *tcm;
890         struct nlmsghdr  *nlh;
891         unsigned char *b = skb_tail_pointer(skb);
892         struct gnet_dump d;
893         struct qdisc_size_table *stab;
894         u32 block_index;
895         __u32 qlen;
896
897         cond_resched();
898         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
899         if (!nlh)
900                 goto out_nlmsg_trim;
901         tcm = nlmsg_data(nlh);
902         tcm->tcm_family = AF_UNSPEC;
903         tcm->tcm__pad1 = 0;
904         tcm->tcm__pad2 = 0;
905         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
906         tcm->tcm_parent = clid;
907         tcm->tcm_handle = q->handle;
908         tcm->tcm_info = refcount_read(&q->refcnt);
909         if (nla_put_string(skb, TCA_KIND, q->ops->id))
910                 goto nla_put_failure;
911         if (q->ops->ingress_block_get) {
912                 block_index = q->ops->ingress_block_get(q);
913                 if (block_index &&
914                     nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
915                         goto nla_put_failure;
916         }
917         if (q->ops->egress_block_get) {
918                 block_index = q->ops->egress_block_get(q);
919                 if (block_index &&
920                     nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
921                         goto nla_put_failure;
922         }
923         if (q->ops->dump && q->ops->dump(q, skb) < 0)
924                 goto nla_put_failure;
925         if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
926                 goto nla_put_failure;
927         qlen = qdisc_qlen_sum(q);
928
929         stab = rtnl_dereference(q->stab);
930         if (stab && qdisc_dump_stab(skb, stab) < 0)
931                 goto nla_put_failure;
932
933         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
934                                          NULL, &d, TCA_PAD) < 0)
935                 goto nla_put_failure;
936
937         if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
938                 goto nla_put_failure;
939
940         if (qdisc_is_percpu_stats(q)) {
941                 cpu_bstats = q->cpu_bstats;
942                 cpu_qstats = q->cpu_qstats;
943         }
944
945         if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
946                                   &d, cpu_bstats, &q->bstats) < 0 ||
947             gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
948             gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
949                 goto nla_put_failure;
950
951         if (gnet_stats_finish_copy(&d) < 0)
952                 goto nla_put_failure;
953
954         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
955         return skb->len;
956
957 out_nlmsg_trim:
958 nla_put_failure:
959         nlmsg_trim(skb, b);
960         return -1;
961 }
962
963 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
964 {
965         if (q->flags & TCQ_F_BUILTIN)
966                 return true;
967         if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
968                 return true;
969
970         return false;
971 }
972
973 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
974                         struct nlmsghdr *n, u32 clid,
975                         struct Qdisc *old, struct Qdisc *new)
976 {
977         struct sk_buff *skb;
978         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
979
980         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
981         if (!skb)
982                 return -ENOBUFS;
983
984         if (old && !tc_qdisc_dump_ignore(old, false)) {
985                 if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
986                                   0, RTM_DELQDISC) < 0)
987                         goto err_out;
988         }
989         if (new && !tc_qdisc_dump_ignore(new, false)) {
990                 if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
991                                   old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
992                         goto err_out;
993         }
994
995         if (skb->len)
996                 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
997                                       n->nlmsg_flags & NLM_F_ECHO);
998
999 err_out:
1000         kfree_skb(skb);
1001         return -EINVAL;
1002 }
1003
1004 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
1005                                struct nlmsghdr *n, u32 clid,
1006                                struct Qdisc *old, struct Qdisc *new)
1007 {
1008         if (new || old)
1009                 qdisc_notify(net, skb, n, clid, old, new);
1010
1011         if (old)
1012                 qdisc_put(old);
1013 }
1014
1015 static void qdisc_clear_nolock(struct Qdisc *sch)
1016 {
1017         sch->flags &= ~TCQ_F_NOLOCK;
1018         if (!(sch->flags & TCQ_F_CPUSTATS))
1019                 return;
1020
1021         free_percpu(sch->cpu_bstats);
1022         free_percpu(sch->cpu_qstats);
1023         sch->cpu_bstats = NULL;
1024         sch->cpu_qstats = NULL;
1025         sch->flags &= ~TCQ_F_CPUSTATS;
1026 }
1027
1028 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
1029  * to device "dev".
1030  *
1031  * When appropriate send a netlink notification using 'skb'
1032  * and "n".
1033  *
1034  * On success, destroy old qdisc.
1035  */
1036
1037 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
1038                        struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
1039                        struct Qdisc *new, struct Qdisc *old,
1040                        struct netlink_ext_ack *extack)
1041 {
1042         struct Qdisc *q = old;
1043         struct net *net = dev_net(dev);
1044
1045         if (parent == NULL) {
1046                 unsigned int i, num_q, ingress;
1047
1048                 ingress = 0;
1049                 num_q = dev->num_tx_queues;
1050                 if ((q && q->flags & TCQ_F_INGRESS) ||
1051                     (new && new->flags & TCQ_F_INGRESS)) {
1052                         num_q = 1;
1053                         ingress = 1;
1054                         if (!dev_ingress_queue(dev)) {
1055                                 NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
1056                                 return -ENOENT;
1057                         }
1058                 }
1059
1060                 if (dev->flags & IFF_UP)
1061                         dev_deactivate(dev);
1062
1063                 qdisc_offload_graft_root(dev, new, old, extack);
1064
1065                 if (new && new->ops->attach)
1066                         goto skip;
1067
1068                 for (i = 0; i < num_q; i++) {
1069                         struct netdev_queue *dev_queue = dev_ingress_queue(dev);
1070
1071                         if (!ingress)
1072                                 dev_queue = netdev_get_tx_queue(dev, i);
1073
1074                         old = dev_graft_qdisc(dev_queue, new);
1075                         if (new && i > 0)
1076                                 qdisc_refcount_inc(new);
1077
1078                         if (!ingress)
1079                                 qdisc_put(old);
1080                 }
1081
1082 skip:
1083                 if (!ingress) {
1084                         notify_and_destroy(net, skb, n, classid,
1085                                            dev->qdisc, new);
1086                         if (new && !new->ops->attach)
1087                                 qdisc_refcount_inc(new);
1088                         dev->qdisc = new ? : &noop_qdisc;
1089
1090                         if (new && new->ops->attach)
1091                                 new->ops->attach(new);
1092                 } else {
1093                         notify_and_destroy(net, skb, n, classid, old, new);
1094                 }
1095
1096                 if (dev->flags & IFF_UP)
1097                         dev_activate(dev);
1098         } else {
1099                 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
1100                 unsigned long cl;
1101                 int err;
1102
1103                 /* Only support running class lockless if parent is lockless */
1104                 if (new && (new->flags & TCQ_F_NOLOCK) && !(parent->flags & TCQ_F_NOLOCK))
1105                         qdisc_clear_nolock(new);
1106
1107                 if (!cops || !cops->graft)
1108                         return -EOPNOTSUPP;
1109
1110                 cl = cops->find(parent, classid);
1111                 if (!cl) {
1112                         NL_SET_ERR_MSG(extack, "Specified class not found");
1113                         return -ENOENT;
1114                 }
1115
1116                 err = cops->graft(parent, cl, new, &old, extack);
1117                 if (err)
1118                         return err;
1119                 notify_and_destroy(net, skb, n, classid, old, new);
1120         }
1121         return 0;
1122 }
1123
1124 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1125                                    struct netlink_ext_ack *extack)
1126 {
1127         u32 block_index;
1128
1129         if (tca[TCA_INGRESS_BLOCK]) {
1130                 block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1131
1132                 if (!block_index) {
1133                         NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1134                         return -EINVAL;
1135                 }
1136                 if (!sch->ops->ingress_block_set) {
1137                         NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1138                         return -EOPNOTSUPP;
1139                 }
1140                 sch->ops->ingress_block_set(sch, block_index);
1141         }
1142         if (tca[TCA_EGRESS_BLOCK]) {
1143                 block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1144
1145                 if (!block_index) {
1146                         NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1147                         return -EINVAL;
1148                 }
1149                 if (!sch->ops->egress_block_set) {
1150                         NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1151                         return -EOPNOTSUPP;
1152                 }
1153                 sch->ops->egress_block_set(sch, block_index);
1154         }
1155         return 0;
1156 }
1157
1158 /*
1159    Allocate and initialize new qdisc.
1160
1161    Parameters are passed via opt.
1162  */
1163
1164 static struct Qdisc *qdisc_create(struct net_device *dev,
1165                                   struct netdev_queue *dev_queue,
1166                                   struct Qdisc *p, u32 parent, u32 handle,
1167                                   struct nlattr **tca, int *errp,
1168                                   struct netlink_ext_ack *extack)
1169 {
1170         int err;
1171         struct nlattr *kind = tca[TCA_KIND];
1172         struct Qdisc *sch;
1173         struct Qdisc_ops *ops;
1174         struct qdisc_size_table *stab;
1175
1176         ops = qdisc_lookup_ops(kind);
1177 #ifdef CONFIG_MODULES
1178         if (ops == NULL && kind != NULL) {
1179                 char name[IFNAMSIZ];
1180                 if (nla_strscpy(name, kind, IFNAMSIZ) >= 0) {
1181                         /* We dropped the RTNL semaphore in order to
1182                          * perform the module load.  So, even if we
1183                          * succeeded in loading the module we have to
1184                          * tell the caller to replay the request.  We
1185                          * indicate this using -EAGAIN.
1186                          * We replay the request because the device may
1187                          * go away in the mean time.
1188                          */
1189                         rtnl_unlock();
1190                         request_module("sch_%s", name);
1191                         rtnl_lock();
1192                         ops = qdisc_lookup_ops(kind);
1193                         if (ops != NULL) {
1194                                 /* We will try again qdisc_lookup_ops,
1195                                  * so don't keep a reference.
1196                                  */
1197                                 module_put(ops->owner);
1198                                 err = -EAGAIN;
1199                                 goto err_out;
1200                         }
1201                 }
1202         }
1203 #endif
1204
1205         err = -ENOENT;
1206         if (!ops) {
1207                 NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1208                 goto err_out;
1209         }
1210
1211         sch = qdisc_alloc(dev_queue, ops, extack);
1212         if (IS_ERR(sch)) {
1213                 err = PTR_ERR(sch);
1214                 goto err_out2;
1215         }
1216
1217         sch->parent = parent;
1218
1219         if (handle == TC_H_INGRESS) {
1220                 sch->flags |= TCQ_F_INGRESS;
1221                 handle = TC_H_MAKE(TC_H_INGRESS, 0);
1222         } else {
1223                 if (handle == 0) {
1224                         handle = qdisc_alloc_handle(dev);
1225                         if (handle == 0) {
1226                                 NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
1227                                 err = -ENOSPC;
1228                                 goto err_out3;
1229                         }
1230                 }
1231                 if (!netif_is_multiqueue(dev))
1232                         sch->flags |= TCQ_F_ONETXQUEUE;
1233         }
1234
1235         sch->handle = handle;
1236
1237         /* This exist to keep backward compatible with a userspace
1238          * loophole, what allowed userspace to get IFF_NO_QUEUE
1239          * facility on older kernels by setting tx_queue_len=0 (prior
1240          * to qdisc init), and then forgot to reinit tx_queue_len
1241          * before again attaching a qdisc.
1242          */
1243         if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
1244                 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
1245                 netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
1246         }
1247
1248         err = qdisc_block_indexes_set(sch, tca, extack);
1249         if (err)
1250                 goto err_out3;
1251
1252         if (ops->init) {
1253                 err = ops->init(sch, tca[TCA_OPTIONS], extack);
1254                 if (err != 0)
1255                         goto err_out5;
1256         }
1257
1258         if (tca[TCA_STAB]) {
1259                 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1260                 if (IS_ERR(stab)) {
1261                         err = PTR_ERR(stab);
1262                         goto err_out4;
1263                 }
1264                 rcu_assign_pointer(sch->stab, stab);
1265         }
1266         if (tca[TCA_RATE]) {
1267                 seqcount_t *running;
1268
1269                 err = -EOPNOTSUPP;
1270                 if (sch->flags & TCQ_F_MQROOT) {
1271                         NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
1272                         goto err_out4;
1273                 }
1274
1275                 if (sch->parent != TC_H_ROOT &&
1276                     !(sch->flags & TCQ_F_INGRESS) &&
1277                     (!p || !(p->flags & TCQ_F_MQROOT)))
1278                         running = qdisc_root_sleeping_running(sch);
1279                 else
1280                         running = &sch->running;
1281
1282                 err = gen_new_estimator(&sch->bstats,
1283                                         sch->cpu_bstats,
1284                                         &sch->rate_est,
1285                                         NULL,
1286                                         running,
1287                                         tca[TCA_RATE]);
1288                 if (err) {
1289                         NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
1290                         goto err_out4;
1291                 }
1292         }
1293
1294         qdisc_hash_add(sch, false);
1295         trace_qdisc_create(ops, dev, parent);
1296
1297         return sch;
1298
1299 err_out5:
1300         /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
1301         if (ops->destroy)
1302                 ops->destroy(sch);
1303 err_out3:
1304         dev_put(dev);
1305         qdisc_free(sch);
1306 err_out2:
1307         module_put(ops->owner);
1308 err_out:
1309         *errp = err;
1310         return NULL;
1311
1312 err_out4:
1313         /*
1314          * Any broken qdiscs that would require a ops->reset() here?
1315          * The qdisc was never in action so it shouldn't be necessary.
1316          */
1317         qdisc_put_stab(rtnl_dereference(sch->stab));
1318         if (ops->destroy)
1319                 ops->destroy(sch);
1320         goto err_out3;
1321 }
1322
1323 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
1324                         struct netlink_ext_ack *extack)
1325 {
1326         struct qdisc_size_table *ostab, *stab = NULL;
1327         int err = 0;
1328
1329         if (tca[TCA_OPTIONS]) {
1330                 if (!sch->ops->change) {
1331                         NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
1332                         return -EINVAL;
1333                 }
1334                 if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1335                         NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
1336                         return -EOPNOTSUPP;
1337                 }
1338                 err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
1339                 if (err)
1340                         return err;
1341         }
1342
1343         if (tca[TCA_STAB]) {
1344                 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1345                 if (IS_ERR(stab))
1346                         return PTR_ERR(stab);
1347         }
1348
1349         ostab = rtnl_dereference(sch->stab);
1350         rcu_assign_pointer(sch->stab, stab);
1351         qdisc_put_stab(ostab);
1352
1353         if (tca[TCA_RATE]) {
1354                 /* NB: ignores errors from replace_estimator
1355                    because change can't be undone. */
1356                 if (sch->flags & TCQ_F_MQROOT)
1357                         goto out;
1358                 gen_replace_estimator(&sch->bstats,
1359                                       sch->cpu_bstats,
1360                                       &sch->rate_est,
1361                                       NULL,
1362                                       qdisc_root_sleeping_running(sch),
1363                                       tca[TCA_RATE]);
1364         }
1365 out:
1366         return 0;
1367 }
1368
1369 struct check_loop_arg {
1370         struct qdisc_walker     w;
1371         struct Qdisc            *p;
1372         int                     depth;
1373 };
1374
1375 static int check_loop_fn(struct Qdisc *q, unsigned long cl,
1376                          struct qdisc_walker *w);
1377
1378 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1379 {
1380         struct check_loop_arg   arg;
1381
1382         if (q->ops->cl_ops == NULL)
1383                 return 0;
1384
1385         arg.w.stop = arg.w.skip = arg.w.count = 0;
1386         arg.w.fn = check_loop_fn;
1387         arg.depth = depth;
1388         arg.p = p;
1389         q->ops->cl_ops->walk(q, &arg.w);
1390         return arg.w.stop ? -ELOOP : 0;
1391 }
1392
1393 static int
1394 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1395 {
1396         struct Qdisc *leaf;
1397         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1398         struct check_loop_arg *arg = (struct check_loop_arg *)w;
1399
1400         leaf = cops->leaf(q, cl);
1401         if (leaf) {
1402                 if (leaf == arg->p || arg->depth > 7)
1403                         return -ELOOP;
1404                 return check_loop(leaf, arg->p, arg->depth + 1);
1405         }
1406         return 0;
1407 }
1408
1409 const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
1410         [TCA_KIND]              = { .type = NLA_STRING },
1411         [TCA_RATE]              = { .type = NLA_BINARY,
1412                                     .len = sizeof(struct tc_estimator) },
1413         [TCA_STAB]              = { .type = NLA_NESTED },
1414         [TCA_DUMP_INVISIBLE]    = { .type = NLA_FLAG },
1415         [TCA_CHAIN]             = { .type = NLA_U32 },
1416         [TCA_INGRESS_BLOCK]     = { .type = NLA_U32 },
1417         [TCA_EGRESS_BLOCK]      = { .type = NLA_U32 },
1418 };
1419
1420 /*
1421  * Delete/get qdisc.
1422  */
1423
1424 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1425                         struct netlink_ext_ack *extack)
1426 {
1427         struct net *net = sock_net(skb->sk);
1428         struct tcmsg *tcm = nlmsg_data(n);
1429         struct nlattr *tca[TCA_MAX + 1];
1430         struct net_device *dev;
1431         u32 clid;
1432         struct Qdisc *q = NULL;
1433         struct Qdisc *p = NULL;
1434         int err;
1435
1436         if ((n->nlmsg_type != RTM_GETQDISC) &&
1437             !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1438                 return -EPERM;
1439
1440         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1441                                      rtm_tca_policy, extack);
1442         if (err < 0)
1443                 return err;
1444
1445         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1446         if (!dev)
1447                 return -ENODEV;
1448
1449         clid = tcm->tcm_parent;
1450         if (clid) {
1451                 if (clid != TC_H_ROOT) {
1452                         if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1453                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1454                                 if (!p) {
1455                                         NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
1456                                         return -ENOENT;
1457                                 }
1458                                 q = qdisc_leaf(p, clid);
1459                         } else if (dev_ingress_queue(dev)) {
1460                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1461                         }
1462                 } else {
1463                         q = dev->qdisc;
1464                 }
1465                 if (!q) {
1466                         NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
1467                         return -ENOENT;
1468                 }
1469
1470                 if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
1471                         NL_SET_ERR_MSG(extack, "Invalid handle");
1472                         return -EINVAL;
1473                 }
1474         } else {
1475                 q = qdisc_lookup(dev, tcm->tcm_handle);
1476                 if (!q) {
1477                         NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
1478                         return -ENOENT;
1479                 }
1480         }
1481
1482         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1483                 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1484                 return -EINVAL;
1485         }
1486
1487         if (n->nlmsg_type == RTM_DELQDISC) {
1488                 if (!clid) {
1489                         NL_SET_ERR_MSG(extack, "Classid cannot be zero");
1490                         return -EINVAL;
1491                 }
1492                 if (q->handle == 0) {
1493                         NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
1494                         return -ENOENT;
1495                 }
1496                 err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
1497                 if (err != 0)
1498                         return err;
1499         } else {
1500                 qdisc_notify(net, skb, n, clid, NULL, q);
1501         }
1502         return 0;
1503 }
1504
1505 /*
1506  * Create/change qdisc.
1507  */
1508
1509 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1510                            struct netlink_ext_ack *extack)
1511 {
1512         struct net *net = sock_net(skb->sk);
1513         struct tcmsg *tcm;
1514         struct nlattr *tca[TCA_MAX + 1];
1515         struct net_device *dev;
1516         u32 clid;
1517         struct Qdisc *q, *p;
1518         int err;
1519
1520         if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1521                 return -EPERM;
1522
1523 replay:
1524         /* Reinit, just in case something touches this. */
1525         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1526                                      rtm_tca_policy, extack);
1527         if (err < 0)
1528                 return err;
1529
1530         tcm = nlmsg_data(n);
1531         clid = tcm->tcm_parent;
1532         q = p = NULL;
1533
1534         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1535         if (!dev)
1536                 return -ENODEV;
1537
1538
1539         if (clid) {
1540                 if (clid != TC_H_ROOT) {
1541                         if (clid != TC_H_INGRESS) {
1542                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1543                                 if (!p) {
1544                                         NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
1545                                         return -ENOENT;
1546                                 }
1547                                 q = qdisc_leaf(p, clid);
1548                         } else if (dev_ingress_queue_create(dev)) {
1549                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1550                         }
1551                 } else {
1552                         q = dev->qdisc;
1553                 }
1554
1555                 /* It may be default qdisc, ignore it */
1556                 if (q && q->handle == 0)
1557                         q = NULL;
1558
1559                 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1560                         if (tcm->tcm_handle) {
1561                                 if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
1562                                         NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
1563                                         return -EEXIST;
1564                                 }
1565                                 if (TC_H_MIN(tcm->tcm_handle)) {
1566                                         NL_SET_ERR_MSG(extack, "Invalid minor handle");
1567                                         return -EINVAL;
1568                                 }
1569                                 q = qdisc_lookup(dev, tcm->tcm_handle);
1570                                 if (!q)
1571                                         goto create_n_graft;
1572                                 if (n->nlmsg_flags & NLM_F_EXCL) {
1573                                         NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
1574                                         return -EEXIST;
1575                                 }
1576                                 if (tca[TCA_KIND] &&
1577                                     nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1578                                         NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1579                                         return -EINVAL;
1580                                 }
1581                                 if (q == p ||
1582                                     (p && check_loop(q, p, 0))) {
1583                                         NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
1584                                         return -ELOOP;
1585                                 }
1586                                 qdisc_refcount_inc(q);
1587                                 goto graft;
1588                         } else {
1589                                 if (!q)
1590                                         goto create_n_graft;
1591
1592                                 /* This magic test requires explanation.
1593                                  *
1594                                  *   We know, that some child q is already
1595                                  *   attached to this parent and have choice:
1596                                  *   either to change it or to create/graft new one.
1597                                  *
1598                                  *   1. We are allowed to create/graft only
1599                                  *   if CREATE and REPLACE flags are set.
1600                                  *
1601                                  *   2. If EXCL is set, requestor wanted to say,
1602                                  *   that qdisc tcm_handle is not expected
1603                                  *   to exist, so that we choose create/graft too.
1604                                  *
1605                                  *   3. The last case is when no flags are set.
1606                                  *   Alas, it is sort of hole in API, we
1607                                  *   cannot decide what to do unambiguously.
1608                                  *   For now we select create/graft, if
1609                                  *   user gave KIND, which does not match existing.
1610                                  */
1611                                 if ((n->nlmsg_flags & NLM_F_CREATE) &&
1612                                     (n->nlmsg_flags & NLM_F_REPLACE) &&
1613                                     ((n->nlmsg_flags & NLM_F_EXCL) ||
1614                                      (tca[TCA_KIND] &&
1615                                       nla_strcmp(tca[TCA_KIND], q->ops->id))))
1616                                         goto create_n_graft;
1617                         }
1618                 }
1619         } else {
1620                 if (!tcm->tcm_handle) {
1621                         NL_SET_ERR_MSG(extack, "Handle cannot be zero");
1622                         return -EINVAL;
1623                 }
1624                 q = qdisc_lookup(dev, tcm->tcm_handle);
1625         }
1626
1627         /* Change qdisc parameters */
1628         if (!q) {
1629                 NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1630                 return -ENOENT;
1631         }
1632         if (n->nlmsg_flags & NLM_F_EXCL) {
1633                 NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
1634                 return -EEXIST;
1635         }
1636         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1637                 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1638                 return -EINVAL;
1639         }
1640         err = qdisc_change(q, tca, extack);
1641         if (err == 0)
1642                 qdisc_notify(net, skb, n, clid, NULL, q);
1643         return err;
1644
1645 create_n_graft:
1646         if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1647                 NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
1648                 return -ENOENT;
1649         }
1650         if (clid == TC_H_INGRESS) {
1651                 if (dev_ingress_queue(dev)) {
1652                         q = qdisc_create(dev, dev_ingress_queue(dev), p,
1653                                          tcm->tcm_parent, tcm->tcm_parent,
1654                                          tca, &err, extack);
1655                 } else {
1656                         NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
1657                         err = -ENOENT;
1658                 }
1659         } else {
1660                 struct netdev_queue *dev_queue;
1661
1662                 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1663                         dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1664                 else if (p)
1665                         dev_queue = p->dev_queue;
1666                 else
1667                         dev_queue = netdev_get_tx_queue(dev, 0);
1668
1669                 q = qdisc_create(dev, dev_queue, p,
1670                                  tcm->tcm_parent, tcm->tcm_handle,
1671                                  tca, &err, extack);
1672         }
1673         if (q == NULL) {
1674                 if (err == -EAGAIN)
1675                         goto replay;
1676                 return err;
1677         }
1678
1679 graft:
1680         err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
1681         if (err) {
1682                 if (q)
1683                         qdisc_put(q);
1684                 return err;
1685         }
1686
1687         return 0;
1688 }
1689
1690 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1691                               struct netlink_callback *cb,
1692                               int *q_idx_p, int s_q_idx, bool recur,
1693                               bool dump_invisible)
1694 {
1695         int ret = 0, q_idx = *q_idx_p;
1696         struct Qdisc *q;
1697         int b;
1698
1699         if (!root)
1700                 return 0;
1701
1702         q = root;
1703         if (q_idx < s_q_idx) {
1704                 q_idx++;
1705         } else {
1706                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1707                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1708                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1709                                   RTM_NEWQDISC) <= 0)
1710                         goto done;
1711                 q_idx++;
1712         }
1713
1714         /* If dumping singletons, there is no qdisc_dev(root) and the singleton
1715          * itself has already been dumped.
1716          *
1717          * If we've already dumped the top-level (ingress) qdisc above and the global
1718          * qdisc hashtable, we don't want to hit it again
1719          */
1720         if (!qdisc_dev(root) || !recur)
1721                 goto out;
1722
1723         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1724                 if (q_idx < s_q_idx) {
1725                         q_idx++;
1726                         continue;
1727                 }
1728                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1729                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1730                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1731                                   RTM_NEWQDISC) <= 0)
1732                         goto done;
1733                 q_idx++;
1734         }
1735
1736 out:
1737         *q_idx_p = q_idx;
1738         return ret;
1739 done:
1740         ret = -1;
1741         goto out;
1742 }
1743
1744 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1745 {
1746         struct net *net = sock_net(skb->sk);
1747         int idx, q_idx;
1748         int s_idx, s_q_idx;
1749         struct net_device *dev;
1750         const struct nlmsghdr *nlh = cb->nlh;
1751         struct nlattr *tca[TCA_MAX + 1];
1752         int err;
1753
1754         s_idx = cb->args[0];
1755         s_q_idx = q_idx = cb->args[1];
1756
1757         idx = 0;
1758         ASSERT_RTNL();
1759
1760         err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
1761                                      rtm_tca_policy, cb->extack);
1762         if (err < 0)
1763                 return err;
1764
1765         for_each_netdev(net, dev) {
1766                 struct netdev_queue *dev_queue;
1767
1768                 if (idx < s_idx)
1769                         goto cont;
1770                 if (idx > s_idx)
1771                         s_q_idx = 0;
1772                 q_idx = 0;
1773
1774                 if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx,
1775                                        true, tca[TCA_DUMP_INVISIBLE]) < 0)
1776                         goto done;
1777
1778                 dev_queue = dev_ingress_queue(dev);
1779                 if (dev_queue &&
1780                     tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1781                                        &q_idx, s_q_idx, false,
1782                                        tca[TCA_DUMP_INVISIBLE]) < 0)
1783                         goto done;
1784
1785 cont:
1786                 idx++;
1787         }
1788
1789 done:
1790         cb->args[0] = idx;
1791         cb->args[1] = q_idx;
1792
1793         return skb->len;
1794 }
1795
1796
1797
1798 /************************************************
1799  *      Traffic classes manipulation.           *
1800  ************************************************/
1801
1802 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1803                           unsigned long cl,
1804                           u32 portid, u32 seq, u16 flags, int event)
1805 {
1806         struct tcmsg *tcm;
1807         struct nlmsghdr  *nlh;
1808         unsigned char *b = skb_tail_pointer(skb);
1809         struct gnet_dump d;
1810         const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1811
1812         cond_resched();
1813         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1814         if (!nlh)
1815                 goto out_nlmsg_trim;
1816         tcm = nlmsg_data(nlh);
1817         tcm->tcm_family = AF_UNSPEC;
1818         tcm->tcm__pad1 = 0;
1819         tcm->tcm__pad2 = 0;
1820         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1821         tcm->tcm_parent = q->handle;
1822         tcm->tcm_handle = q->handle;
1823         tcm->tcm_info = 0;
1824         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1825                 goto nla_put_failure;
1826         if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1827                 goto nla_put_failure;
1828
1829         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1830                                          NULL, &d, TCA_PAD) < 0)
1831                 goto nla_put_failure;
1832
1833         if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1834                 goto nla_put_failure;
1835
1836         if (gnet_stats_finish_copy(&d) < 0)
1837                 goto nla_put_failure;
1838
1839         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1840         return skb->len;
1841
1842 out_nlmsg_trim:
1843 nla_put_failure:
1844         nlmsg_trim(skb, b);
1845         return -1;
1846 }
1847
1848 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1849                          struct nlmsghdr *n, struct Qdisc *q,
1850                          unsigned long cl, int event)
1851 {
1852         struct sk_buff *skb;
1853         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1854
1855         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1856         if (!skb)
1857                 return -ENOBUFS;
1858
1859         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1860                 kfree_skb(skb);
1861                 return -EINVAL;
1862         }
1863
1864         return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1865                               n->nlmsg_flags & NLM_F_ECHO);
1866 }
1867
1868 static int tclass_del_notify(struct net *net,
1869                              const struct Qdisc_class_ops *cops,
1870                              struct sk_buff *oskb, struct nlmsghdr *n,
1871                              struct Qdisc *q, unsigned long cl,
1872                              struct netlink_ext_ack *extack)
1873 {
1874         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1875         struct sk_buff *skb;
1876         int err = 0;
1877
1878         if (!cops->delete)
1879                 return -EOPNOTSUPP;
1880
1881         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1882         if (!skb)
1883                 return -ENOBUFS;
1884
1885         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
1886                            RTM_DELTCLASS) < 0) {
1887                 kfree_skb(skb);
1888                 return -EINVAL;
1889         }
1890
1891         err = cops->delete(q, cl, extack);
1892         if (err) {
1893                 kfree_skb(skb);
1894                 return err;
1895         }
1896
1897         err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1898                              n->nlmsg_flags & NLM_F_ECHO);
1899         return err;
1900 }
1901
1902 #ifdef CONFIG_NET_CLS
1903
/*
 * Walker context handed to tcf_node_bind() for each filter node.
 * tcf_node_bind() casts the struct tcf_walker pointer it receives straight
 * to this type, so @w has to stay the first member.
 */
struct tcf_bind_args {
        struct tcf_walker w;    /* embedded walker; w.fn is the per-node callback */
        unsigned long base;     /* passed as the last argument of ->bind_class() */
        unsigned long cl;       /* class value passed to ->bind_class() */
        u32 classid;            /* classid passed to ->bind_class() */
};
1910
1911 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
1912 {
1913         struct tcf_bind_args *a = (void *)arg;
1914
1915         if (tp->ops->bind_class) {
1916                 struct Qdisc *q = tcf_block_q(tp->chain->block);
1917
1918                 sch_tree_lock(q);
1919                 tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
1920                 sch_tree_unlock(q);
1921         }
1922         return 0;
1923 }
1924
/*
 * Walker context for tc_bind_class_walker(). The callback casts its
 * struct qdisc_walker pointer to this type, so @w has to stay first.
 */
struct tc_bind_class_args {
        struct qdisc_walker w;  /* embedded walker; w.fn is the per-class callback */
        unsigned long new_cl;   /* class value to bind filters to (0 is used to unbind) */
        u32 portid;             /* stored by tc_bind_tclass(); not read by the walker */
        u32 clid;               /* classid whose filter bindings are updated */
};
1931
1932 static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl,
1933                                 struct qdisc_walker *w)
1934 {
1935         struct tc_bind_class_args *a = (struct tc_bind_class_args *)w;
1936         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1937         struct tcf_block *block;
1938         struct tcf_chain *chain;
1939
1940         block = cops->tcf_block(q, cl, NULL);
1941         if (!block)
1942                 return 0;
1943         for (chain = tcf_get_next_chain(block, NULL);
1944              chain;
1945              chain = tcf_get_next_chain(block, chain)) {
1946                 struct tcf_proto *tp;
1947
1948                 for (tp = tcf_get_next_proto(chain, NULL);
1949                      tp; tp = tcf_get_next_proto(chain, tp)) {
1950                         struct tcf_bind_args arg = {};
1951
1952                         arg.w.fn = tcf_node_bind;
1953                         arg.classid = a->clid;
1954                         arg.base = cl;
1955                         arg.cl = a->new_cl;
1956                         tp->ops->walk(tp, &arg.w, true);
1957                 }
1958         }
1959
1960         return 0;
1961 }
1962
1963 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1964                            unsigned long new_cl)
1965 {
1966         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1967         struct tc_bind_class_args args = {};
1968
1969         if (!cops->tcf_block)
1970                 return;
1971         args.portid = portid;
1972         args.clid = clid;
1973         args.new_cl = new_cl;
1974         args.w.fn = tc_bind_class_walker;
1975         q->ops->cl_ops->walk(q, &args.w);
1976 }
1977
1978 #else
1979
/* No-op variant used when CONFIG_NET_CLS is not set. */
static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
                           unsigned long new_cl)
{
}
1984
1985 #endif
1986
1987 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
1988                          struct netlink_ext_ack *extack)
1989 {
1990         struct net *net = sock_net(skb->sk);
1991         struct tcmsg *tcm = nlmsg_data(n);
1992         struct nlattr *tca[TCA_MAX + 1];
1993         struct net_device *dev;
1994         struct Qdisc *q = NULL;
1995         const struct Qdisc_class_ops *cops;
1996         unsigned long cl = 0;
1997         unsigned long new_cl;
1998         u32 portid;
1999         u32 clid;
2000         u32 qid;
2001         int err;
2002
2003         if ((n->nlmsg_type != RTM_GETTCLASS) &&
2004             !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
2005                 return -EPERM;
2006
2007         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
2008                                      rtm_tca_policy, extack);
2009         if (err < 0)
2010                 return err;
2011
2012         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
2013         if (!dev)
2014                 return -ENODEV;
2015
2016         /*
2017            parent == TC_H_UNSPEC - unspecified parent.
2018            parent == TC_H_ROOT   - class is root, which has no parent.
2019            parent == X:0         - parent is root class.
2020            parent == X:Y         - parent is a node in hierarchy.
2021            parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
2022
2023            handle == 0:0         - generate handle from kernel pool.
2024            handle == 0:Y         - class is X:Y, where X:0 is qdisc.
2025            handle == X:Y         - clear.
2026            handle == X:0         - root class.
2027          */
2028
2029         /* Step 1. Determine qdisc handle X:0 */
2030
2031         portid = tcm->tcm_parent;
2032         clid = tcm->tcm_handle;
2033         qid = TC_H_MAJ(clid);
2034
2035         if (portid != TC_H_ROOT) {
2036                 u32 qid1 = TC_H_MAJ(portid);
2037
2038                 if (qid && qid1) {
2039                         /* If both majors are known, they must be identical. */
2040                         if (qid != qid1)
2041                                 return -EINVAL;
2042                 } else if (qid1) {
2043                         qid = qid1;
2044                 } else if (qid == 0)
2045                         qid = dev->qdisc->handle;
2046
2047                 /* Now qid is genuine qdisc handle consistent
2048                  * both with parent and child.
2049                  *
2050                  * TC_H_MAJ(portid) still may be unspecified, complete it now.
2051                  */
2052                 if (portid)
2053                         portid = TC_H_MAKE(qid, portid);
2054         } else {
2055                 if (qid == 0)
2056                         qid = dev->qdisc->handle;
2057         }
2058
2059         /* OK. Locate qdisc */
2060         q = qdisc_lookup(dev, qid);
2061         if (!q)
2062                 return -ENOENT;
2063
2064         /* An check that it supports classes */
2065         cops = q->ops->cl_ops;
2066         if (cops == NULL)
2067                 return -EINVAL;
2068
2069         /* Now try to get class */
2070         if (clid == 0) {
2071                 if (portid == TC_H_ROOT)
2072                         clid = qid;
2073         } else
2074                 clid = TC_H_MAKE(qid, clid);
2075
2076         if (clid)
2077                 cl = cops->find(q, clid);
2078
2079         if (cl == 0) {
2080                 err = -ENOENT;
2081                 if (n->nlmsg_type != RTM_NEWTCLASS ||
2082                     !(n->nlmsg_flags & NLM_F_CREATE))
2083                         goto out;
2084         } else {
2085                 switch (n->nlmsg_type) {
2086                 case RTM_NEWTCLASS:
2087                         err = -EEXIST;
2088                         if (n->nlmsg_flags & NLM_F_EXCL)
2089                                 goto out;
2090                         break;
2091                 case RTM_DELTCLASS:
2092                         err = tclass_del_notify(net, cops, skb, n, q, cl, extack);
2093                         /* Unbind the class with flilters with 0 */
2094                         tc_bind_tclass(q, portid, clid, 0);
2095                         goto out;
2096                 case RTM_GETTCLASS:
2097                         err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
2098                         goto out;
2099                 default:
2100                         err = -EINVAL;
2101                         goto out;
2102                 }
2103         }
2104
2105         if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
2106                 NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
2107                 return -EOPNOTSUPP;
2108         }
2109
2110         new_cl = cl;
2111         err = -EOPNOTSUPP;
2112         if (cops->change)
2113                 err = cops->change(q, clid, portid, tca, &new_cl, extack);
2114         if (err == 0) {
2115                 tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
2116                 /* We just create a new class, need to do reverse binding. */
2117                 if (cl != new_cl)
2118                         tc_bind_tclass(q, portid, clid, new_cl);
2119         }
2120 out:
2121         return err;
2122 }
2123
/*
 * Walker context for qdisc_class_dump(). The callback casts its
 * struct qdisc_walker pointer to this type, so @w has to stay first.
 */
struct qdisc_dump_args {
        struct qdisc_walker     w;      /* embedded walker (fn/stop/skip/count) */
        struct sk_buff          *skb;   /* skb the class messages are appended to */
        struct netlink_callback *cb;    /* dump callback; supplies portid and sequence */
};
2129
2130 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
2131                             struct qdisc_walker *arg)
2132 {
2133         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
2134
2135         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2136                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
2137                               RTM_NEWTCLASS);
2138 }
2139
2140 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
2141                                 struct tcmsg *tcm, struct netlink_callback *cb,
2142                                 int *t_p, int s_t)
2143 {
2144         struct qdisc_dump_args arg;
2145
2146         if (tc_qdisc_dump_ignore(q, false) ||
2147             *t_p < s_t || !q->ops->cl_ops ||
2148             (tcm->tcm_parent &&
2149              TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
2150                 (*t_p)++;
2151                 return 0;
2152         }
2153         if (*t_p > s_t)
2154                 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
2155         arg.w.fn = qdisc_class_dump;
2156         arg.skb = skb;
2157         arg.cb = cb;
2158         arg.w.stop  = 0;
2159         arg.w.skip = cb->args[1];
2160         arg.w.count = 0;
2161         q->ops->cl_ops->walk(q, &arg.w);
2162         cb->args[1] = arg.w.count;
2163         if (arg.w.stop)
2164                 return -1;
2165         (*t_p)++;
2166         return 0;
2167 }
2168
2169 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
2170                                struct tcmsg *tcm, struct netlink_callback *cb,
2171                                int *t_p, int s_t, bool recur)
2172 {
2173         struct Qdisc *q;
2174         int b;
2175
2176         if (!root)
2177                 return 0;
2178
2179         if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
2180                 return -1;
2181
2182         if (!qdisc_dev(root) || !recur)
2183                 return 0;
2184
2185         if (tcm->tcm_parent) {
2186                 q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
2187                 if (q && q != root &&
2188                     tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2189                         return -1;
2190                 return 0;
2191         }
2192         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
2193                 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2194                         return -1;
2195         }
2196
2197         return 0;
2198 }
2199
2200 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
2201 {
2202         struct tcmsg *tcm = nlmsg_data(cb->nlh);
2203         struct net *net = sock_net(skb->sk);
2204         struct netdev_queue *dev_queue;
2205         struct net_device *dev;
2206         int t, s_t;
2207
2208         if (nlmsg_len(cb->nlh) < sizeof(*tcm))
2209                 return 0;
2210         dev = dev_get_by_index(net, tcm->tcm_ifindex);
2211         if (!dev)
2212                 return 0;
2213
2214         s_t = cb->args[0];
2215         t = 0;
2216
2217         if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t, true) < 0)
2218                 goto done;
2219
2220         dev_queue = dev_ingress_queue(dev);
2221         if (dev_queue &&
2222             tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
2223                                 &t, s_t, false) < 0)
2224                 goto done;
2225
2226 done:
2227         cb->args[0] = t;
2228
2229         dev_put(dev);
2230         return skb->len;
2231 }
2232
2233 #ifdef CONFIG_PROC_FS
2234 static int psched_show(struct seq_file *seq, void *v)
2235 {
2236         seq_printf(seq, "%08x %08x %08x %08x\n",
2237                    (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2238                    1000000,
2239                    (u32)NSEC_PER_SEC / hrtimer_resolution);
2240
2241         return 0;
2242 }
2243
2244 static int __net_init psched_net_init(struct net *net)
2245 {
2246         struct proc_dir_entry *e;
2247
2248         e = proc_create_single("psched", 0, net->proc_net, psched_show);
2249         if (e == NULL)
2250                 return -ENOMEM;
2251
2252         return 0;
2253 }
2254
/* Per-netns exit: remove the "psched" entry created by psched_net_init(). */
static void __net_exit psched_net_exit(struct net *net)
{
        remove_proc_entry("psched", net->proc_net);
}
2259 #else
/* Without CONFIG_PROC_FS there is no proc entry to create. */
static int __net_init psched_net_init(struct net *net)
{
        return 0;
}
2264
/* Without CONFIG_PROC_FS there is no proc entry to remove. */
static void __net_exit psched_net_exit(struct net *net)
{
}
2268 #endif
2269
/* Per-network-namespace hooks; registered from pktsched_init(). */
static struct pernet_operations psched_net_ops = {
        .init = psched_net_init,
        .exit = psched_net_exit,
};
2274
2275 static int __init pktsched_init(void)
2276 {
2277         int err;
2278
2279         err = register_pernet_subsys(&psched_net_ops);
2280         if (err) {
2281                 pr_err("pktsched_init: "
2282                        "cannot initialize per netns operations\n");
2283                 return err;
2284         }
2285
2286         register_qdisc(&pfifo_fast_ops);
2287         register_qdisc(&pfifo_qdisc_ops);
2288         register_qdisc(&bfifo_qdisc_ops);
2289         register_qdisc(&pfifo_head_drop_qdisc_ops);
2290         register_qdisc(&mq_qdisc_ops);
2291         register_qdisc(&noqueue_qdisc_ops);
2292
2293         rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
2294         rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
2295         rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
2296                       0);
2297         rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
2298         rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
2299         rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
2300                       0);
2301
2302         return 0;
2303 }
2304
2305 subsys_initcall(pktsched_init);