Merge tag 'nolibc-urgent.2022.10.28a' of git://git.kernel.org/pub/scm/linux/kernel...
[platform/kernel/linux-starfive.git] / net / sched / sch_api.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_api.c  Packet scheduler API.
4  *
5  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
6  *
7  * Fixes:
8  *
9  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
10  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
11  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
12  */
13
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/string.h>
18 #include <linux/errno.h>
19 #include <linux/skbuff.h>
20 #include <linux/init.h>
21 #include <linux/proc_fs.h>
22 #include <linux/seq_file.h>
23 #include <linux/kmod.h>
24 #include <linux/list.h>
25 #include <linux/hrtimer.h>
26 #include <linux/slab.h>
27 #include <linux/hashtable.h>
28
29 #include <net/net_namespace.h>
30 #include <net/sock.h>
31 #include <net/netlink.h>
32 #include <net/pkt_sched.h>
33 #include <net/pkt_cls.h>
34
35 #include <trace/events/qdisc.h>
36
37 /*
38
39    Short review.
40    -------------
41
42    This file consists of two interrelated parts:
43
44    1. queueing disciplines manager frontend.
45    2. traffic classes manager frontend.
46
47    Generally, queueing discipline ("qdisc") is a black box,
48    which is able to enqueue packets and to dequeue them (when
49    device is ready to send something) in order and at times
50    determined by algorithm hidden in it.
51
52    qdiscs are divided into two categories:
53    - "queues", which have no internal structure visible from outside.
54    - "schedulers", which split all the packets to "traffic classes",
55      using "packet classifiers" (look at cls_api.c)
56
57    In turn, classes may have child qdiscs (as rule, queues)
58    attached to them etc. etc. etc.
59
60    The goal of the routines in this file is to translate
61    information supplied by the user in the form of handles
62    into a form more intelligible to the kernel, to perform sanity
63    checks and the part of the work that is common to all qdiscs,
64    and to provide rtnetlink notifications.
65
66    All real intelligent work is done inside qdisc modules.
67
68
69
70    Every discipline has two major routines: enqueue and dequeue.
71
72    ---dequeue
73
74    dequeue usually returns an skb to send. It is allowed to return NULL,
75    but that does not mean that the queue is empty; it just means that
76    the discipline does not want to send anything this time.
77    The queue is really empty only if q->q.qlen == 0.
78    For complicated disciplines with multiple queues, q->q is not
79    a real packet queue; however, q->q.qlen must still be valid.
80
81    ---enqueue
82
83    enqueue returns 0 if the packet was enqueued successfully.
84    If a packet (this one or another one) was dropped, it returns
85    a non-zero error code.
86    NET_XMIT_DROP        - this packet was dropped
87      Expected action: do not back off, but wait until the queue clears.
88    NET_XMIT_CN          - this packet was probably enqueued, but another one was dropped.
89      Expected action: back off or ignore
90
91    Auxiliary routines:
92
93    ---peek
94
95    like dequeue but without removing a packet from the queue
96
97    ---reset
98
99    returns qdisc to initial state: purge all buffers, clear all
100    timers, counters (except for statistics) etc.
101
102    ---init
103
104    initializes newly created qdisc.
105
106    ---destroy
107
108    destroys resources allocated by init and during lifetime of qdisc.
109
110    ---change
111
112    changes qdisc parameters.
113  */
114
115 /* Protects list of registered TC modules. It is pure SMP lock. */
116 static DEFINE_RWLOCK(qdisc_mod_lock);
117
118
119 /************************************************
120  *      Queueing disciplines manipulation.      *
121  ************************************************/
122
123
124 /* The list of all installed queueing disciplines. */
125
126 static struct Qdisc_ops *qdisc_base;
127
128 /* Register/unregister queueing discipline */
129
130 int register_qdisc(struct Qdisc_ops *qops)
131 {
132         struct Qdisc_ops *q, **qp;
133         int rc = -EEXIST;
134
135         write_lock(&qdisc_mod_lock);
136         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
137                 if (!strcmp(qops->id, q->id))
138                         goto out;
139
140         if (qops->enqueue == NULL)
141                 qops->enqueue = noop_qdisc_ops.enqueue;
142         if (qops->peek == NULL) {
143                 if (qops->dequeue == NULL)
144                         qops->peek = noop_qdisc_ops.peek;
145                 else
146                         goto out_einval;
147         }
148         if (qops->dequeue == NULL)
149                 qops->dequeue = noop_qdisc_ops.dequeue;
150
151         if (qops->cl_ops) {
152                 const struct Qdisc_class_ops *cops = qops->cl_ops;
153
154                 if (!(cops->find && cops->walk && cops->leaf))
155                         goto out_einval;
156
157                 if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
158                         goto out_einval;
159         }
160
161         qops->next = NULL;
162         *qp = qops;
163         rc = 0;
164 out:
165         write_unlock(&qdisc_mod_lock);
166         return rc;
167
168 out_einval:
169         rc = -EINVAL;
170         goto out;
171 }
172 EXPORT_SYMBOL(register_qdisc);
173
174 void unregister_qdisc(struct Qdisc_ops *qops)
175 {
176         struct Qdisc_ops *q, **qp;
177         int err = -ENOENT;
178
179         write_lock(&qdisc_mod_lock);
180         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
181                 if (q == qops)
182                         break;
183         if (q) {
184                 *qp = q->next;
185                 q->next = NULL;
186                 err = 0;
187         }
188         write_unlock(&qdisc_mod_lock);
189
190         WARN(err, "unregister qdisc(%s) failed\n", qops->id);
191 }
192 EXPORT_SYMBOL(unregister_qdisc);
193
194 /* Get default qdisc if not otherwise specified */
195 void qdisc_get_default(char *name, size_t len)
196 {
197         read_lock(&qdisc_mod_lock);
198         strscpy(name, default_qdisc_ops->id, len);
199         read_unlock(&qdisc_mod_lock);
200 }
201
202 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
203 {
204         struct Qdisc_ops *q = NULL;
205
206         for (q = qdisc_base; q; q = q->next) {
207                 if (!strcmp(name, q->id)) {
208                         if (!try_module_get(q->owner))
209                                 q = NULL;
210                         break;
211                 }
212         }
213
214         return q;
215 }
216
217 /* Set new default qdisc to use */
218 int qdisc_set_default(const char *name)
219 {
220         const struct Qdisc_ops *ops;
221
222         if (!capable(CAP_NET_ADMIN))
223                 return -EPERM;
224
225         write_lock(&qdisc_mod_lock);
226         ops = qdisc_lookup_default(name);
227         if (!ops) {
228                 /* Not found, drop lock and try to load module */
229                 write_unlock(&qdisc_mod_lock);
230                 request_module("sch_%s", name);
231                 write_lock(&qdisc_mod_lock);
232
233                 ops = qdisc_lookup_default(name);
234         }
235
236         if (ops) {
237                 /* Set new default */
238                 module_put(default_qdisc_ops->owner);
239                 default_qdisc_ops = ops;
240         }
241         write_unlock(&qdisc_mod_lock);
242
243         return ops ? 0 : -ENOENT;
244 }
245
#ifdef CONFIG_NET_SCH_DEFAULT
/*
 * Late initcall that installs the discipline named by
 * CONFIG_DEFAULT_NET_SCH as the default; returns qdisc_set_default()'s
 * result.
 */
static int __init sch_default_qdisc(void)
{
	return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
}
late_initcall(sch_default_qdisc);
#endif
254
255 /* We know handle. Find qdisc among all qdisc's attached to device
256  * (root qdisc, all its children, children of children etc.)
257  * Note: caller either uses rtnl or rcu_read_lock()
258  */
259
260 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
261 {
262         struct Qdisc *q;
263
264         if (!qdisc_dev(root))
265                 return (root->handle == handle ? root : NULL);
266
267         if (!(root->flags & TCQ_F_BUILTIN) &&
268             root->handle == handle)
269                 return root;
270
271         hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle,
272                                    lockdep_rtnl_is_held()) {
273                 if (q->handle == handle)
274                         return q;
275         }
276         return NULL;
277 }
278
279 void qdisc_hash_add(struct Qdisc *q, bool invisible)
280 {
281         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
282                 ASSERT_RTNL();
283                 hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
284                 if (invisible)
285                         q->flags |= TCQ_F_INVISIBLE;
286         }
287 }
288 EXPORT_SYMBOL(qdisc_hash_add);
289
290 void qdisc_hash_del(struct Qdisc *q)
291 {
292         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
293                 ASSERT_RTNL();
294                 hash_del_rcu(&q->hash);
295         }
296 }
297 EXPORT_SYMBOL(qdisc_hash_del);
298
299 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
300 {
301         struct Qdisc *q;
302
303         if (!handle)
304                 return NULL;
305         q = qdisc_match_from_root(rtnl_dereference(dev->qdisc), handle);
306         if (q)
307                 goto out;
308
309         if (dev_ingress_queue(dev))
310                 q = qdisc_match_from_root(
311                         dev_ingress_queue(dev)->qdisc_sleeping,
312                         handle);
313 out:
314         return q;
315 }
316
317 struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
318 {
319         struct netdev_queue *nq;
320         struct Qdisc *q;
321
322         if (!handle)
323                 return NULL;
324         q = qdisc_match_from_root(rcu_dereference(dev->qdisc), handle);
325         if (q)
326                 goto out;
327
328         nq = dev_ingress_queue_rcu(dev);
329         if (nq)
330                 q = qdisc_match_from_root(nq->qdisc_sleeping, handle);
331 out:
332         return q;
333 }
334
335 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
336 {
337         unsigned long cl;
338         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
339
340         if (cops == NULL)
341                 return NULL;
342         cl = cops->find(p, classid);
343
344         if (cl == 0)
345                 return NULL;
346         return cops->leaf(p, cl);
347 }
348
349 /* Find queueing discipline by name */
350
351 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
352 {
353         struct Qdisc_ops *q = NULL;
354
355         if (kind) {
356                 read_lock(&qdisc_mod_lock);
357                 for (q = qdisc_base; q; q = q->next) {
358                         if (nla_strcmp(kind, q->id) == 0) {
359                                 if (!try_module_get(q->owner))
360                                         q = NULL;
361                                 break;
362                         }
363                 }
364                 read_unlock(&qdisc_mod_lock);
365         }
366         return q;
367 }
368
369 /* The linklayer setting were not transferred from iproute2, in older
370  * versions, and the rate tables lookup systems have been dropped in
371  * the kernel. To keep backward compatible with older iproute2 tc
372  * utils, we detect the linklayer setting by detecting if the rate
373  * table were modified.
374  *
375  * For linklayer ATM table entries, the rate table will be aligned to
376  * 48 bytes, thus some table entries will contain the same value.  The
377  * mpu (min packet unit) is also encoded into the old rate table, thus
378  * starting from the mpu, we find low and high table entries for
379  * mapping this cell.  If these entries contain the same value, when
380  * the rate tables have been modified for linklayer ATM.
381  *
382  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
383  * and then roundup to the next cell, calc the table entry one below,
384  * and compare.
385  */
386 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
387 {
388         int low       = roundup(r->mpu, 48);
389         int high      = roundup(low+1, 48);
390         int cell_low  = low >> r->cell_log;
391         int cell_high = (high >> r->cell_log) - 1;
392
393         /* rtab is too inaccurate at rates > 100Mbit/s */
394         if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
395                 pr_debug("TC linklayer: Giving up ATM detection\n");
396                 return TC_LINKLAYER_ETHERNET;
397         }
398
399         if ((cell_high > cell_low) && (cell_high < 256)
400             && (rtab[cell_low] == rtab[cell_high])) {
401                 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
402                          cell_low, cell_high, rtab[cell_high]);
403                 return TC_LINKLAYER_ATM;
404         }
405         return TC_LINKLAYER_ETHERNET;
406 }
407
408 static struct qdisc_rate_table *qdisc_rtab_list;
409
410 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
411                                         struct nlattr *tab,
412                                         struct netlink_ext_ack *extack)
413 {
414         struct qdisc_rate_table *rtab;
415
416         if (tab == NULL || r->rate == 0 ||
417             r->cell_log == 0 || r->cell_log >= 32 ||
418             nla_len(tab) != TC_RTAB_SIZE) {
419                 NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
420                 return NULL;
421         }
422
423         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
424                 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
425                     !memcmp(&rtab->data, nla_data(tab), 1024)) {
426                         rtab->refcnt++;
427                         return rtab;
428                 }
429         }
430
431         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
432         if (rtab) {
433                 rtab->rate = *r;
434                 rtab->refcnt = 1;
435                 memcpy(rtab->data, nla_data(tab), 1024);
436                 if (r->linklayer == TC_LINKLAYER_UNAWARE)
437                         r->linklayer = __detect_linklayer(r, rtab->data);
438                 rtab->next = qdisc_rtab_list;
439                 qdisc_rtab_list = rtab;
440         } else {
441                 NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
442         }
443         return rtab;
444 }
445 EXPORT_SYMBOL(qdisc_get_rtab);
446
447 void qdisc_put_rtab(struct qdisc_rate_table *tab)
448 {
449         struct qdisc_rate_table *rtab, **rtabp;
450
451         if (!tab || --tab->refcnt)
452                 return;
453
454         for (rtabp = &qdisc_rtab_list;
455              (rtab = *rtabp) != NULL;
456              rtabp = &rtab->next) {
457                 if (rtab == tab) {
458                         *rtabp = rtab->next;
459                         kfree(rtab);
460                         return;
461                 }
462         }
463 }
464 EXPORT_SYMBOL(qdisc_put_rtab);
465
466 static LIST_HEAD(qdisc_stab_list);
467
468 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
469         [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
470         [TCA_STAB_DATA] = { .type = NLA_BINARY },
471 };
472
473 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
474                                                struct netlink_ext_ack *extack)
475 {
476         struct nlattr *tb[TCA_STAB_MAX + 1];
477         struct qdisc_size_table *stab;
478         struct tc_sizespec *s;
479         unsigned int tsize = 0;
480         u16 *tab = NULL;
481         int err;
482
483         err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
484                                           extack);
485         if (err < 0)
486                 return ERR_PTR(err);
487         if (!tb[TCA_STAB_BASE]) {
488                 NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
489                 return ERR_PTR(-EINVAL);
490         }
491
492         s = nla_data(tb[TCA_STAB_BASE]);
493
494         if (s->tsize > 0) {
495                 if (!tb[TCA_STAB_DATA]) {
496                         NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
497                         return ERR_PTR(-EINVAL);
498                 }
499                 tab = nla_data(tb[TCA_STAB_DATA]);
500                 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
501         }
502
503         if (tsize != s->tsize || (!tab && tsize > 0)) {
504                 NL_SET_ERR_MSG(extack, "Invalid size of size table");
505                 return ERR_PTR(-EINVAL);
506         }
507
508         list_for_each_entry(stab, &qdisc_stab_list, list) {
509                 if (memcmp(&stab->szopts, s, sizeof(*s)))
510                         continue;
511                 if (tsize > 0 &&
512                     memcmp(stab->data, tab, flex_array_size(stab, data, tsize)))
513                         continue;
514                 stab->refcnt++;
515                 return stab;
516         }
517
518         if (s->size_log > STAB_SIZE_LOG_MAX ||
519             s->cell_log > STAB_SIZE_LOG_MAX) {
520                 NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table");
521                 return ERR_PTR(-EINVAL);
522         }
523
524         stab = kmalloc(struct_size(stab, data, tsize), GFP_KERNEL);
525         if (!stab)
526                 return ERR_PTR(-ENOMEM);
527
528         stab->refcnt = 1;
529         stab->szopts = *s;
530         if (tsize > 0)
531                 memcpy(stab->data, tab, flex_array_size(stab, data, tsize));
532
533         list_add_tail(&stab->list, &qdisc_stab_list);
534
535         return stab;
536 }
537
538 void qdisc_put_stab(struct qdisc_size_table *tab)
539 {
540         if (!tab)
541                 return;
542
543         if (--tab->refcnt == 0) {
544                 list_del(&tab->list);
545                 kfree_rcu(tab, rcu);
546         }
547 }
548 EXPORT_SYMBOL(qdisc_put_stab);
549
550 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
551 {
552         struct nlattr *nest;
553
554         nest = nla_nest_start_noflag(skb, TCA_STAB);
555         if (nest == NULL)
556                 goto nla_put_failure;
557         if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
558                 goto nla_put_failure;
559         nla_nest_end(skb, nest);
560
561         return skb->len;
562
563 nla_put_failure:
564         return -1;
565 }
566
567 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
568                                const struct qdisc_size_table *stab)
569 {
570         int pkt_len, slot;
571
572         pkt_len = skb->len + stab->szopts.overhead;
573         if (unlikely(!stab->szopts.tsize))
574                 goto out;
575
576         slot = pkt_len + stab->szopts.cell_align;
577         if (unlikely(slot < 0))
578                 slot = 0;
579
580         slot >>= stab->szopts.cell_log;
581         if (likely(slot < stab->szopts.tsize))
582                 pkt_len = stab->data[slot];
583         else
584                 pkt_len = stab->data[stab->szopts.tsize - 1] *
585                                 (slot / stab->szopts.tsize) +
586                                 stab->data[slot % stab->szopts.tsize];
587
588         pkt_len <<= stab->szopts.size_log;
589 out:
590         if (unlikely(pkt_len < 1))
591                 pkt_len = 1;
592         qdisc_skb_cb(skb)->pkt_len = pkt_len;
593 }
594 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
595
596 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
597 {
598         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
599                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
600                         txt, qdisc->ops->id, qdisc->handle >> 16);
601                 qdisc->flags |= TCQ_F_WARN_NONWC;
602         }
603 }
604 EXPORT_SYMBOL(qdisc_warn_nonwc);
605
606 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
607 {
608         struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
609                                                  timer);
610
611         rcu_read_lock();
612         __netif_schedule(qdisc_root(wd->qdisc));
613         rcu_read_unlock();
614
615         return HRTIMER_NORESTART;
616 }
617
618 void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
619                                  clockid_t clockid)
620 {
621         hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
622         wd->timer.function = qdisc_watchdog;
623         wd->qdisc = qdisc;
624 }
625 EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
626
627 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
628 {
629         qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
630 }
631 EXPORT_SYMBOL(qdisc_watchdog_init);
632
633 void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires,
634                                       u64 delta_ns)
635 {
636         if (test_bit(__QDISC_STATE_DEACTIVATED,
637                      &qdisc_root_sleeping(wd->qdisc)->state))
638                 return;
639
640         if (hrtimer_is_queued(&wd->timer)) {
641                 /* If timer is already set in [expires, expires + delta_ns],
642                  * do not reprogram it.
643                  */
644                 if (wd->last_expires - expires <= delta_ns)
645                         return;
646         }
647
648         wd->last_expires = expires;
649         hrtimer_start_range_ns(&wd->timer,
650                                ns_to_ktime(expires),
651                                delta_ns,
652                                HRTIMER_MODE_ABS_PINNED);
653 }
654 EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns);
655
656 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
657 {
658         hrtimer_cancel(&wd->timer);
659 }
660 EXPORT_SYMBOL(qdisc_watchdog_cancel);
661
662 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
663 {
664         struct hlist_head *h;
665         unsigned int i;
666
667         h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
668
669         if (h != NULL) {
670                 for (i = 0; i < n; i++)
671                         INIT_HLIST_HEAD(&h[i]);
672         }
673         return h;
674 }
675
676 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
677 {
678         struct Qdisc_class_common *cl;
679         struct hlist_node *next;
680         struct hlist_head *nhash, *ohash;
681         unsigned int nsize, nmask, osize;
682         unsigned int i, h;
683
684         /* Rehash when load factor exceeds 0.75 */
685         if (clhash->hashelems * 4 <= clhash->hashsize * 3)
686                 return;
687         nsize = clhash->hashsize * 2;
688         nmask = nsize - 1;
689         nhash = qdisc_class_hash_alloc(nsize);
690         if (nhash == NULL)
691                 return;
692
693         ohash = clhash->hash;
694         osize = clhash->hashsize;
695
696         sch_tree_lock(sch);
697         for (i = 0; i < osize; i++) {
698                 hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
699                         h = qdisc_class_hash(cl->classid, nmask);
700                         hlist_add_head(&cl->hnode, &nhash[h]);
701                 }
702         }
703         clhash->hash     = nhash;
704         clhash->hashsize = nsize;
705         clhash->hashmask = nmask;
706         sch_tree_unlock(sch);
707
708         kvfree(ohash);
709 }
710 EXPORT_SYMBOL(qdisc_class_hash_grow);
711
712 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
713 {
714         unsigned int size = 4;
715
716         clhash->hash = qdisc_class_hash_alloc(size);
717         if (!clhash->hash)
718                 return -ENOMEM;
719         clhash->hashsize  = size;
720         clhash->hashmask  = size - 1;
721         clhash->hashelems = 0;
722         return 0;
723 }
724 EXPORT_SYMBOL(qdisc_class_hash_init);
725
726 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
727 {
728         kvfree(clhash->hash);
729 }
730 EXPORT_SYMBOL(qdisc_class_hash_destroy);
731
732 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
733                              struct Qdisc_class_common *cl)
734 {
735         unsigned int h;
736
737         INIT_HLIST_NODE(&cl->hnode);
738         h = qdisc_class_hash(cl->classid, clhash->hashmask);
739         hlist_add_head(&cl->hnode, &clhash->hash[h]);
740         clhash->hashelems++;
741 }
742 EXPORT_SYMBOL(qdisc_class_hash_insert);
743
744 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
745                              struct Qdisc_class_common *cl)
746 {
747         hlist_del(&cl->hnode);
748         clhash->hashelems--;
749 }
750 EXPORT_SYMBOL(qdisc_class_hash_remove);
751
752 /* Allocate an unique handle from space managed by kernel
753  * Possible range is [8000-FFFF]:0000 (0x8000 values)
754  */
755 static u32 qdisc_alloc_handle(struct net_device *dev)
756 {
757         int i = 0x8000;
758         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
759
760         do {
761                 autohandle += TC_H_MAKE(0x10000U, 0);
762                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
763                         autohandle = TC_H_MAKE(0x80000000U, 0);
764                 if (!qdisc_lookup(dev, autohandle))
765                         return autohandle;
766                 cond_resched();
767         } while (--i > 0);
768
769         return 0;
770 }
771
772 void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
773 {
774         bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
775         const struct Qdisc_class_ops *cops;
776         unsigned long cl;
777         u32 parentid;
778         bool notify;
779         int drops;
780
781         if (n == 0 && len == 0)
782                 return;
783         drops = max_t(int, n, 0);
784         rcu_read_lock();
785         while ((parentid = sch->parent)) {
786                 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
787                         break;
788
789                 if (sch->flags & TCQ_F_NOPARENT)
790                         break;
791                 /* Notify parent qdisc only if child qdisc becomes empty.
792                  *
793                  * If child was empty even before update then backlog
794                  * counter is screwed and we skip notification because
795                  * parent class is already passive.
796                  *
797                  * If the original child was offloaded then it is allowed
798                  * to be seem as empty, so the parent is notified anyway.
799                  */
800                 notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
801                                                        !qdisc_is_offloaded);
802                 /* TODO: perform the search on a per txq basis */
803                 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
804                 if (sch == NULL) {
805                         WARN_ON_ONCE(parentid != TC_H_ROOT);
806                         break;
807                 }
808                 cops = sch->ops->cl_ops;
809                 if (notify && cops->qlen_notify) {
810                         cl = cops->find(sch, parentid);
811                         cops->qlen_notify(sch, cl);
812                 }
813                 sch->q.qlen -= n;
814                 sch->qstats.backlog -= len;
815                 __qdisc_qstats_drop(sch, drops);
816         }
817         rcu_read_unlock();
818 }
819 EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
820
821 int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
822                               void *type_data)
823 {
824         struct net_device *dev = qdisc_dev(sch);
825         int err;
826
827         sch->flags &= ~TCQ_F_OFFLOADED;
828         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
829                 return 0;
830
831         err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
832         if (err == -EOPNOTSUPP)
833                 return 0;
834
835         if (!err)
836                 sch->flags |= TCQ_F_OFFLOADED;
837
838         return err;
839 }
840 EXPORT_SYMBOL(qdisc_offload_dump_helper);
841
842 void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
843                                 struct Qdisc *new, struct Qdisc *old,
844                                 enum tc_setup_type type, void *type_data,
845                                 struct netlink_ext_ack *extack)
846 {
847         bool any_qdisc_is_offloaded;
848         int err;
849
850         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
851                 return;
852
853         err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
854
855         /* Don't report error if the graft is part of destroy operation. */
856         if (!err || !new || new == &noop_qdisc)
857                 return;
858
859         /* Don't report error if the parent, the old child and the new
860          * one are not offloaded.
861          */
862         any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
863         any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
864         any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;
865
866         if (any_qdisc_is_offloaded)
867                 NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
868 }
869 EXPORT_SYMBOL(qdisc_offload_graft_helper);
870
871 void qdisc_offload_query_caps(struct net_device *dev,
872                               enum tc_setup_type type,
873                               void *caps, size_t caps_len)
874 {
875         const struct net_device_ops *ops = dev->netdev_ops;
876         struct tc_query_caps_base base = {
877                 .type = type,
878                 .caps = caps,
879         };
880
881         memset(caps, 0, caps_len);
882
883         if (ops->ndo_setup_tc)
884                 ops->ndo_setup_tc(dev, TC_QUERY_CAPS, &base);
885 }
886 EXPORT_SYMBOL(qdisc_offload_query_caps);
887
888 static void qdisc_offload_graft_root(struct net_device *dev,
889                                      struct Qdisc *new, struct Qdisc *old,
890                                      struct netlink_ext_ack *extack)
891 {
892         struct tc_root_qopt_offload graft_offload = {
893                 .command        = TC_ROOT_GRAFT,
894                 .handle         = new ? new->handle : 0,
895                 .ingress        = (new && new->flags & TCQ_F_INGRESS) ||
896                                   (old && old->flags & TCQ_F_INGRESS),
897         };
898
899         qdisc_offload_graft_helper(dev, NULL, new, old,
900                                    TC_SETUP_ROOT_QDISC, &graft_offload, extack);
901 }
902
903 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
904                          u32 portid, u32 seq, u16 flags, int event)
905 {
906         struct gnet_stats_basic_sync __percpu *cpu_bstats = NULL;
907         struct gnet_stats_queue __percpu *cpu_qstats = NULL;
908         struct tcmsg *tcm;
909         struct nlmsghdr  *nlh;
910         unsigned char *b = skb_tail_pointer(skb);
911         struct gnet_dump d;
912         struct qdisc_size_table *stab;
913         u32 block_index;
914         __u32 qlen;
915
916         cond_resched();
917         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
918         if (!nlh)
919                 goto out_nlmsg_trim;
920         tcm = nlmsg_data(nlh);
921         tcm->tcm_family = AF_UNSPEC;
922         tcm->tcm__pad1 = 0;
923         tcm->tcm__pad2 = 0;
924         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
925         tcm->tcm_parent = clid;
926         tcm->tcm_handle = q->handle;
927         tcm->tcm_info = refcount_read(&q->refcnt);
928         if (nla_put_string(skb, TCA_KIND, q->ops->id))
929                 goto nla_put_failure;
930         if (q->ops->ingress_block_get) {
931                 block_index = q->ops->ingress_block_get(q);
932                 if (block_index &&
933                     nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
934                         goto nla_put_failure;
935         }
936         if (q->ops->egress_block_get) {
937                 block_index = q->ops->egress_block_get(q);
938                 if (block_index &&
939                     nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
940                         goto nla_put_failure;
941         }
942         if (q->ops->dump && q->ops->dump(q, skb) < 0)
943                 goto nla_put_failure;
944         if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
945                 goto nla_put_failure;
946         qlen = qdisc_qlen_sum(q);
947
948         stab = rtnl_dereference(q->stab);
949         if (stab && qdisc_dump_stab(skb, stab) < 0)
950                 goto nla_put_failure;
951
952         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
953                                          NULL, &d, TCA_PAD) < 0)
954                 goto nla_put_failure;
955
956         if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
957                 goto nla_put_failure;
958
959         if (qdisc_is_percpu_stats(q)) {
960                 cpu_bstats = q->cpu_bstats;
961                 cpu_qstats = q->cpu_qstats;
962         }
963
964         if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats, true) < 0 ||
965             gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
966             gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
967                 goto nla_put_failure;
968
969         if (gnet_stats_finish_copy(&d) < 0)
970                 goto nla_put_failure;
971
972         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
973         return skb->len;
974
975 out_nlmsg_trim:
976 nla_put_failure:
977         nlmsg_trim(skb, b);
978         return -1;
979 }
980
981 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
982 {
983         if (q->flags & TCQ_F_BUILTIN)
984                 return true;
985         if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
986                 return true;
987
988         return false;
989 }
990
991 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
992                         struct nlmsghdr *n, u32 clid,
993                         struct Qdisc *old, struct Qdisc *new)
994 {
995         struct sk_buff *skb;
996         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
997
998         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
999         if (!skb)
1000                 return -ENOBUFS;
1001
1002         if (old && !tc_qdisc_dump_ignore(old, false)) {
1003                 if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
1004                                   0, RTM_DELQDISC) < 0)
1005                         goto err_out;
1006         }
1007         if (new && !tc_qdisc_dump_ignore(new, false)) {
1008                 if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
1009                                   old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1010                         goto err_out;
1011         }
1012
1013         if (skb->len)
1014                 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1015                                       n->nlmsg_flags & NLM_F_ECHO);
1016
1017 err_out:
1018         kfree_skb(skb);
1019         return -EINVAL;
1020 }
1021
1022 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
1023                                struct nlmsghdr *n, u32 clid,
1024                                struct Qdisc *old, struct Qdisc *new)
1025 {
1026         if (new || old)
1027                 qdisc_notify(net, skb, n, clid, old, new);
1028
1029         if (old)
1030                 qdisc_put(old);
1031 }
1032
1033 static void qdisc_clear_nolock(struct Qdisc *sch)
1034 {
1035         sch->flags &= ~TCQ_F_NOLOCK;
1036         if (!(sch->flags & TCQ_F_CPUSTATS))
1037                 return;
1038
1039         free_percpu(sch->cpu_bstats);
1040         free_percpu(sch->cpu_qstats);
1041         sch->cpu_bstats = NULL;
1042         sch->cpu_qstats = NULL;
1043         sch->flags &= ~TCQ_F_CPUSTATS;
1044 }
1045
1046 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
1047  * to device "dev".
1048  *
1049  * When appropriate send a netlink notification using 'skb'
1050  * and "n".
1051  *
1052  * On success, destroy old qdisc.
1053  */
1054
1055 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
1056                        struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
1057                        struct Qdisc *new, struct Qdisc *old,
1058                        struct netlink_ext_ack *extack)
1059 {
1060         struct Qdisc *q = old;
1061         struct net *net = dev_net(dev);
1062
1063         if (parent == NULL) {
1064                 unsigned int i, num_q, ingress;
1065
1066                 ingress = 0;
1067                 num_q = dev->num_tx_queues;
1068                 if ((q && q->flags & TCQ_F_INGRESS) ||
1069                     (new && new->flags & TCQ_F_INGRESS)) {
1070                         num_q = 1;
1071                         ingress = 1;
1072                         if (!dev_ingress_queue(dev)) {
1073                                 NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
1074                                 return -ENOENT;
1075                         }
1076                 }
1077
1078                 if (dev->flags & IFF_UP)
1079                         dev_deactivate(dev);
1080
1081                 qdisc_offload_graft_root(dev, new, old, extack);
1082
1083                 if (new && new->ops->attach && !ingress)
1084                         goto skip;
1085
1086                 for (i = 0; i < num_q; i++) {
1087                         struct netdev_queue *dev_queue = dev_ingress_queue(dev);
1088
1089                         if (!ingress)
1090                                 dev_queue = netdev_get_tx_queue(dev, i);
1091
1092                         old = dev_graft_qdisc(dev_queue, new);
1093                         if (new && i > 0)
1094                                 qdisc_refcount_inc(new);
1095
1096                         if (!ingress)
1097                                 qdisc_put(old);
1098                 }
1099
1100 skip:
1101                 if (!ingress) {
1102                         old = rtnl_dereference(dev->qdisc);
1103                         if (new && !new->ops->attach)
1104                                 qdisc_refcount_inc(new);
1105                         rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc);
1106
1107                         notify_and_destroy(net, skb, n, classid, old, new);
1108
1109                         if (new && new->ops->attach)
1110                                 new->ops->attach(new);
1111                 } else {
1112                         notify_and_destroy(net, skb, n, classid, old, new);
1113                 }
1114
1115                 if (dev->flags & IFF_UP)
1116                         dev_activate(dev);
1117         } else {
1118                 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
1119                 unsigned long cl;
1120                 int err;
1121
1122                 /* Only support running class lockless if parent is lockless */
1123                 if (new && (new->flags & TCQ_F_NOLOCK) && !(parent->flags & TCQ_F_NOLOCK))
1124                         qdisc_clear_nolock(new);
1125
1126                 if (!cops || !cops->graft)
1127                         return -EOPNOTSUPP;
1128
1129                 cl = cops->find(parent, classid);
1130                 if (!cl) {
1131                         NL_SET_ERR_MSG(extack, "Specified class not found");
1132                         return -ENOENT;
1133                 }
1134
1135                 err = cops->graft(parent, cl, new, &old, extack);
1136                 if (err)
1137                         return err;
1138                 notify_and_destroy(net, skb, n, classid, old, new);
1139         }
1140         return 0;
1141 }
1142
1143 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1144                                    struct netlink_ext_ack *extack)
1145 {
1146         u32 block_index;
1147
1148         if (tca[TCA_INGRESS_BLOCK]) {
1149                 block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1150
1151                 if (!block_index) {
1152                         NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1153                         return -EINVAL;
1154                 }
1155                 if (!sch->ops->ingress_block_set) {
1156                         NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1157                         return -EOPNOTSUPP;
1158                 }
1159                 sch->ops->ingress_block_set(sch, block_index);
1160         }
1161         if (tca[TCA_EGRESS_BLOCK]) {
1162                 block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1163
1164                 if (!block_index) {
1165                         NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1166                         return -EINVAL;
1167                 }
1168                 if (!sch->ops->egress_block_set) {
1169                         NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1170                         return -EOPNOTSUPP;
1171                 }
1172                 sch->ops->egress_block_set(sch, block_index);
1173         }
1174         return 0;
1175 }
1176
1177 /*
1178    Allocate and initialize new qdisc.
1179
1180    Parameters are passed via opt.
1181  */
1182
1183 static struct Qdisc *qdisc_create(struct net_device *dev,
1184                                   struct netdev_queue *dev_queue,
1185                                   u32 parent, u32 handle,
1186                                   struct nlattr **tca, int *errp,
1187                                   struct netlink_ext_ack *extack)
1188 {
1189         int err;
1190         struct nlattr *kind = tca[TCA_KIND];
1191         struct Qdisc *sch;
1192         struct Qdisc_ops *ops;
1193         struct qdisc_size_table *stab;
1194
1195         ops = qdisc_lookup_ops(kind);
1196 #ifdef CONFIG_MODULES
1197         if (ops == NULL && kind != NULL) {
1198                 char name[IFNAMSIZ];
1199                 if (nla_strscpy(name, kind, IFNAMSIZ) >= 0) {
1200                         /* We dropped the RTNL semaphore in order to
1201                          * perform the module load.  So, even if we
1202                          * succeeded in loading the module we have to
1203                          * tell the caller to replay the request.  We
1204                          * indicate this using -EAGAIN.
1205                          * We replay the request because the device may
1206                          * go away in the mean time.
1207                          */
1208                         rtnl_unlock();
1209                         request_module("sch_%s", name);
1210                         rtnl_lock();
1211                         ops = qdisc_lookup_ops(kind);
1212                         if (ops != NULL) {
1213                                 /* We will try again qdisc_lookup_ops,
1214                                  * so don't keep a reference.
1215                                  */
1216                                 module_put(ops->owner);
1217                                 err = -EAGAIN;
1218                                 goto err_out;
1219                         }
1220                 }
1221         }
1222 #endif
1223
1224         err = -ENOENT;
1225         if (!ops) {
1226                 NL_SET_ERR_MSG(extack, "Specified qdisc kind is unknown");
1227                 goto err_out;
1228         }
1229
1230         sch = qdisc_alloc(dev_queue, ops, extack);
1231         if (IS_ERR(sch)) {
1232                 err = PTR_ERR(sch);
1233                 goto err_out2;
1234         }
1235
1236         sch->parent = parent;
1237
1238         if (handle == TC_H_INGRESS) {
1239                 sch->flags |= TCQ_F_INGRESS;
1240                 handle = TC_H_MAKE(TC_H_INGRESS, 0);
1241         } else {
1242                 if (handle == 0) {
1243                         handle = qdisc_alloc_handle(dev);
1244                         if (handle == 0) {
1245                                 NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
1246                                 err = -ENOSPC;
1247                                 goto err_out3;
1248                         }
1249                 }
1250                 if (!netif_is_multiqueue(dev))
1251                         sch->flags |= TCQ_F_ONETXQUEUE;
1252         }
1253
1254         sch->handle = handle;
1255
1256         /* This exist to keep backward compatible with a userspace
1257          * loophole, what allowed userspace to get IFF_NO_QUEUE
1258          * facility on older kernels by setting tx_queue_len=0 (prior
1259          * to qdisc init), and then forgot to reinit tx_queue_len
1260          * before again attaching a qdisc.
1261          */
1262         if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
1263                 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
1264                 netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
1265         }
1266
1267         err = qdisc_block_indexes_set(sch, tca, extack);
1268         if (err)
1269                 goto err_out3;
1270
1271         if (ops->init) {
1272                 err = ops->init(sch, tca[TCA_OPTIONS], extack);
1273                 if (err != 0)
1274                         goto err_out5;
1275         }
1276
1277         if (tca[TCA_STAB]) {
1278                 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1279                 if (IS_ERR(stab)) {
1280                         err = PTR_ERR(stab);
1281                         goto err_out4;
1282                 }
1283                 rcu_assign_pointer(sch->stab, stab);
1284         }
1285         if (tca[TCA_RATE]) {
1286                 err = -EOPNOTSUPP;
1287                 if (sch->flags & TCQ_F_MQROOT) {
1288                         NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
1289                         goto err_out4;
1290                 }
1291
1292                 err = gen_new_estimator(&sch->bstats,
1293                                         sch->cpu_bstats,
1294                                         &sch->rate_est,
1295                                         NULL,
1296                                         true,
1297                                         tca[TCA_RATE]);
1298                 if (err) {
1299                         NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
1300                         goto err_out4;
1301                 }
1302         }
1303
1304         qdisc_hash_add(sch, false);
1305         trace_qdisc_create(ops, dev, parent);
1306
1307         return sch;
1308
1309 err_out5:
1310         /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
1311         if (ops->destroy)
1312                 ops->destroy(sch);
1313 err_out3:
1314         netdev_put(dev, &sch->dev_tracker);
1315         qdisc_free(sch);
1316 err_out2:
1317         module_put(ops->owner);
1318 err_out:
1319         *errp = err;
1320         return NULL;
1321
1322 err_out4:
1323         /*
1324          * Any broken qdiscs that would require a ops->reset() here?
1325          * The qdisc was never in action so it shouldn't be necessary.
1326          */
1327         qdisc_put_stab(rtnl_dereference(sch->stab));
1328         if (ops->destroy)
1329                 ops->destroy(sch);
1330         goto err_out3;
1331 }
1332
1333 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
1334                         struct netlink_ext_ack *extack)
1335 {
1336         struct qdisc_size_table *ostab, *stab = NULL;
1337         int err = 0;
1338
1339         if (tca[TCA_OPTIONS]) {
1340                 if (!sch->ops->change) {
1341                         NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
1342                         return -EINVAL;
1343                 }
1344                 if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1345                         NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
1346                         return -EOPNOTSUPP;
1347                 }
1348                 err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
1349                 if (err)
1350                         return err;
1351         }
1352
1353         if (tca[TCA_STAB]) {
1354                 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1355                 if (IS_ERR(stab))
1356                         return PTR_ERR(stab);
1357         }
1358
1359         ostab = rtnl_dereference(sch->stab);
1360         rcu_assign_pointer(sch->stab, stab);
1361         qdisc_put_stab(ostab);
1362
1363         if (tca[TCA_RATE]) {
1364                 /* NB: ignores errors from replace_estimator
1365                    because change can't be undone. */
1366                 if (sch->flags & TCQ_F_MQROOT)
1367                         goto out;
1368                 gen_replace_estimator(&sch->bstats,
1369                                       sch->cpu_bstats,
1370                                       &sch->rate_est,
1371                                       NULL,
1372                                       true,
1373                                       tca[TCA_RATE]);
1374         }
1375 out:
1376         return 0;
1377 }
1378
1379 struct check_loop_arg {
1380         struct qdisc_walker     w;
1381         struct Qdisc            *p;
1382         int                     depth;
1383 };
1384
1385 static int check_loop_fn(struct Qdisc *q, unsigned long cl,
1386                          struct qdisc_walker *w);
1387
1388 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1389 {
1390         struct check_loop_arg   arg;
1391
1392         if (q->ops->cl_ops == NULL)
1393                 return 0;
1394
1395         arg.w.stop = arg.w.skip = arg.w.count = 0;
1396         arg.w.fn = check_loop_fn;
1397         arg.depth = depth;
1398         arg.p = p;
1399         q->ops->cl_ops->walk(q, &arg.w);
1400         return arg.w.stop ? -ELOOP : 0;
1401 }
1402
1403 static int
1404 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1405 {
1406         struct Qdisc *leaf;
1407         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1408         struct check_loop_arg *arg = (struct check_loop_arg *)w;
1409
1410         leaf = cops->leaf(q, cl);
1411         if (leaf) {
1412                 if (leaf == arg->p || arg->depth > 7)
1413                         return -ELOOP;
1414                 return check_loop(leaf, arg->p, arg->depth + 1);
1415         }
1416         return 0;
1417 }
1418
1419 const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
1420         [TCA_KIND]              = { .type = NLA_STRING },
1421         [TCA_RATE]              = { .type = NLA_BINARY,
1422                                     .len = sizeof(struct tc_estimator) },
1423         [TCA_STAB]              = { .type = NLA_NESTED },
1424         [TCA_DUMP_INVISIBLE]    = { .type = NLA_FLAG },
1425         [TCA_CHAIN]             = { .type = NLA_U32 },
1426         [TCA_INGRESS_BLOCK]     = { .type = NLA_U32 },
1427         [TCA_EGRESS_BLOCK]      = { .type = NLA_U32 },
1428 };
1429
1430 /*
1431  * Delete/get qdisc.
1432  */
1433
1434 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1435                         struct netlink_ext_ack *extack)
1436 {
1437         struct net *net = sock_net(skb->sk);
1438         struct tcmsg *tcm = nlmsg_data(n);
1439         struct nlattr *tca[TCA_MAX + 1];
1440         struct net_device *dev;
1441         u32 clid;
1442         struct Qdisc *q = NULL;
1443         struct Qdisc *p = NULL;
1444         int err;
1445
1446         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1447                                      rtm_tca_policy, extack);
1448         if (err < 0)
1449                 return err;
1450
1451         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1452         if (!dev)
1453                 return -ENODEV;
1454
1455         clid = tcm->tcm_parent;
1456         if (clid) {
1457                 if (clid != TC_H_ROOT) {
1458                         if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1459                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1460                                 if (!p) {
1461                                         NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
1462                                         return -ENOENT;
1463                                 }
1464                                 q = qdisc_leaf(p, clid);
1465                         } else if (dev_ingress_queue(dev)) {
1466                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1467                         }
1468                 } else {
1469                         q = rtnl_dereference(dev->qdisc);
1470                 }
1471                 if (!q) {
1472                         NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
1473                         return -ENOENT;
1474                 }
1475
1476                 if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
1477                         NL_SET_ERR_MSG(extack, "Invalid handle");
1478                         return -EINVAL;
1479                 }
1480         } else {
1481                 q = qdisc_lookup(dev, tcm->tcm_handle);
1482                 if (!q) {
1483                         NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
1484                         return -ENOENT;
1485                 }
1486         }
1487
1488         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1489                 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1490                 return -EINVAL;
1491         }
1492
1493         if (n->nlmsg_type == RTM_DELQDISC) {
1494                 if (!clid) {
1495                         NL_SET_ERR_MSG(extack, "Classid cannot be zero");
1496                         return -EINVAL;
1497                 }
1498                 if (q->handle == 0) {
1499                         NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
1500                         return -ENOENT;
1501                 }
1502                 err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
1503                 if (err != 0)
1504                         return err;
1505         } else {
1506                 qdisc_notify(net, skb, n, clid, NULL, q);
1507         }
1508         return 0;
1509 }
1510
1511 /*
1512  * Create/change qdisc.
1513  */
1514
1515 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1516                            struct netlink_ext_ack *extack)
1517 {
1518         struct net *net = sock_net(skb->sk);
1519         struct tcmsg *tcm;
1520         struct nlattr *tca[TCA_MAX + 1];
1521         struct net_device *dev;
1522         u32 clid;
1523         struct Qdisc *q, *p;
1524         int err;
1525
1526 replay:
1527         /* Reinit, just in case something touches this. */
1528         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1529                                      rtm_tca_policy, extack);
1530         if (err < 0)
1531                 return err;
1532
1533         tcm = nlmsg_data(n);
1534         clid = tcm->tcm_parent;
1535         q = p = NULL;
1536
1537         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1538         if (!dev)
1539                 return -ENODEV;
1540
1541
1542         if (clid) {
1543                 if (clid != TC_H_ROOT) {
1544                         if (clid != TC_H_INGRESS) {
1545                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1546                                 if (!p) {
1547                                         NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
1548                                         return -ENOENT;
1549                                 }
1550                                 q = qdisc_leaf(p, clid);
1551                         } else if (dev_ingress_queue_create(dev)) {
1552                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1553                         }
1554                 } else {
1555                         q = rtnl_dereference(dev->qdisc);
1556                 }
1557
1558                 /* It may be default qdisc, ignore it */
1559                 if (q && q->handle == 0)
1560                         q = NULL;
1561
1562                 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1563                         if (tcm->tcm_handle) {
1564                                 if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
1565                                         NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
1566                                         return -EEXIST;
1567                                 }
1568                                 if (TC_H_MIN(tcm->tcm_handle)) {
1569                                         NL_SET_ERR_MSG(extack, "Invalid minor handle");
1570                                         return -EINVAL;
1571                                 }
1572                                 q = qdisc_lookup(dev, tcm->tcm_handle);
1573                                 if (!q)
1574                                         goto create_n_graft;
1575                                 if (n->nlmsg_flags & NLM_F_EXCL) {
1576                                         NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
1577                                         return -EEXIST;
1578                                 }
1579                                 if (tca[TCA_KIND] &&
1580                                     nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1581                                         NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1582                                         return -EINVAL;
1583                                 }
1584                                 if (q == p ||
1585                                     (p && check_loop(q, p, 0))) {
1586                                         NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
1587                                         return -ELOOP;
1588                                 }
1589                                 qdisc_refcount_inc(q);
1590                                 goto graft;
1591                         } else {
1592                                 if (!q)
1593                                         goto create_n_graft;
1594
1595                                 /* This magic test requires explanation.
1596                                  *
1597                                  *   We know, that some child q is already
1598                                  *   attached to this parent and have choice:
1599                                  *   either to change it or to create/graft new one.
1600                                  *
1601                                  *   1. We are allowed to create/graft only
1602                                  *   if CREATE and REPLACE flags are set.
1603                                  *
1604                                  *   2. If EXCL is set, requestor wanted to say,
1605                                  *   that qdisc tcm_handle is not expected
1606                                  *   to exist, so that we choose create/graft too.
1607                                  *
1608                                  *   3. The last case is when no flags are set.
1609                                  *   Alas, it is sort of hole in API, we
1610                                  *   cannot decide what to do unambiguously.
1611                                  *   For now we select create/graft, if
1612                                  *   user gave KIND, which does not match existing.
1613                                  */
1614                                 if ((n->nlmsg_flags & NLM_F_CREATE) &&
1615                                     (n->nlmsg_flags & NLM_F_REPLACE) &&
1616                                     ((n->nlmsg_flags & NLM_F_EXCL) ||
1617                                      (tca[TCA_KIND] &&
1618                                       nla_strcmp(tca[TCA_KIND], q->ops->id))))
1619                                         goto create_n_graft;
1620                         }
1621                 }
1622         } else {
1623                 if (!tcm->tcm_handle) {
1624                         NL_SET_ERR_MSG(extack, "Handle cannot be zero");
1625                         return -EINVAL;
1626                 }
1627                 q = qdisc_lookup(dev, tcm->tcm_handle);
1628         }
1629
1630         /* Change qdisc parameters */
1631         if (!q) {
1632                 NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1633                 return -ENOENT;
1634         }
1635         if (n->nlmsg_flags & NLM_F_EXCL) {
1636                 NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
1637                 return -EEXIST;
1638         }
1639         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1640                 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1641                 return -EINVAL;
1642         }
1643         err = qdisc_change(q, tca, extack);
1644         if (err == 0)
1645                 qdisc_notify(net, skb, n, clid, NULL, q);
1646         return err;
1647
1648 create_n_graft:
1649         if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1650                 NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
1651                 return -ENOENT;
1652         }
1653         if (clid == TC_H_INGRESS) {
1654                 if (dev_ingress_queue(dev)) {
1655                         q = qdisc_create(dev, dev_ingress_queue(dev),
1656                                          tcm->tcm_parent, tcm->tcm_parent,
1657                                          tca, &err, extack);
1658                 } else {
1659                         NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
1660                         err = -ENOENT;
1661                 }
1662         } else {
1663                 struct netdev_queue *dev_queue;
1664
1665                 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1666                         dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1667                 else if (p)
1668                         dev_queue = p->dev_queue;
1669                 else
1670                         dev_queue = netdev_get_tx_queue(dev, 0);
1671
1672                 q = qdisc_create(dev, dev_queue,
1673                                  tcm->tcm_parent, tcm->tcm_handle,
1674                                  tca, &err, extack);
1675         }
1676         if (q == NULL) {
1677                 if (err == -EAGAIN)
1678                         goto replay;
1679                 return err;
1680         }
1681
1682 graft:
1683         err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
1684         if (err) {
1685                 if (q)
1686                         qdisc_put(q);
1687                 return err;
1688         }
1689
1690         return 0;
1691 }
1692
1693 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1694                               struct netlink_callback *cb,
1695                               int *q_idx_p, int s_q_idx, bool recur,
1696                               bool dump_invisible)
1697 {
1698         int ret = 0, q_idx = *q_idx_p;
1699         struct Qdisc *q;
1700         int b;
1701
1702         if (!root)
1703                 return 0;
1704
1705         q = root;
1706         if (q_idx < s_q_idx) {
1707                 q_idx++;
1708         } else {
1709                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1710                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1711                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1712                                   RTM_NEWQDISC) <= 0)
1713                         goto done;
1714                 q_idx++;
1715         }
1716
1717         /* If dumping singletons, there is no qdisc_dev(root) and the singleton
1718          * itself has already been dumped.
1719          *
1720          * If we've already dumped the top-level (ingress) qdisc above and the global
1721          * qdisc hashtable, we don't want to hit it again
1722          */
1723         if (!qdisc_dev(root) || !recur)
1724                 goto out;
1725
1726         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1727                 if (q_idx < s_q_idx) {
1728                         q_idx++;
1729                         continue;
1730                 }
1731                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1732                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1733                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1734                                   RTM_NEWQDISC) <= 0)
1735                         goto done;
1736                 q_idx++;
1737         }
1738
1739 out:
1740         *q_idx_p = q_idx;
1741         return ret;
1742 done:
1743         ret = -1;
1744         goto out;
1745 }
1746
1747 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1748 {
1749         struct net *net = sock_net(skb->sk);
1750         int idx, q_idx;
1751         int s_idx, s_q_idx;
1752         struct net_device *dev;
1753         const struct nlmsghdr *nlh = cb->nlh;
1754         struct nlattr *tca[TCA_MAX + 1];
1755         int err;
1756
1757         s_idx = cb->args[0];
1758         s_q_idx = q_idx = cb->args[1];
1759
1760         idx = 0;
1761         ASSERT_RTNL();
1762
1763         err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
1764                                      rtm_tca_policy, cb->extack);
1765         if (err < 0)
1766                 return err;
1767
1768         for_each_netdev(net, dev) {
1769                 struct netdev_queue *dev_queue;
1770
1771                 if (idx < s_idx)
1772                         goto cont;
1773                 if (idx > s_idx)
1774                         s_q_idx = 0;
1775                 q_idx = 0;
1776
1777                 if (tc_dump_qdisc_root(rtnl_dereference(dev->qdisc),
1778                                        skb, cb, &q_idx, s_q_idx,
1779                                        true, tca[TCA_DUMP_INVISIBLE]) < 0)
1780                         goto done;
1781
1782                 dev_queue = dev_ingress_queue(dev);
1783                 if (dev_queue &&
1784                     tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1785                                        &q_idx, s_q_idx, false,
1786                                        tca[TCA_DUMP_INVISIBLE]) < 0)
1787                         goto done;
1788
1789 cont:
1790                 idx++;
1791         }
1792
1793 done:
1794         cb->args[0] = idx;
1795         cb->args[1] = q_idx;
1796
1797         return skb->len;
1798 }
1799
1800
1801
1802 /************************************************
1803  *      Traffic classes manipulation.           *
1804  ************************************************/
1805
1806 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1807                           unsigned long cl,
1808                           u32 portid, u32 seq, u16 flags, int event)
1809 {
1810         struct tcmsg *tcm;
1811         struct nlmsghdr  *nlh;
1812         unsigned char *b = skb_tail_pointer(skb);
1813         struct gnet_dump d;
1814         const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1815
1816         cond_resched();
1817         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1818         if (!nlh)
1819                 goto out_nlmsg_trim;
1820         tcm = nlmsg_data(nlh);
1821         tcm->tcm_family = AF_UNSPEC;
1822         tcm->tcm__pad1 = 0;
1823         tcm->tcm__pad2 = 0;
1824         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1825         tcm->tcm_parent = q->handle;
1826         tcm->tcm_handle = q->handle;
1827         tcm->tcm_info = 0;
1828         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1829                 goto nla_put_failure;
1830         if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1831                 goto nla_put_failure;
1832
1833         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1834                                          NULL, &d, TCA_PAD) < 0)
1835                 goto nla_put_failure;
1836
1837         if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1838                 goto nla_put_failure;
1839
1840         if (gnet_stats_finish_copy(&d) < 0)
1841                 goto nla_put_failure;
1842
1843         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1844         return skb->len;
1845
1846 out_nlmsg_trim:
1847 nla_put_failure:
1848         nlmsg_trim(skb, b);
1849         return -1;
1850 }
1851
1852 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1853                          struct nlmsghdr *n, struct Qdisc *q,
1854                          unsigned long cl, int event)
1855 {
1856         struct sk_buff *skb;
1857         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1858
1859         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1860         if (!skb)
1861                 return -ENOBUFS;
1862
1863         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1864                 kfree_skb(skb);
1865                 return -EINVAL;
1866         }
1867
1868         return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1869                               n->nlmsg_flags & NLM_F_ECHO);
1870 }
1871
1872 static int tclass_del_notify(struct net *net,
1873                              const struct Qdisc_class_ops *cops,
1874                              struct sk_buff *oskb, struct nlmsghdr *n,
1875                              struct Qdisc *q, unsigned long cl,
1876                              struct netlink_ext_ack *extack)
1877 {
1878         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1879         struct sk_buff *skb;
1880         int err = 0;
1881
1882         if (!cops->delete)
1883                 return -EOPNOTSUPP;
1884
1885         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1886         if (!skb)
1887                 return -ENOBUFS;
1888
1889         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
1890                            RTM_DELTCLASS) < 0) {
1891                 kfree_skb(skb);
1892                 return -EINVAL;
1893         }
1894
1895         err = cops->delete(q, cl, extack);
1896         if (err) {
1897                 kfree_skb(skb);
1898                 return err;
1899         }
1900
1901         err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1902                              n->nlmsg_flags & NLM_F_ECHO);
1903         return err;
1904 }
1905
1906 #ifdef CONFIG_NET_CLS
1907
1908 struct tcf_bind_args {
1909         struct tcf_walker w;
1910         unsigned long base;
1911         unsigned long cl;
1912         u32 classid;
1913 };
1914
1915 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
1916 {
1917         struct tcf_bind_args *a = (void *)arg;
1918
1919         if (n && tp->ops->bind_class) {
1920                 struct Qdisc *q = tcf_block_q(tp->chain->block);
1921
1922                 sch_tree_lock(q);
1923                 tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
1924                 sch_tree_unlock(q);
1925         }
1926         return 0;
1927 }
1928
1929 struct tc_bind_class_args {
1930         struct qdisc_walker w;
1931         unsigned long new_cl;
1932         u32 portid;
1933         u32 clid;
1934 };
1935
1936 static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl,
1937                                 struct qdisc_walker *w)
1938 {
1939         struct tc_bind_class_args *a = (struct tc_bind_class_args *)w;
1940         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1941         struct tcf_block *block;
1942         struct tcf_chain *chain;
1943
1944         block = cops->tcf_block(q, cl, NULL);
1945         if (!block)
1946                 return 0;
1947         for (chain = tcf_get_next_chain(block, NULL);
1948              chain;
1949              chain = tcf_get_next_chain(block, chain)) {
1950                 struct tcf_proto *tp;
1951
1952                 for (tp = tcf_get_next_proto(chain, NULL);
1953                      tp; tp = tcf_get_next_proto(chain, tp)) {
1954                         struct tcf_bind_args arg = {};
1955
1956                         arg.w.fn = tcf_node_bind;
1957                         arg.classid = a->clid;
1958                         arg.base = cl;
1959                         arg.cl = a->new_cl;
1960                         tp->ops->walk(tp, &arg.w, true);
1961                 }
1962         }
1963
1964         return 0;
1965 }
1966
1967 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1968                            unsigned long new_cl)
1969 {
1970         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1971         struct tc_bind_class_args args = {};
1972
1973         if (!cops->tcf_block)
1974                 return;
1975         args.portid = portid;
1976         args.clid = clid;
1977         args.new_cl = new_cl;
1978         args.w.fn = tc_bind_class_walker;
1979         q->ops->cl_ops->walk(q, &args.w);
1980 }
1981
1982 #else
1983
1984 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1985                            unsigned long new_cl)
1986 {
1987 }
1988
1989 #endif
1990
1991 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
1992                          struct netlink_ext_ack *extack)
1993 {
1994         struct net *net = sock_net(skb->sk);
1995         struct tcmsg *tcm = nlmsg_data(n);
1996         struct nlattr *tca[TCA_MAX + 1];
1997         struct net_device *dev;
1998         struct Qdisc *q = NULL;
1999         const struct Qdisc_class_ops *cops;
2000         unsigned long cl = 0;
2001         unsigned long new_cl;
2002         u32 portid;
2003         u32 clid;
2004         u32 qid;
2005         int err;
2006
2007         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
2008                                      rtm_tca_policy, extack);
2009         if (err < 0)
2010                 return err;
2011
2012         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
2013         if (!dev)
2014                 return -ENODEV;
2015
2016         /*
2017            parent == TC_H_UNSPEC - unspecified parent.
2018            parent == TC_H_ROOT   - class is root, which has no parent.
2019            parent == X:0         - parent is root class.
2020            parent == X:Y         - parent is a node in hierarchy.
2021            parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
2022
2023            handle == 0:0         - generate handle from kernel pool.
2024            handle == 0:Y         - class is X:Y, where X:0 is qdisc.
2025            handle == X:Y         - clear.
2026            handle == X:0         - root class.
2027          */
2028
2029         /* Step 1. Determine qdisc handle X:0 */
2030
2031         portid = tcm->tcm_parent;
2032         clid = tcm->tcm_handle;
2033         qid = TC_H_MAJ(clid);
2034
2035         if (portid != TC_H_ROOT) {
2036                 u32 qid1 = TC_H_MAJ(portid);
2037
2038                 if (qid && qid1) {
2039                         /* If both majors are known, they must be identical. */
2040                         if (qid != qid1)
2041                                 return -EINVAL;
2042                 } else if (qid1) {
2043                         qid = qid1;
2044                 } else if (qid == 0)
2045                         qid = rtnl_dereference(dev->qdisc)->handle;
2046
2047                 /* Now qid is genuine qdisc handle consistent
2048                  * both with parent and child.
2049                  *
2050                  * TC_H_MAJ(portid) still may be unspecified, complete it now.
2051                  */
2052                 if (portid)
2053                         portid = TC_H_MAKE(qid, portid);
2054         } else {
2055                 if (qid == 0)
2056                         qid = rtnl_dereference(dev->qdisc)->handle;
2057         }
2058
2059         /* OK. Locate qdisc */
2060         q = qdisc_lookup(dev, qid);
2061         if (!q)
2062                 return -ENOENT;
2063
2064         /* An check that it supports classes */
2065         cops = q->ops->cl_ops;
2066         if (cops == NULL)
2067                 return -EINVAL;
2068
2069         /* Now try to get class */
2070         if (clid == 0) {
2071                 if (portid == TC_H_ROOT)
2072                         clid = qid;
2073         } else
2074                 clid = TC_H_MAKE(qid, clid);
2075
2076         if (clid)
2077                 cl = cops->find(q, clid);
2078
2079         if (cl == 0) {
2080                 err = -ENOENT;
2081                 if (n->nlmsg_type != RTM_NEWTCLASS ||
2082                     !(n->nlmsg_flags & NLM_F_CREATE))
2083                         goto out;
2084         } else {
2085                 switch (n->nlmsg_type) {
2086                 case RTM_NEWTCLASS:
2087                         err = -EEXIST;
2088                         if (n->nlmsg_flags & NLM_F_EXCL)
2089                                 goto out;
2090                         break;
2091                 case RTM_DELTCLASS:
2092                         err = tclass_del_notify(net, cops, skb, n, q, cl, extack);
2093                         /* Unbind the class with flilters with 0 */
2094                         tc_bind_tclass(q, portid, clid, 0);
2095                         goto out;
2096                 case RTM_GETTCLASS:
2097                         err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
2098                         goto out;
2099                 default:
2100                         err = -EINVAL;
2101                         goto out;
2102                 }
2103         }
2104
2105         if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
2106                 NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
2107                 return -EOPNOTSUPP;
2108         }
2109
2110         new_cl = cl;
2111         err = -EOPNOTSUPP;
2112         if (cops->change)
2113                 err = cops->change(q, clid, portid, tca, &new_cl, extack);
2114         if (err == 0) {
2115                 tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
2116                 /* We just create a new class, need to do reverse binding. */
2117                 if (cl != new_cl)
2118                         tc_bind_tclass(q, portid, clid, new_cl);
2119         }
2120 out:
2121         return err;
2122 }
2123
2124 struct qdisc_dump_args {
2125         struct qdisc_walker     w;
2126         struct sk_buff          *skb;
2127         struct netlink_callback *cb;
2128 };
2129
2130 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
2131                             struct qdisc_walker *arg)
2132 {
2133         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
2134
2135         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2136                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
2137                               RTM_NEWTCLASS);
2138 }
2139
2140 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
2141                                 struct tcmsg *tcm, struct netlink_callback *cb,
2142                                 int *t_p, int s_t)
2143 {
2144         struct qdisc_dump_args arg;
2145
2146         if (tc_qdisc_dump_ignore(q, false) ||
2147             *t_p < s_t || !q->ops->cl_ops ||
2148             (tcm->tcm_parent &&
2149              TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
2150                 (*t_p)++;
2151                 return 0;
2152         }
2153         if (*t_p > s_t)
2154                 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
2155         arg.w.fn = qdisc_class_dump;
2156         arg.skb = skb;
2157         arg.cb = cb;
2158         arg.w.stop  = 0;
2159         arg.w.skip = cb->args[1];
2160         arg.w.count = 0;
2161         q->ops->cl_ops->walk(q, &arg.w);
2162         cb->args[1] = arg.w.count;
2163         if (arg.w.stop)
2164                 return -1;
2165         (*t_p)++;
2166         return 0;
2167 }
2168
2169 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
2170                                struct tcmsg *tcm, struct netlink_callback *cb,
2171                                int *t_p, int s_t, bool recur)
2172 {
2173         struct Qdisc *q;
2174         int b;
2175
2176         if (!root)
2177                 return 0;
2178
2179         if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
2180                 return -1;
2181
2182         if (!qdisc_dev(root) || !recur)
2183                 return 0;
2184
2185         if (tcm->tcm_parent) {
2186                 q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
2187                 if (q && q != root &&
2188                     tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2189                         return -1;
2190                 return 0;
2191         }
2192         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
2193                 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2194                         return -1;
2195         }
2196
2197         return 0;
2198 }
2199
2200 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
2201 {
2202         struct tcmsg *tcm = nlmsg_data(cb->nlh);
2203         struct net *net = sock_net(skb->sk);
2204         struct netdev_queue *dev_queue;
2205         struct net_device *dev;
2206         int t, s_t;
2207
2208         if (nlmsg_len(cb->nlh) < sizeof(*tcm))
2209                 return 0;
2210         dev = dev_get_by_index(net, tcm->tcm_ifindex);
2211         if (!dev)
2212                 return 0;
2213
2214         s_t = cb->args[0];
2215         t = 0;
2216
2217         if (tc_dump_tclass_root(rtnl_dereference(dev->qdisc),
2218                                 skb, tcm, cb, &t, s_t, true) < 0)
2219                 goto done;
2220
2221         dev_queue = dev_ingress_queue(dev);
2222         if (dev_queue &&
2223             tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
2224                                 &t, s_t, false) < 0)
2225                 goto done;
2226
2227 done:
2228         cb->args[0] = t;
2229
2230         dev_put(dev);
2231         return skb->len;
2232 }
2233
2234 #ifdef CONFIG_PROC_FS
2235 static int psched_show(struct seq_file *seq, void *v)
2236 {
2237         seq_printf(seq, "%08x %08x %08x %08x\n",
2238                    (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2239                    1000000,
2240                    (u32)NSEC_PER_SEC / hrtimer_resolution);
2241
2242         return 0;
2243 }
2244
2245 static int __net_init psched_net_init(struct net *net)
2246 {
2247         struct proc_dir_entry *e;
2248
2249         e = proc_create_single("psched", 0, net->proc_net, psched_show);
2250         if (e == NULL)
2251                 return -ENOMEM;
2252
2253         return 0;
2254 }
2255
2256 static void __net_exit psched_net_exit(struct net *net)
2257 {
2258         remove_proc_entry("psched", net->proc_net);
2259 }
2260 #else
2261 static int __net_init psched_net_init(struct net *net)
2262 {
2263         return 0;
2264 }
2265
2266 static void __net_exit psched_net_exit(struct net *net)
2267 {
2268 }
2269 #endif
2270
2271 static struct pernet_operations psched_net_ops = {
2272         .init = psched_net_init,
2273         .exit = psched_net_exit,
2274 };
2275
2276 static int __init pktsched_init(void)
2277 {
2278         int err;
2279
2280         err = register_pernet_subsys(&psched_net_ops);
2281         if (err) {
2282                 pr_err("pktsched_init: "
2283                        "cannot initialize per netns operations\n");
2284                 return err;
2285         }
2286
2287         register_qdisc(&pfifo_fast_ops);
2288         register_qdisc(&pfifo_qdisc_ops);
2289         register_qdisc(&bfifo_qdisc_ops);
2290         register_qdisc(&pfifo_head_drop_qdisc_ops);
2291         register_qdisc(&mq_qdisc_ops);
2292         register_qdisc(&noqueue_qdisc_ops);
2293
2294         rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
2295         rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
2296         rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
2297                       0);
2298         rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
2299         rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
2300         rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
2301                       0);
2302
2303         return 0;
2304 }
2305
2306 subsys_initcall(pktsched_init);