5ab20c764aa5b8fbdd41e171a5aef03b673d3938
[platform/kernel/linux-rpi.git] / net / sched / sch_api.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_api.c  Packet scheduler API.
4  *
5  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
6  *
7  * Fixes:
8  *
9  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
10  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
11  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
12  */
13
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/string.h>
18 #include <linux/errno.h>
19 #include <linux/skbuff.h>
20 #include <linux/init.h>
21 #include <linux/proc_fs.h>
22 #include <linux/seq_file.h>
23 #include <linux/kmod.h>
24 #include <linux/list.h>
25 #include <linux/hrtimer.h>
26 #include <linux/slab.h>
27 #include <linux/hashtable.h>
28
29 #include <net/net_namespace.h>
30 #include <net/sock.h>
31 #include <net/netlink.h>
32 #include <net/pkt_sched.h>
33 #include <net/pkt_cls.h>
34
35 #include <trace/events/qdisc.h>
36
37 /*
38
39    Short review.
40    -------------
41
42    This file consists of two interrelated parts:
43
44    1. queueing disciplines manager frontend.
45    2. traffic classes manager frontend.
46
47    Generally, queueing discipline ("qdisc") is a black box,
48    which is able to enqueue packets and to dequeue them (when
49    device is ready to send something) in order and at times
50    determined by algorithm hidden in it.
51
   qdiscs are divided into two categories:
53    - "queues", which have no internal structure visible from outside.
54    - "schedulers", which split all the packets to "traffic classes",
55      using "packet classifiers" (look at cls_api.c)
56
   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them etc. etc. etc.
59
   The goal of the routines in this file is to translate
   information supplied by the user in the form of handles
   into a form more intelligible to the kernel, to perform some
   sanity checks and the part of the work which is common to all
   qdiscs, and to provide rtnetlink notifications.
65
66    All real intelligent work is done inside qdisc modules.
67
68
69
70    Every discipline has two major routines: enqueue and dequeue.
71
72    ---dequeue
73
   dequeue usually returns a skb to send. It is allowed to return NULL,
   but that does not mean that the queue is empty; it just means that
   the discipline does not want to send anything this time.
   The queue is really empty only if q->q.qlen == 0.
   For complicated disciplines with multiple queues q->q is not
   a real packet queue, but q->q.qlen must nevertheless be valid.
80
81    ---enqueue
82
   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code.
86    NET_XMIT_DROP        - this packet dropped
87      Expected action: do not backoff, but wait until queue will clear.
88    NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
89      Expected action: backoff or ignore
90
91    Auxiliary routines:
92
93    ---peek
94
95    like dequeue but without removing a packet from the queue
96
97    ---reset
98
99    returns qdisc to initial state: purge all buffers, clear all
100    timers, counters (except for statistics) etc.
101
102    ---init
103
104    initializes newly created qdisc.
105
106    ---destroy
107
108    destroys resources allocated by init and during lifetime of qdisc.
109
110    ---change
111
112    changes qdisc parameters.
113  */
114
/* Protects the list of registered qdisc ops (qdisc_base) and accesses to
 * default_qdisc_ops made in this file. It is a pure SMP lock.
 */
static DEFINE_RWLOCK(qdisc_mod_lock);
117
118
119 /************************************************
120  *      Queueing disciplines manipulation.      *
121  ************************************************/
122
123
/* Head of the singly linked list (chained through ->next) of all
 * registered queueing discipline ops. New entries are appended at the tail
 * by register_qdisc().
 */

static struct Qdisc_ops *qdisc_base;
127
128 /* Register/unregister queueing discipline */
129
130 int register_qdisc(struct Qdisc_ops *qops)
131 {
132         struct Qdisc_ops *q, **qp;
133         int rc = -EEXIST;
134
135         write_lock(&qdisc_mod_lock);
136         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
137                 if (!strcmp(qops->id, q->id))
138                         goto out;
139
140         if (qops->enqueue == NULL)
141                 qops->enqueue = noop_qdisc_ops.enqueue;
142         if (qops->peek == NULL) {
143                 if (qops->dequeue == NULL)
144                         qops->peek = noop_qdisc_ops.peek;
145                 else
146                         goto out_einval;
147         }
148         if (qops->dequeue == NULL)
149                 qops->dequeue = noop_qdisc_ops.dequeue;
150
151         if (qops->cl_ops) {
152                 const struct Qdisc_class_ops *cops = qops->cl_ops;
153
154                 if (!(cops->find && cops->walk && cops->leaf))
155                         goto out_einval;
156
157                 if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
158                         goto out_einval;
159         }
160
161         qops->next = NULL;
162         *qp = qops;
163         rc = 0;
164 out:
165         write_unlock(&qdisc_mod_lock);
166         return rc;
167
168 out_einval:
169         rc = -EINVAL;
170         goto out;
171 }
172 EXPORT_SYMBOL(register_qdisc);
173
174 int unregister_qdisc(struct Qdisc_ops *qops)
175 {
176         struct Qdisc_ops *q, **qp;
177         int err = -ENOENT;
178
179         write_lock(&qdisc_mod_lock);
180         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
181                 if (q == qops)
182                         break;
183         if (q) {
184                 *qp = q->next;
185                 q->next = NULL;
186                 err = 0;
187         }
188         write_unlock(&qdisc_mod_lock);
189         return err;
190 }
191 EXPORT_SYMBOL(unregister_qdisc);
192
193 /* Get default qdisc if not otherwise specified */
194 void qdisc_get_default(char *name, size_t len)
195 {
196         read_lock(&qdisc_mod_lock);
197         strlcpy(name, default_qdisc_ops->id, len);
198         read_unlock(&qdisc_mod_lock);
199 }
200
201 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
202 {
203         struct Qdisc_ops *q = NULL;
204
205         for (q = qdisc_base; q; q = q->next) {
206                 if (!strcmp(name, q->id)) {
207                         if (!try_module_get(q->owner))
208                                 q = NULL;
209                         break;
210                 }
211         }
212
213         return q;
214 }
215
216 /* Set new default qdisc to use */
217 int qdisc_set_default(const char *name)
218 {
219         const struct Qdisc_ops *ops;
220
221         if (!capable(CAP_NET_ADMIN))
222                 return -EPERM;
223
224         write_lock(&qdisc_mod_lock);
225         ops = qdisc_lookup_default(name);
226         if (!ops) {
227                 /* Not found, drop lock and try to load module */
228                 write_unlock(&qdisc_mod_lock);
229                 request_module("sch_%s", name);
230                 write_lock(&qdisc_mod_lock);
231
232                 ops = qdisc_lookup_default(name);
233         }
234
235         if (ops) {
236                 /* Set new default */
237                 module_put(default_qdisc_ops->owner);
238                 default_qdisc_ops = ops;
239         }
240         write_unlock(&qdisc_mod_lock);
241
242         return ops ? 0 : -ENOENT;
243 }
244
#ifdef CONFIG_NET_SCH_DEFAULT
/* Set default value from kernel config.
 *
 * Registered as a late initcall; passes CONFIG_DEFAULT_NET_SCH to
 * qdisc_set_default() and returns its result.
 */
static int __init sch_default_qdisc(void)
{
        return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
}
late_initcall(sch_default_qdisc);
#endif
253
254 /* We know handle. Find qdisc among all qdisc's attached to device
255  * (root qdisc, all its children, children of children etc.)
256  * Note: caller either uses rtnl or rcu_read_lock()
257  */
258
259 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
260 {
261         struct Qdisc *q;
262
263         if (!qdisc_dev(root))
264                 return (root->handle == handle ? root : NULL);
265
266         if (!(root->flags & TCQ_F_BUILTIN) &&
267             root->handle == handle)
268                 return root;
269
270         hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle,
271                                    lockdep_rtnl_is_held()) {
272                 if (q->handle == handle)
273                         return q;
274         }
275         return NULL;
276 }
277
278 void qdisc_hash_add(struct Qdisc *q, bool invisible)
279 {
280         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
281                 ASSERT_RTNL();
282                 hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
283                 if (invisible)
284                         q->flags |= TCQ_F_INVISIBLE;
285         }
286 }
287 EXPORT_SYMBOL(qdisc_hash_add);
288
289 void qdisc_hash_del(struct Qdisc *q)
290 {
291         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
292                 ASSERT_RTNL();
293                 hash_del_rcu(&q->hash);
294         }
295 }
296 EXPORT_SYMBOL(qdisc_hash_del);
297
298 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
299 {
300         struct Qdisc *q;
301
302         if (!handle)
303                 return NULL;
304         q = qdisc_match_from_root(rtnl_dereference(dev->qdisc), handle);
305         if (q)
306                 goto out;
307
308         if (dev_ingress_queue(dev))
309                 q = qdisc_match_from_root(
310                         dev_ingress_queue(dev)->qdisc_sleeping,
311                         handle);
312 out:
313         return q;
314 }
315
316 struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
317 {
318         struct netdev_queue *nq;
319         struct Qdisc *q;
320
321         if (!handle)
322                 return NULL;
323         q = qdisc_match_from_root(rcu_dereference(dev->qdisc), handle);
324         if (q)
325                 goto out;
326
327         nq = dev_ingress_queue_rcu(dev);
328         if (nq)
329                 q = qdisc_match_from_root(nq->qdisc_sleeping, handle);
330 out:
331         return q;
332 }
333
334 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
335 {
336         unsigned long cl;
337         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
338
339         if (cops == NULL)
340                 return NULL;
341         cl = cops->find(p, classid);
342
343         if (cl == 0)
344                 return NULL;
345         return cops->leaf(p, cl);
346 }
347
348 /* Find queueing discipline by name */
349
350 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
351 {
352         struct Qdisc_ops *q = NULL;
353
354         if (kind) {
355                 read_lock(&qdisc_mod_lock);
356                 for (q = qdisc_base; q; q = q->next) {
357                         if (nla_strcmp(kind, q->id) == 0) {
358                                 if (!try_module_get(q->owner))
359                                         q = NULL;
360                                 break;
361                         }
362                 }
363                 read_unlock(&qdisc_mod_lock);
364         }
365         return q;
366 }
367
368 /* The linklayer setting were not transferred from iproute2, in older
369  * versions, and the rate tables lookup systems have been dropped in
370  * the kernel. To keep backward compatible with older iproute2 tc
371  * utils, we detect the linklayer setting by detecting if the rate
372  * table were modified.
373  *
374  * For linklayer ATM table entries, the rate table will be aligned to
375  * 48 bytes, thus some table entries will contain the same value.  The
376  * mpu (min packet unit) is also encoded into the old rate table, thus
377  * starting from the mpu, we find low and high table entries for
378  * mapping this cell.  If these entries contain the same value, when
379  * the rate tables have been modified for linklayer ATM.
380  *
381  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
382  * and then roundup to the next cell, calc the table entry one below,
383  * and compare.
384  */
385 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
386 {
387         int low       = roundup(r->mpu, 48);
388         int high      = roundup(low+1, 48);
389         int cell_low  = low >> r->cell_log;
390         int cell_high = (high >> r->cell_log) - 1;
391
392         /* rtab is too inaccurate at rates > 100Mbit/s */
393         if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
394                 pr_debug("TC linklayer: Giving up ATM detection\n");
395                 return TC_LINKLAYER_ETHERNET;
396         }
397
398         if ((cell_high > cell_low) && (cell_high < 256)
399             && (rtab[cell_low] == rtab[cell_high])) {
400                 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
401                          cell_low, cell_high, rtab[cell_high]);
402                 return TC_LINKLAYER_ATM;
403         }
404         return TC_LINKLAYER_ETHERNET;
405 }
406
407 static struct qdisc_rate_table *qdisc_rtab_list;
408
409 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
410                                         struct nlattr *tab,
411                                         struct netlink_ext_ack *extack)
412 {
413         struct qdisc_rate_table *rtab;
414
415         if (tab == NULL || r->rate == 0 ||
416             r->cell_log == 0 || r->cell_log >= 32 ||
417             nla_len(tab) != TC_RTAB_SIZE) {
418                 NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
419                 return NULL;
420         }
421
422         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
423                 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
424                     !memcmp(&rtab->data, nla_data(tab), 1024)) {
425                         rtab->refcnt++;
426                         return rtab;
427                 }
428         }
429
430         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
431         if (rtab) {
432                 rtab->rate = *r;
433                 rtab->refcnt = 1;
434                 memcpy(rtab->data, nla_data(tab), 1024);
435                 if (r->linklayer == TC_LINKLAYER_UNAWARE)
436                         r->linklayer = __detect_linklayer(r, rtab->data);
437                 rtab->next = qdisc_rtab_list;
438                 qdisc_rtab_list = rtab;
439         } else {
440                 NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
441         }
442         return rtab;
443 }
444 EXPORT_SYMBOL(qdisc_get_rtab);
445
446 void qdisc_put_rtab(struct qdisc_rate_table *tab)
447 {
448         struct qdisc_rate_table *rtab, **rtabp;
449
450         if (!tab || --tab->refcnt)
451                 return;
452
453         for (rtabp = &qdisc_rtab_list;
454              (rtab = *rtabp) != NULL;
455              rtabp = &rtab->next) {
456                 if (rtab == tab) {
457                         *rtabp = rtab->next;
458                         kfree(rtab);
459                         return;
460                 }
461         }
462 }
463 EXPORT_SYMBOL(qdisc_put_rtab);
464
/* All size tables currently in use; entries are shared through refcnt. */
static LIST_HEAD(qdisc_stab_list);

/* Netlink policy for the attributes nested inside a size table option:
 * TCA_STAB_BASE carries a struct tc_sizespec, TCA_STAB_DATA the raw table.
 */
static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
        [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
        [TCA_STAB_DATA] = { .type = NLA_BINARY },
};
471
472 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
473                                                struct netlink_ext_ack *extack)
474 {
475         struct nlattr *tb[TCA_STAB_MAX + 1];
476         struct qdisc_size_table *stab;
477         struct tc_sizespec *s;
478         unsigned int tsize = 0;
479         u16 *tab = NULL;
480         int err;
481
482         err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
483                                           extack);
484         if (err < 0)
485                 return ERR_PTR(err);
486         if (!tb[TCA_STAB_BASE]) {
487                 NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
488                 return ERR_PTR(-EINVAL);
489         }
490
491         s = nla_data(tb[TCA_STAB_BASE]);
492
493         if (s->tsize > 0) {
494                 if (!tb[TCA_STAB_DATA]) {
495                         NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
496                         return ERR_PTR(-EINVAL);
497                 }
498                 tab = nla_data(tb[TCA_STAB_DATA]);
499                 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
500         }
501
502         if (tsize != s->tsize || (!tab && tsize > 0)) {
503                 NL_SET_ERR_MSG(extack, "Invalid size of size table");
504                 return ERR_PTR(-EINVAL);
505         }
506
507         list_for_each_entry(stab, &qdisc_stab_list, list) {
508                 if (memcmp(&stab->szopts, s, sizeof(*s)))
509                         continue;
510                 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
511                         continue;
512                 stab->refcnt++;
513                 return stab;
514         }
515
516         if (s->size_log > STAB_SIZE_LOG_MAX ||
517             s->cell_log > STAB_SIZE_LOG_MAX) {
518                 NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table");
519                 return ERR_PTR(-EINVAL);
520         }
521
522         stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
523         if (!stab)
524                 return ERR_PTR(-ENOMEM);
525
526         stab->refcnt = 1;
527         stab->szopts = *s;
528         if (tsize > 0)
529                 memcpy(stab->data, tab, tsize * sizeof(u16));
530
531         list_add_tail(&stab->list, &qdisc_stab_list);
532
533         return stab;
534 }
535
536 void qdisc_put_stab(struct qdisc_size_table *tab)
537 {
538         if (!tab)
539                 return;
540
541         if (--tab->refcnt == 0) {
542                 list_del(&tab->list);
543                 kfree_rcu(tab, rcu);
544         }
545 }
546 EXPORT_SYMBOL(qdisc_put_stab);
547
548 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
549 {
550         struct nlattr *nest;
551
552         nest = nla_nest_start_noflag(skb, TCA_STAB);
553         if (nest == NULL)
554                 goto nla_put_failure;
555         if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
556                 goto nla_put_failure;
557         nla_nest_end(skb, nest);
558
559         return skb->len;
560
561 nla_put_failure:
562         return -1;
563 }
564
565 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
566                                const struct qdisc_size_table *stab)
567 {
568         int pkt_len, slot;
569
570         pkt_len = skb->len + stab->szopts.overhead;
571         if (unlikely(!stab->szopts.tsize))
572                 goto out;
573
574         slot = pkt_len + stab->szopts.cell_align;
575         if (unlikely(slot < 0))
576                 slot = 0;
577
578         slot >>= stab->szopts.cell_log;
579         if (likely(slot < stab->szopts.tsize))
580                 pkt_len = stab->data[slot];
581         else
582                 pkt_len = stab->data[stab->szopts.tsize - 1] *
583                                 (slot / stab->szopts.tsize) +
584                                 stab->data[slot % stab->szopts.tsize];
585
586         pkt_len <<= stab->szopts.size_log;
587 out:
588         if (unlikely(pkt_len < 1))
589                 pkt_len = 1;
590         qdisc_skb_cb(skb)->pkt_len = pkt_len;
591 }
592 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
593
594 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
595 {
596         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
597                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
598                         txt, qdisc->ops->id, qdisc->handle >> 16);
599                 qdisc->flags |= TCQ_F_WARN_NONWC;
600         }
601 }
602 EXPORT_SYMBOL(qdisc_warn_nonwc);
603
604 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
605 {
606         struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
607                                                  timer);
608
609         rcu_read_lock();
610         __netif_schedule(qdisc_root(wd->qdisc));
611         rcu_read_unlock();
612
613         return HRTIMER_NORESTART;
614 }
615
/* Initialise watchdog @wd for @qdisc: set up its hrtimer on @clockid in
 * absolute, pinned mode with qdisc_watchdog() as the callback, and record
 * @qdisc so the callback can find the qdisc root to schedule.
 */
void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
                                 clockid_t clockid)
{
        hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
        wd->timer.function = qdisc_watchdog;
        wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
624
/* Initialise watchdog @wd for @qdisc using CLOCK_MONOTONIC. */
void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
        qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
}
EXPORT_SYMBOL(qdisc_watchdog_init);
630
631 void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires,
632                                       u64 delta_ns)
633 {
634         if (test_bit(__QDISC_STATE_DEACTIVATED,
635                      &qdisc_root_sleeping(wd->qdisc)->state))
636                 return;
637
638         if (hrtimer_is_queued(&wd->timer)) {
639                 /* If timer is already set in [expires, expires + delta_ns],
640                  * do not reprogram it.
641                  */
642                 if (wd->last_expires - expires <= delta_ns)
643                         return;
644         }
645
646         wd->last_expires = expires;
647         hrtimer_start_range_ns(&wd->timer,
648                                ns_to_ktime(expires),
649                                delta_ns,
650                                HRTIMER_MODE_ABS_PINNED);
651 }
652 EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns);
653
/* Cancel the hrtimer of watchdog @wd. */
void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
        hrtimer_cancel(&wd->timer);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
659
660 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
661 {
662         struct hlist_head *h;
663         unsigned int i;
664
665         h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
666
667         if (h != NULL) {
668                 for (i = 0; i < n; i++)
669                         INIT_HLIST_HEAD(&h[i]);
670         }
671         return h;
672 }
673
674 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
675 {
676         struct Qdisc_class_common *cl;
677         struct hlist_node *next;
678         struct hlist_head *nhash, *ohash;
679         unsigned int nsize, nmask, osize;
680         unsigned int i, h;
681
682         /* Rehash when load factor exceeds 0.75 */
683         if (clhash->hashelems * 4 <= clhash->hashsize * 3)
684                 return;
685         nsize = clhash->hashsize * 2;
686         nmask = nsize - 1;
687         nhash = qdisc_class_hash_alloc(nsize);
688         if (nhash == NULL)
689                 return;
690
691         ohash = clhash->hash;
692         osize = clhash->hashsize;
693
694         sch_tree_lock(sch);
695         for (i = 0; i < osize; i++) {
696                 hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
697                         h = qdisc_class_hash(cl->classid, nmask);
698                         hlist_add_head(&cl->hnode, &nhash[h]);
699                 }
700         }
701         clhash->hash     = nhash;
702         clhash->hashsize = nsize;
703         clhash->hashmask = nmask;
704         sch_tree_unlock(sch);
705
706         kvfree(ohash);
707 }
708 EXPORT_SYMBOL(qdisc_class_hash_grow);
709
710 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
711 {
712         unsigned int size = 4;
713
714         clhash->hash = qdisc_class_hash_alloc(size);
715         if (!clhash->hash)
716                 return -ENOMEM;
717         clhash->hashsize  = size;
718         clhash->hashmask  = size - 1;
719         clhash->hashelems = 0;
720         return 0;
721 }
722 EXPORT_SYMBOL(qdisc_class_hash_init);
723
/* Free the bucket array of @clhash. */
void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
        kvfree(clhash->hash);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);
729
730 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
731                              struct Qdisc_class_common *cl)
732 {
733         unsigned int h;
734
735         INIT_HLIST_NODE(&cl->hnode);
736         h = qdisc_class_hash(cl->classid, clhash->hashmask);
737         hlist_add_head(&cl->hnode, &clhash->hash[h]);
738         clhash->hashelems++;
739 }
740 EXPORT_SYMBOL(qdisc_class_hash_insert);
741
/* Unlink class @cl from its hash bucket and decrement the element count of
 * @clhash.
 */
void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
                             struct Qdisc_class_common *cl)
{
        hlist_del(&cl->hnode);
        clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);
749
750 /* Allocate an unique handle from space managed by kernel
751  * Possible range is [8000-FFFF]:0000 (0x8000 values)
752  */
753 static u32 qdisc_alloc_handle(struct net_device *dev)
754 {
755         int i = 0x8000;
756         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
757
758         do {
759                 autohandle += TC_H_MAKE(0x10000U, 0);
760                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
761                         autohandle = TC_H_MAKE(0x80000000U, 0);
762                 if (!qdisc_lookup(dev, autohandle))
763                         return autohandle;
764                 cond_resched();
765         } while (--i > 0);
766
767         return 0;
768 }
769
/* Propagate a queue length change of @n packets and @len bytes from @sch
 * up through its ancestors.
 *
 * For every ancestor found, q.qlen is reduced by @n, qstats.backlog by
 * @len, and max(@n, 0) is added to its drop counter. The walk stops when
 * there is no parent, when the parent is the ingress handle, when the
 * current qdisc has TCQ_F_NOPARENT set, or when the parent cannot be
 * looked up. Nothing is done if both @n and @len are zero.
 */
void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
{
        /* Sampled once from the qdisc the update originates from. */
        bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
        const struct Qdisc_class_ops *cops;
        unsigned long cl;
        u32 parentid;
        bool notify;
        int drops;

        if (n == 0 && len == 0)
                return;
        /* A negative @n is not accounted as drops. */
        drops = max_t(int, n, 0);
        rcu_read_lock();
        while ((parentid = sch->parent)) {
                if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
                        break;

                if (sch->flags & TCQ_F_NOPARENT)
                        break;
                /* Notify parent qdisc only if child qdisc becomes empty.
                 *
                 * If child was empty even before update then backlog
                 * counter is screwed and we skip notification because
                 * parent class is already passive.
                 *
                 * If the original child was offloaded then it is allowed
                 * to be seem as empty, so the parent is notified anyway.
                 */
                notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
                                                       !qdisc_is_offloaded);
                /* TODO: perform the search on a per txq basis */
                sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
                if (sch == NULL) {
                        /* Only a root parent may legitimately be missing. */
                        WARN_ON_ONCE(parentid != TC_H_ROOT);
                        break;
                }
                cops = sch->ops->cl_ops;
                if (notify && cops->qlen_notify) {
                        cl = cops->find(sch, parentid);
                        cops->qlen_notify(sch, cl);
                }
                sch->q.qlen -= n;
                sch->qstats.backlog -= len;
                __qdisc_qstats_drop(sch, drops);
        }
        rcu_read_unlock();
}
EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
818
819 int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
820                               void *type_data)
821 {
822         struct net_device *dev = qdisc_dev(sch);
823         int err;
824
825         sch->flags &= ~TCQ_F_OFFLOADED;
826         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
827                 return 0;
828
829         err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
830         if (err == -EOPNOTSUPP)
831                 return 0;
832
833         if (!err)
834                 sch->flags |= TCQ_F_OFFLOADED;
835
836         return err;
837 }
838 EXPORT_SYMBOL(qdisc_offload_dump_helper);
839
840 void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
841                                 struct Qdisc *new, struct Qdisc *old,
842                                 enum tc_setup_type type, void *type_data,
843                                 struct netlink_ext_ack *extack)
844 {
845         bool any_qdisc_is_offloaded;
846         int err;
847
848         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
849                 return;
850
851         err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
852
853         /* Don't report error if the graft is part of destroy operation. */
854         if (!err || !new || new == &noop_qdisc)
855                 return;
856
857         /* Don't report error if the parent, the old child and the new
858          * one are not offloaded.
859          */
860         any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
861         any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
862         any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;
863
864         if (any_qdisc_is_offloaded)
865                 NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
866 }
867 EXPORT_SYMBOL(qdisc_offload_graft_helper);
868
869 static void qdisc_offload_graft_root(struct net_device *dev,
870                                      struct Qdisc *new, struct Qdisc *old,
871                                      struct netlink_ext_ack *extack)
872 {
873         struct tc_root_qopt_offload graft_offload = {
874                 .command        = TC_ROOT_GRAFT,
875                 .handle         = new ? new->handle : 0,
876                 .ingress        = (new && new->flags & TCQ_F_INGRESS) ||
877                                   (old && old->flags & TCQ_F_INGRESS),
878         };
879
880         qdisc_offload_graft_helper(dev, NULL, new, old,
881                                    TC_SETUP_ROOT_QDISC, &graft_offload, extack);
882 }
883
/* Append one netlink message of type @event describing qdisc @q to @skb.
 *
 * @clid is reported as the parent; @portid, @seq and @flags go into the
 * netlink header. Returns skb->len on success. On any failure the skb is
 * trimmed back to where this message started and -1 is returned.
 */
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
                         u32 portid, u32 seq, u16 flags, int event)
{
        struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
        struct gnet_stats_queue __percpu *cpu_qstats = NULL;
        struct tcmsg *tcm;
        struct nlmsghdr  *nlh;
        /* Start of this message; used to fix up the length and to trim. */
        unsigned char *b = skb_tail_pointer(skb);
        struct gnet_dump d;
        struct qdisc_size_table *stab;
        u32 block_index;
        __u32 qlen;

        cond_resched();
        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
        if (!nlh)
                goto out_nlmsg_trim;
        tcm = nlmsg_data(nlh);
        tcm->tcm_family = AF_UNSPEC;
        tcm->tcm__pad1 = 0;
        tcm->tcm__pad2 = 0;
        tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
        tcm->tcm_parent = clid;
        tcm->tcm_handle = q->handle;
        /* tcm_info carries the qdisc's current reference count. */
        tcm->tcm_info = refcount_read(&q->refcnt);
        if (nla_put_string(skb, TCA_KIND, q->ops->id))
                goto nla_put_failure;
        /* Block indexes are only emitted when non-zero. */
        if (q->ops->ingress_block_get) {
                block_index = q->ops->ingress_block_get(q);
                if (block_index &&
                    nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
                        goto nla_put_failure;
        }
        if (q->ops->egress_block_get) {
                block_index = q->ops->egress_block_get(q);
                if (block_index &&
                    nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
                        goto nla_put_failure;
        }
        /* Qdisc-specific options, if the qdisc provides a dump callback. */
        if (q->ops->dump && q->ops->dump(q, skb) < 0)
                goto nla_put_failure;
        if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
                goto nla_put_failure;
        qlen = qdisc_qlen_sum(q);

        stab = rtnl_dereference(q->stab);
        if (stab && qdisc_dump_stab(skb, stab) < 0)
                goto nla_put_failure;

        if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
                                         NULL, &d, TCA_PAD) < 0)
                goto nla_put_failure;

        if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
                goto nla_put_failure;

        /* Per-CPU counters are passed only for qdiscs that keep them;
         * otherwise the pointers stay NULL.
         */
        if (qdisc_is_percpu_stats(q)) {
                cpu_bstats = q->cpu_bstats;
                cpu_qstats = q->cpu_qstats;
        }

        if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
                                  &d, cpu_bstats, &q->bstats) < 0 ||
            gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
            gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
                goto nla_put_failure;

        if (gnet_stats_finish_copy(&d) < 0)
                goto nla_put_failure;

        /* Final message length: everything appended since b. */
        nlh->nlmsg_len = skb_tail_pointer(skb) - b;
        return skb->len;

out_nlmsg_trim:
nla_put_failure:
        nlmsg_trim(skb, b);
        return -1;
}
962
963 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
964 {
965         if (q->flags & TCQ_F_BUILTIN)
966                 return true;
967         if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
968                 return true;
969
970         return false;
971 }
972
973 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
974                         struct nlmsghdr *n, u32 clid,
975                         struct Qdisc *old, struct Qdisc *new)
976 {
977         struct sk_buff *skb;
978         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
979
980         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
981         if (!skb)
982                 return -ENOBUFS;
983
984         if (old && !tc_qdisc_dump_ignore(old, false)) {
985                 if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
986                                   0, RTM_DELQDISC) < 0)
987                         goto err_out;
988         }
989         if (new && !tc_qdisc_dump_ignore(new, false)) {
990                 if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
991                                   old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
992                         goto err_out;
993         }
994
995         if (skb->len)
996                 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
997                                       n->nlmsg_flags & NLM_F_ECHO);
998
999 err_out:
1000         kfree_skb(skb);
1001         return -EINVAL;
1002 }
1003
1004 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
1005                                struct nlmsghdr *n, u32 clid,
1006                                struct Qdisc *old, struct Qdisc *new)
1007 {
1008         if (new || old)
1009                 qdisc_notify(net, skb, n, clid, old, new);
1010
1011         if (old)
1012                 qdisc_put(old);
1013 }
1014
1015 static void qdisc_clear_nolock(struct Qdisc *sch)
1016 {
1017         sch->flags &= ~TCQ_F_NOLOCK;
1018         if (!(sch->flags & TCQ_F_CPUSTATS))
1019                 return;
1020
1021         free_percpu(sch->cpu_bstats);
1022         free_percpu(sch->cpu_qstats);
1023         sch->cpu_bstats = NULL;
1024         sch->cpu_qstats = NULL;
1025         sch->flags &= ~TCQ_F_CPUSTATS;
1026 }
1027
1028 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
1029  * to device "dev".
1030  *
1031  * When appropriate send a netlink notification using 'skb'
1032  * and "n".
1033  *
1034  * On success, destroy old qdisc.
1035  */
1036
1037 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
1038                        struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
1039                        struct Qdisc *new, struct Qdisc *old,
1040                        struct netlink_ext_ack *extack)
1041 {
1042         struct Qdisc *q = old;
1043         struct net *net = dev_net(dev);
1044
1045         if (parent == NULL) {
1046                 unsigned int i, num_q, ingress;
1047
1048                 ingress = 0;
1049                 num_q = dev->num_tx_queues;
1050                 if ((q && q->flags & TCQ_F_INGRESS) ||
1051                     (new && new->flags & TCQ_F_INGRESS)) {
1052                         num_q = 1;
1053                         ingress = 1;
1054                         if (!dev_ingress_queue(dev)) {
1055                                 NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
1056                                 return -ENOENT;
1057                         }
1058                 }
1059
1060                 if (dev->flags & IFF_UP)
1061                         dev_deactivate(dev);
1062
1063                 qdisc_offload_graft_root(dev, new, old, extack);
1064
1065                 if (new && new->ops->attach && !ingress)
1066                         goto skip;
1067
1068                 for (i = 0; i < num_q; i++) {
1069                         struct netdev_queue *dev_queue = dev_ingress_queue(dev);
1070
1071                         if (!ingress)
1072                                 dev_queue = netdev_get_tx_queue(dev, i);
1073
1074                         old = dev_graft_qdisc(dev_queue, new);
1075                         if (new && i > 0)
1076                                 qdisc_refcount_inc(new);
1077
1078                         if (!ingress)
1079                                 qdisc_put(old);
1080                 }
1081
1082 skip:
1083                 if (!ingress) {
1084                         old = rtnl_dereference(dev->qdisc);
1085                         if (new && !new->ops->attach)
1086                                 qdisc_refcount_inc(new);
1087                         rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc);
1088
1089                         notify_and_destroy(net, skb, n, classid, old, new);
1090
1091                         if (new && new->ops->attach)
1092                                 new->ops->attach(new);
1093                 } else {
1094                         notify_and_destroy(net, skb, n, classid, old, new);
1095                 }
1096
1097                 if (dev->flags & IFF_UP)
1098                         dev_activate(dev);
1099         } else {
1100                 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
1101                 unsigned long cl;
1102                 int err;
1103
1104                 /* Only support running class lockless if parent is lockless */
1105                 if (new && (new->flags & TCQ_F_NOLOCK) && !(parent->flags & TCQ_F_NOLOCK))
1106                         qdisc_clear_nolock(new);
1107
1108                 if (!cops || !cops->graft)
1109                         return -EOPNOTSUPP;
1110
1111                 cl = cops->find(parent, classid);
1112                 if (!cl) {
1113                         NL_SET_ERR_MSG(extack, "Specified class not found");
1114                         return -ENOENT;
1115                 }
1116
1117                 err = cops->graft(parent, cl, new, &old, extack);
1118                 if (err)
1119                         return err;
1120                 notify_and_destroy(net, skb, n, classid, old, new);
1121         }
1122         return 0;
1123 }
1124
1125 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1126                                    struct netlink_ext_ack *extack)
1127 {
1128         u32 block_index;
1129
1130         if (tca[TCA_INGRESS_BLOCK]) {
1131                 block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1132
1133                 if (!block_index) {
1134                         NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1135                         return -EINVAL;
1136                 }
1137                 if (!sch->ops->ingress_block_set) {
1138                         NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1139                         return -EOPNOTSUPP;
1140                 }
1141                 sch->ops->ingress_block_set(sch, block_index);
1142         }
1143         if (tca[TCA_EGRESS_BLOCK]) {
1144                 block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1145
1146                 if (!block_index) {
1147                         NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1148                         return -EINVAL;
1149                 }
1150                 if (!sch->ops->egress_block_set) {
1151                         NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1152                         return -EOPNOTSUPP;
1153                 }
1154                 sch->ops->egress_block_set(sch, block_index);
1155         }
1156         return 0;
1157 }
1158
1159 /*
1160    Allocate and initialize new qdisc.
1161
1162    Parameters are passed via opt.
1163  */
1164
1165 static struct Qdisc *qdisc_create(struct net_device *dev,
1166                                   struct netdev_queue *dev_queue,
1167                                   struct Qdisc *p, u32 parent, u32 handle,
1168                                   struct nlattr **tca, int *errp,
1169                                   struct netlink_ext_ack *extack)
1170 {
1171         int err;
1172         struct nlattr *kind = tca[TCA_KIND];
1173         struct Qdisc *sch;
1174         struct Qdisc_ops *ops;
1175         struct qdisc_size_table *stab;
1176
1177         ops = qdisc_lookup_ops(kind);
1178 #ifdef CONFIG_MODULES
1179         if (ops == NULL && kind != NULL) {
1180                 char name[IFNAMSIZ];
1181                 if (nla_strscpy(name, kind, IFNAMSIZ) >= 0) {
1182                         /* We dropped the RTNL semaphore in order to
1183                          * perform the module load.  So, even if we
1184                          * succeeded in loading the module we have to
1185                          * tell the caller to replay the request.  We
1186                          * indicate this using -EAGAIN.
1187                          * We replay the request because the device may
1188                          * go away in the mean time.
1189                          */
1190                         rtnl_unlock();
1191                         request_module("sch_%s", name);
1192                         rtnl_lock();
1193                         ops = qdisc_lookup_ops(kind);
1194                         if (ops != NULL) {
1195                                 /* We will try again qdisc_lookup_ops,
1196                                  * so don't keep a reference.
1197                                  */
1198                                 module_put(ops->owner);
1199                                 err = -EAGAIN;
1200                                 goto err_out;
1201                         }
1202                 }
1203         }
1204 #endif
1205
1206         err = -ENOENT;
1207         if (!ops) {
1208                 NL_SET_ERR_MSG(extack, "Specified qdisc kind is unknown");
1209                 goto err_out;
1210         }
1211
1212         sch = qdisc_alloc(dev_queue, ops, extack);
1213         if (IS_ERR(sch)) {
1214                 err = PTR_ERR(sch);
1215                 goto err_out2;
1216         }
1217
1218         sch->parent = parent;
1219
1220         if (handle == TC_H_INGRESS) {
1221                 sch->flags |= TCQ_F_INGRESS;
1222                 handle = TC_H_MAKE(TC_H_INGRESS, 0);
1223         } else {
1224                 if (handle == 0) {
1225                         handle = qdisc_alloc_handle(dev);
1226                         if (handle == 0) {
1227                                 NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
1228                                 err = -ENOSPC;
1229                                 goto err_out3;
1230                         }
1231                 }
1232                 if (!netif_is_multiqueue(dev))
1233                         sch->flags |= TCQ_F_ONETXQUEUE;
1234         }
1235
1236         sch->handle = handle;
1237
1238         /* This exist to keep backward compatible with a userspace
1239          * loophole, what allowed userspace to get IFF_NO_QUEUE
1240          * facility on older kernels by setting tx_queue_len=0 (prior
1241          * to qdisc init), and then forgot to reinit tx_queue_len
1242          * before again attaching a qdisc.
1243          */
1244         if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
1245                 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
1246                 netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
1247         }
1248
1249         err = qdisc_block_indexes_set(sch, tca, extack);
1250         if (err)
1251                 goto err_out3;
1252
1253         if (ops->init) {
1254                 err = ops->init(sch, tca[TCA_OPTIONS], extack);
1255                 if (err != 0)
1256                         goto err_out5;
1257         }
1258
1259         if (tca[TCA_STAB]) {
1260                 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1261                 if (IS_ERR(stab)) {
1262                         err = PTR_ERR(stab);
1263                         goto err_out4;
1264                 }
1265                 rcu_assign_pointer(sch->stab, stab);
1266         }
1267         if (tca[TCA_RATE]) {
1268                 seqcount_t *running;
1269
1270                 err = -EOPNOTSUPP;
1271                 if (sch->flags & TCQ_F_MQROOT) {
1272                         NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
1273                         goto err_out4;
1274                 }
1275
1276                 if (sch->parent != TC_H_ROOT &&
1277                     !(sch->flags & TCQ_F_INGRESS) &&
1278                     (!p || !(p->flags & TCQ_F_MQROOT)))
1279                         running = qdisc_root_sleeping_running(sch);
1280                 else
1281                         running = &sch->running;
1282
1283                 err = gen_new_estimator(&sch->bstats,
1284                                         sch->cpu_bstats,
1285                                         &sch->rate_est,
1286                                         NULL,
1287                                         running,
1288                                         tca[TCA_RATE]);
1289                 if (err) {
1290                         NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
1291                         goto err_out4;
1292                 }
1293         }
1294
1295         qdisc_hash_add(sch, false);
1296         trace_qdisc_create(ops, dev, parent);
1297
1298         return sch;
1299
1300 err_out5:
1301         /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
1302         if (ops->destroy)
1303                 ops->destroy(sch);
1304 err_out3:
1305         dev_put(dev);
1306         qdisc_free(sch);
1307 err_out2:
1308         module_put(ops->owner);
1309 err_out:
1310         *errp = err;
1311         return NULL;
1312
1313 err_out4:
1314         /*
1315          * Any broken qdiscs that would require a ops->reset() here?
1316          * The qdisc was never in action so it shouldn't be necessary.
1317          */
1318         qdisc_put_stab(rtnl_dereference(sch->stab));
1319         if (ops->destroy)
1320                 ops->destroy(sch);
1321         goto err_out3;
1322 }
1323
1324 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
1325                         struct netlink_ext_ack *extack)
1326 {
1327         struct qdisc_size_table *ostab, *stab = NULL;
1328         int err = 0;
1329
1330         if (tca[TCA_OPTIONS]) {
1331                 if (!sch->ops->change) {
1332                         NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
1333                         return -EINVAL;
1334                 }
1335                 if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1336                         NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
1337                         return -EOPNOTSUPP;
1338                 }
1339                 err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
1340                 if (err)
1341                         return err;
1342         }
1343
1344         if (tca[TCA_STAB]) {
1345                 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1346                 if (IS_ERR(stab))
1347                         return PTR_ERR(stab);
1348         }
1349
1350         ostab = rtnl_dereference(sch->stab);
1351         rcu_assign_pointer(sch->stab, stab);
1352         qdisc_put_stab(ostab);
1353
1354         if (tca[TCA_RATE]) {
1355                 /* NB: ignores errors from replace_estimator
1356                    because change can't be undone. */
1357                 if (sch->flags & TCQ_F_MQROOT)
1358                         goto out;
1359                 gen_replace_estimator(&sch->bstats,
1360                                       sch->cpu_bstats,
1361                                       &sch->rate_est,
1362                                       NULL,
1363                                       qdisc_root_sleeping_running(sch),
1364                                       tca[TCA_RATE]);
1365         }
1366 out:
1367         return 0;
1368 }
1369
1370 struct check_loop_arg {
1371         struct qdisc_walker     w;
1372         struct Qdisc            *p;
1373         int                     depth;
1374 };
1375
1376 static int check_loop_fn(struct Qdisc *q, unsigned long cl,
1377                          struct qdisc_walker *w);
1378
1379 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1380 {
1381         struct check_loop_arg   arg;
1382
1383         if (q->ops->cl_ops == NULL)
1384                 return 0;
1385
1386         arg.w.stop = arg.w.skip = arg.w.count = 0;
1387         arg.w.fn = check_loop_fn;
1388         arg.depth = depth;
1389         arg.p = p;
1390         q->ops->cl_ops->walk(q, &arg.w);
1391         return arg.w.stop ? -ELOOP : 0;
1392 }
1393
1394 static int
1395 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1396 {
1397         struct Qdisc *leaf;
1398         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1399         struct check_loop_arg *arg = (struct check_loop_arg *)w;
1400
1401         leaf = cops->leaf(q, cl);
1402         if (leaf) {
1403                 if (leaf == arg->p || arg->depth > 7)
1404                         return -ELOOP;
1405                 return check_loop(leaf, arg->p, arg->depth + 1);
1406         }
1407         return 0;
1408 }
1409
1410 const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
1411         [TCA_KIND]              = { .type = NLA_STRING },
1412         [TCA_RATE]              = { .type = NLA_BINARY,
1413                                     .len = sizeof(struct tc_estimator) },
1414         [TCA_STAB]              = { .type = NLA_NESTED },
1415         [TCA_DUMP_INVISIBLE]    = { .type = NLA_FLAG },
1416         [TCA_CHAIN]             = { .type = NLA_U32 },
1417         [TCA_INGRESS_BLOCK]     = { .type = NLA_U32 },
1418         [TCA_EGRESS_BLOCK]      = { .type = NLA_U32 },
1419 };
1420
1421 /*
1422  * Delete/get qdisc.
1423  */
1424
1425 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1426                         struct netlink_ext_ack *extack)
1427 {
1428         struct net *net = sock_net(skb->sk);
1429         struct tcmsg *tcm = nlmsg_data(n);
1430         struct nlattr *tca[TCA_MAX + 1];
1431         struct net_device *dev;
1432         u32 clid;
1433         struct Qdisc *q = NULL;
1434         struct Qdisc *p = NULL;
1435         int err;
1436
1437         if ((n->nlmsg_type != RTM_GETQDISC) &&
1438             !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1439                 return -EPERM;
1440
1441         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1442                                      rtm_tca_policy, extack);
1443         if (err < 0)
1444                 return err;
1445
1446         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1447         if (!dev)
1448                 return -ENODEV;
1449
1450         clid = tcm->tcm_parent;
1451         if (clid) {
1452                 if (clid != TC_H_ROOT) {
1453                         if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1454                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1455                                 if (!p) {
1456                                         NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
1457                                         return -ENOENT;
1458                                 }
1459                                 q = qdisc_leaf(p, clid);
1460                         } else if (dev_ingress_queue(dev)) {
1461                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1462                         }
1463                 } else {
1464                         q = rtnl_dereference(dev->qdisc);
1465                 }
1466                 if (!q) {
1467                         NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
1468                         return -ENOENT;
1469                 }
1470
1471                 if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
1472                         NL_SET_ERR_MSG(extack, "Invalid handle");
1473                         return -EINVAL;
1474                 }
1475         } else {
1476                 q = qdisc_lookup(dev, tcm->tcm_handle);
1477                 if (!q) {
1478                         NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
1479                         return -ENOENT;
1480                 }
1481         }
1482
1483         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1484                 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1485                 return -EINVAL;
1486         }
1487
1488         if (n->nlmsg_type == RTM_DELQDISC) {
1489                 if (!clid) {
1490                         NL_SET_ERR_MSG(extack, "Classid cannot be zero");
1491                         return -EINVAL;
1492                 }
1493                 if (q->handle == 0) {
1494                         NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
1495                         return -ENOENT;
1496                 }
1497                 err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
1498                 if (err != 0)
1499                         return err;
1500         } else {
1501                 qdisc_notify(net, skb, n, clid, NULL, q);
1502         }
1503         return 0;
1504 }
1505
1506 /*
1507  * Create/change qdisc.
1508  */
1509
1510 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1511                            struct netlink_ext_ack *extack)
1512 {
1513         struct net *net = sock_net(skb->sk);
1514         struct tcmsg *tcm;
1515         struct nlattr *tca[TCA_MAX + 1];
1516         struct net_device *dev;
1517         u32 clid;
1518         struct Qdisc *q, *p;
1519         int err;
1520
1521         if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1522                 return -EPERM;
1523
1524 replay:
1525         /* Reinit, just in case something touches this. */
1526         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1527                                      rtm_tca_policy, extack);
1528         if (err < 0)
1529                 return err;
1530
1531         tcm = nlmsg_data(n);
1532         clid = tcm->tcm_parent;
1533         q = p = NULL;
1534
1535         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1536         if (!dev)
1537                 return -ENODEV;
1538
1539
1540         if (clid) {
1541                 if (clid != TC_H_ROOT) {
1542                         if (clid != TC_H_INGRESS) {
1543                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1544                                 if (!p) {
1545                                         NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
1546                                         return -ENOENT;
1547                                 }
1548                                 q = qdisc_leaf(p, clid);
1549                         } else if (dev_ingress_queue_create(dev)) {
1550                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1551                         }
1552                 } else {
1553                         q = rtnl_dereference(dev->qdisc);
1554                 }
1555
1556                 /* It may be default qdisc, ignore it */
1557                 if (q && q->handle == 0)
1558                         q = NULL;
1559
1560                 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1561                         if (tcm->tcm_handle) {
1562                                 if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
1563                                         NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
1564                                         return -EEXIST;
1565                                 }
1566                                 if (TC_H_MIN(tcm->tcm_handle)) {
1567                                         NL_SET_ERR_MSG(extack, "Invalid minor handle");
1568                                         return -EINVAL;
1569                                 }
1570                                 q = qdisc_lookup(dev, tcm->tcm_handle);
1571                                 if (!q)
1572                                         goto create_n_graft;
1573                                 if (n->nlmsg_flags & NLM_F_EXCL) {
1574                                         NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
1575                                         return -EEXIST;
1576                                 }
1577                                 if (tca[TCA_KIND] &&
1578                                     nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1579                                         NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1580                                         return -EINVAL;
1581                                 }
1582                                 if (q == p ||
1583                                     (p && check_loop(q, p, 0))) {
1584                                         NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
1585                                         return -ELOOP;
1586                                 }
1587                                 qdisc_refcount_inc(q);
1588                                 goto graft;
1589                         } else {
1590                                 if (!q)
1591                                         goto create_n_graft;
1592
1593                                 /* This magic test requires explanation.
1594                                  *
1595                                  *   We know, that some child q is already
1596                                  *   attached to this parent and have choice:
1597                                  *   either to change it or to create/graft new one.
1598                                  *
1599                                  *   1. We are allowed to create/graft only
1600                                  *   if CREATE and REPLACE flags are set.
1601                                  *
1602                                  *   2. If EXCL is set, requestor wanted to say,
1603                                  *   that qdisc tcm_handle is not expected
1604                                  *   to exist, so that we choose create/graft too.
1605                                  *
1606                                  *   3. The last case is when no flags are set.
1607                                  *   Alas, it is sort of hole in API, we
1608                                  *   cannot decide what to do unambiguously.
1609                                  *   For now we select create/graft, if
1610                                  *   user gave KIND, which does not match existing.
1611                                  */
1612                                 if ((n->nlmsg_flags & NLM_F_CREATE) &&
1613                                     (n->nlmsg_flags & NLM_F_REPLACE) &&
1614                                     ((n->nlmsg_flags & NLM_F_EXCL) ||
1615                                      (tca[TCA_KIND] &&
1616                                       nla_strcmp(tca[TCA_KIND], q->ops->id))))
1617                                         goto create_n_graft;
1618                         }
1619                 }
1620         } else {
1621                 if (!tcm->tcm_handle) {
1622                         NL_SET_ERR_MSG(extack, "Handle cannot be zero");
1623                         return -EINVAL;
1624                 }
1625                 q = qdisc_lookup(dev, tcm->tcm_handle);
1626         }
1627
1628         /* Change qdisc parameters */
1629         if (!q) {
1630                 NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1631                 return -ENOENT;
1632         }
1633         if (n->nlmsg_flags & NLM_F_EXCL) {
1634                 NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
1635                 return -EEXIST;
1636         }
1637         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1638                 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1639                 return -EINVAL;
1640         }
1641         err = qdisc_change(q, tca, extack);
1642         if (err == 0)
1643                 qdisc_notify(net, skb, n, clid, NULL, q);
1644         return err;
1645
1646 create_n_graft:
1647         if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1648                 NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
1649                 return -ENOENT;
1650         }
1651         if (clid == TC_H_INGRESS) {
1652                 if (dev_ingress_queue(dev)) {
1653                         q = qdisc_create(dev, dev_ingress_queue(dev), p,
1654                                          tcm->tcm_parent, tcm->tcm_parent,
1655                                          tca, &err, extack);
1656                 } else {
1657                         NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
1658                         err = -ENOENT;
1659                 }
1660         } else {
1661                 struct netdev_queue *dev_queue;
1662
1663                 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1664                         dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1665                 else if (p)
1666                         dev_queue = p->dev_queue;
1667                 else
1668                         dev_queue = netdev_get_tx_queue(dev, 0);
1669
1670                 q = qdisc_create(dev, dev_queue, p,
1671                                  tcm->tcm_parent, tcm->tcm_handle,
1672                                  tca, &err, extack);
1673         }
1674         if (q == NULL) {
1675                 if (err == -EAGAIN)
1676                         goto replay;
1677                 return err;
1678         }
1679
1680 graft:
1681         err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
1682         if (err) {
1683                 if (q)
1684                         qdisc_put(q);
1685                 return err;
1686         }
1687
1688         return 0;
1689 }
1690
1691 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1692                               struct netlink_callback *cb,
1693                               int *q_idx_p, int s_q_idx, bool recur,
1694                               bool dump_invisible)
1695 {
1696         int ret = 0, q_idx = *q_idx_p;
1697         struct Qdisc *q;
1698         int b;
1699
1700         if (!root)
1701                 return 0;
1702
1703         q = root;
1704         if (q_idx < s_q_idx) {
1705                 q_idx++;
1706         } else {
1707                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1708                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1709                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1710                                   RTM_NEWQDISC) <= 0)
1711                         goto done;
1712                 q_idx++;
1713         }
1714
1715         /* If dumping singletons, there is no qdisc_dev(root) and the singleton
1716          * itself has already been dumped.
1717          *
1718          * If we've already dumped the top-level (ingress) qdisc above and the global
1719          * qdisc hashtable, we don't want to hit it again
1720          */
1721         if (!qdisc_dev(root) || !recur)
1722                 goto out;
1723
1724         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1725                 if (q_idx < s_q_idx) {
1726                         q_idx++;
1727                         continue;
1728                 }
1729                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1730                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1731                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1732                                   RTM_NEWQDISC) <= 0)
1733                         goto done;
1734                 q_idx++;
1735         }
1736
1737 out:
1738         *q_idx_p = q_idx;
1739         return ret;
1740 done:
1741         ret = -1;
1742         goto out;
1743 }
1744
1745 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1746 {
1747         struct net *net = sock_net(skb->sk);
1748         int idx, q_idx;
1749         int s_idx, s_q_idx;
1750         struct net_device *dev;
1751         const struct nlmsghdr *nlh = cb->nlh;
1752         struct nlattr *tca[TCA_MAX + 1];
1753         int err;
1754
1755         s_idx = cb->args[0];
1756         s_q_idx = q_idx = cb->args[1];
1757
1758         idx = 0;
1759         ASSERT_RTNL();
1760
1761         err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
1762                                      rtm_tca_policy, cb->extack);
1763         if (err < 0)
1764                 return err;
1765
1766         for_each_netdev(net, dev) {
1767                 struct netdev_queue *dev_queue;
1768
1769                 if (idx < s_idx)
1770                         goto cont;
1771                 if (idx > s_idx)
1772                         s_q_idx = 0;
1773                 q_idx = 0;
1774
1775                 if (tc_dump_qdisc_root(rtnl_dereference(dev->qdisc),
1776                                        skb, cb, &q_idx, s_q_idx,
1777                                        true, tca[TCA_DUMP_INVISIBLE]) < 0)
1778                         goto done;
1779
1780                 dev_queue = dev_ingress_queue(dev);
1781                 if (dev_queue &&
1782                     tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1783                                        &q_idx, s_q_idx, false,
1784                                        tca[TCA_DUMP_INVISIBLE]) < 0)
1785                         goto done;
1786
1787 cont:
1788                 idx++;
1789         }
1790
1791 done:
1792         cb->args[0] = idx;
1793         cb->args[1] = q_idx;
1794
1795         return skb->len;
1796 }
1797
1798
1799
1800 /************************************************
1801  *      Traffic classes manipulation.           *
1802  ************************************************/
1803
1804 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1805                           unsigned long cl,
1806                           u32 portid, u32 seq, u16 flags, int event)
1807 {
1808         struct tcmsg *tcm;
1809         struct nlmsghdr  *nlh;
1810         unsigned char *b = skb_tail_pointer(skb);
1811         struct gnet_dump d;
1812         const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1813
1814         cond_resched();
1815         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1816         if (!nlh)
1817                 goto out_nlmsg_trim;
1818         tcm = nlmsg_data(nlh);
1819         tcm->tcm_family = AF_UNSPEC;
1820         tcm->tcm__pad1 = 0;
1821         tcm->tcm__pad2 = 0;
1822         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1823         tcm->tcm_parent = q->handle;
1824         tcm->tcm_handle = q->handle;
1825         tcm->tcm_info = 0;
1826         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1827                 goto nla_put_failure;
1828         if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1829                 goto nla_put_failure;
1830
1831         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1832                                          NULL, &d, TCA_PAD) < 0)
1833                 goto nla_put_failure;
1834
1835         if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1836                 goto nla_put_failure;
1837
1838         if (gnet_stats_finish_copy(&d) < 0)
1839                 goto nla_put_failure;
1840
1841         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1842         return skb->len;
1843
1844 out_nlmsg_trim:
1845 nla_put_failure:
1846         nlmsg_trim(skb, b);
1847         return -1;
1848 }
1849
1850 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1851                          struct nlmsghdr *n, struct Qdisc *q,
1852                          unsigned long cl, int event)
1853 {
1854         struct sk_buff *skb;
1855         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1856
1857         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1858         if (!skb)
1859                 return -ENOBUFS;
1860
1861         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1862                 kfree_skb(skb);
1863                 return -EINVAL;
1864         }
1865
1866         return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1867                               n->nlmsg_flags & NLM_F_ECHO);
1868 }
1869
1870 static int tclass_del_notify(struct net *net,
1871                              const struct Qdisc_class_ops *cops,
1872                              struct sk_buff *oskb, struct nlmsghdr *n,
1873                              struct Qdisc *q, unsigned long cl,
1874                              struct netlink_ext_ack *extack)
1875 {
1876         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1877         struct sk_buff *skb;
1878         int err = 0;
1879
1880         if (!cops->delete)
1881                 return -EOPNOTSUPP;
1882
1883         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1884         if (!skb)
1885                 return -ENOBUFS;
1886
1887         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
1888                            RTM_DELTCLASS) < 0) {
1889                 kfree_skb(skb);
1890                 return -EINVAL;
1891         }
1892
1893         err = cops->delete(q, cl, extack);
1894         if (err) {
1895                 kfree_skb(skb);
1896                 return err;
1897         }
1898
1899         err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1900                              n->nlmsg_flags & NLM_F_ECHO);
1901         return err;
1902 }
1903
1904 #ifdef CONFIG_NET_CLS
1905
1906 struct tcf_bind_args {
1907         struct tcf_walker w;
1908         unsigned long base;
1909         unsigned long cl;
1910         u32 classid;
1911 };
1912
1913 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
1914 {
1915         struct tcf_bind_args *a = (void *)arg;
1916
1917         if (tp->ops->bind_class) {
1918                 struct Qdisc *q = tcf_block_q(tp->chain->block);
1919
1920                 sch_tree_lock(q);
1921                 tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
1922                 sch_tree_unlock(q);
1923         }
1924         return 0;
1925 }
1926
1927 struct tc_bind_class_args {
1928         struct qdisc_walker w;
1929         unsigned long new_cl;
1930         u32 portid;
1931         u32 clid;
1932 };
1933
1934 static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl,
1935                                 struct qdisc_walker *w)
1936 {
1937         struct tc_bind_class_args *a = (struct tc_bind_class_args *)w;
1938         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1939         struct tcf_block *block;
1940         struct tcf_chain *chain;
1941
1942         block = cops->tcf_block(q, cl, NULL);
1943         if (!block)
1944                 return 0;
1945         for (chain = tcf_get_next_chain(block, NULL);
1946              chain;
1947              chain = tcf_get_next_chain(block, chain)) {
1948                 struct tcf_proto *tp;
1949
1950                 for (tp = tcf_get_next_proto(chain, NULL);
1951                      tp; tp = tcf_get_next_proto(chain, tp)) {
1952                         struct tcf_bind_args arg = {};
1953
1954                         arg.w.fn = tcf_node_bind;
1955                         arg.classid = a->clid;
1956                         arg.base = cl;
1957                         arg.cl = a->new_cl;
1958                         tp->ops->walk(tp, &arg.w, true);
1959                 }
1960         }
1961
1962         return 0;
1963 }
1964
1965 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1966                            unsigned long new_cl)
1967 {
1968         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1969         struct tc_bind_class_args args = {};
1970
1971         if (!cops->tcf_block)
1972                 return;
1973         args.portid = portid;
1974         args.clid = clid;
1975         args.new_cl = new_cl;
1976         args.w.fn = tc_bind_class_walker;
1977         q->ops->cl_ops->walk(q, &args.w);
1978 }
1979
1980 #else
1981
1982 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1983                            unsigned long new_cl)
1984 {
1985 }
1986
1987 #endif
1988
1989 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
1990                          struct netlink_ext_ack *extack)
1991 {
1992         struct net *net = sock_net(skb->sk);
1993         struct tcmsg *tcm = nlmsg_data(n);
1994         struct nlattr *tca[TCA_MAX + 1];
1995         struct net_device *dev;
1996         struct Qdisc *q = NULL;
1997         const struct Qdisc_class_ops *cops;
1998         unsigned long cl = 0;
1999         unsigned long new_cl;
2000         u32 portid;
2001         u32 clid;
2002         u32 qid;
2003         int err;
2004
2005         if ((n->nlmsg_type != RTM_GETTCLASS) &&
2006             !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
2007                 return -EPERM;
2008
2009         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
2010                                      rtm_tca_policy, extack);
2011         if (err < 0)
2012                 return err;
2013
2014         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
2015         if (!dev)
2016                 return -ENODEV;
2017
2018         /*
2019            parent == TC_H_UNSPEC - unspecified parent.
2020            parent == TC_H_ROOT   - class is root, which has no parent.
2021            parent == X:0         - parent is root class.
2022            parent == X:Y         - parent is a node in hierarchy.
2023            parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
2024
2025            handle == 0:0         - generate handle from kernel pool.
2026            handle == 0:Y         - class is X:Y, where X:0 is qdisc.
2027            handle == X:Y         - clear.
2028            handle == X:0         - root class.
2029          */
2030
2031         /* Step 1. Determine qdisc handle X:0 */
2032
2033         portid = tcm->tcm_parent;
2034         clid = tcm->tcm_handle;
2035         qid = TC_H_MAJ(clid);
2036
2037         if (portid != TC_H_ROOT) {
2038                 u32 qid1 = TC_H_MAJ(portid);
2039
2040                 if (qid && qid1) {
2041                         /* If both majors are known, they must be identical. */
2042                         if (qid != qid1)
2043                                 return -EINVAL;
2044                 } else if (qid1) {
2045                         qid = qid1;
2046                 } else if (qid == 0)
2047                         qid = rtnl_dereference(dev->qdisc)->handle;
2048
2049                 /* Now qid is genuine qdisc handle consistent
2050                  * both with parent and child.
2051                  *
2052                  * TC_H_MAJ(portid) still may be unspecified, complete it now.
2053                  */
2054                 if (portid)
2055                         portid = TC_H_MAKE(qid, portid);
2056         } else {
2057                 if (qid == 0)
2058                         qid = rtnl_dereference(dev->qdisc)->handle;
2059         }
2060
2061         /* OK. Locate qdisc */
2062         q = qdisc_lookup(dev, qid);
2063         if (!q)
2064                 return -ENOENT;
2065
2066         /* An check that it supports classes */
2067         cops = q->ops->cl_ops;
2068         if (cops == NULL)
2069                 return -EINVAL;
2070
2071         /* Now try to get class */
2072         if (clid == 0) {
2073                 if (portid == TC_H_ROOT)
2074                         clid = qid;
2075         } else
2076                 clid = TC_H_MAKE(qid, clid);
2077
2078         if (clid)
2079                 cl = cops->find(q, clid);
2080
2081         if (cl == 0) {
2082                 err = -ENOENT;
2083                 if (n->nlmsg_type != RTM_NEWTCLASS ||
2084                     !(n->nlmsg_flags & NLM_F_CREATE))
2085                         goto out;
2086         } else {
2087                 switch (n->nlmsg_type) {
2088                 case RTM_NEWTCLASS:
2089                         err = -EEXIST;
2090                         if (n->nlmsg_flags & NLM_F_EXCL)
2091                                 goto out;
2092                         break;
2093                 case RTM_DELTCLASS:
2094                         err = tclass_del_notify(net, cops, skb, n, q, cl, extack);
2095                         /* Unbind the class with flilters with 0 */
2096                         tc_bind_tclass(q, portid, clid, 0);
2097                         goto out;
2098                 case RTM_GETTCLASS:
2099                         err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
2100                         goto out;
2101                 default:
2102                         err = -EINVAL;
2103                         goto out;
2104                 }
2105         }
2106
2107         if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
2108                 NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
2109                 return -EOPNOTSUPP;
2110         }
2111
2112         new_cl = cl;
2113         err = -EOPNOTSUPP;
2114         if (cops->change)
2115                 err = cops->change(q, clid, portid, tca, &new_cl, extack);
2116         if (err == 0) {
2117                 tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
2118                 /* We just create a new class, need to do reverse binding. */
2119                 if (cl != new_cl)
2120                         tc_bind_tclass(q, portid, clid, new_cl);
2121         }
2122 out:
2123         return err;
2124 }
2125
2126 struct qdisc_dump_args {
2127         struct qdisc_walker     w;
2128         struct sk_buff          *skb;
2129         struct netlink_callback *cb;
2130 };
2131
2132 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
2133                             struct qdisc_walker *arg)
2134 {
2135         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
2136
2137         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2138                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
2139                               RTM_NEWTCLASS);
2140 }
2141
2142 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
2143                                 struct tcmsg *tcm, struct netlink_callback *cb,
2144                                 int *t_p, int s_t)
2145 {
2146         struct qdisc_dump_args arg;
2147
2148         if (tc_qdisc_dump_ignore(q, false) ||
2149             *t_p < s_t || !q->ops->cl_ops ||
2150             (tcm->tcm_parent &&
2151              TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
2152                 (*t_p)++;
2153                 return 0;
2154         }
2155         if (*t_p > s_t)
2156                 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
2157         arg.w.fn = qdisc_class_dump;
2158         arg.skb = skb;
2159         arg.cb = cb;
2160         arg.w.stop  = 0;
2161         arg.w.skip = cb->args[1];
2162         arg.w.count = 0;
2163         q->ops->cl_ops->walk(q, &arg.w);
2164         cb->args[1] = arg.w.count;
2165         if (arg.w.stop)
2166                 return -1;
2167         (*t_p)++;
2168         return 0;
2169 }
2170
2171 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
2172                                struct tcmsg *tcm, struct netlink_callback *cb,
2173                                int *t_p, int s_t, bool recur)
2174 {
2175         struct Qdisc *q;
2176         int b;
2177
2178         if (!root)
2179                 return 0;
2180
2181         if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
2182                 return -1;
2183
2184         if (!qdisc_dev(root) || !recur)
2185                 return 0;
2186
2187         if (tcm->tcm_parent) {
2188                 q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
2189                 if (q && q != root &&
2190                     tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2191                         return -1;
2192                 return 0;
2193         }
2194         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
2195                 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2196                         return -1;
2197         }
2198
2199         return 0;
2200 }
2201
2202 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
2203 {
2204         struct tcmsg *tcm = nlmsg_data(cb->nlh);
2205         struct net *net = sock_net(skb->sk);
2206         struct netdev_queue *dev_queue;
2207         struct net_device *dev;
2208         int t, s_t;
2209
2210         if (nlmsg_len(cb->nlh) < sizeof(*tcm))
2211                 return 0;
2212         dev = dev_get_by_index(net, tcm->tcm_ifindex);
2213         if (!dev)
2214                 return 0;
2215
2216         s_t = cb->args[0];
2217         t = 0;
2218
2219         if (tc_dump_tclass_root(rtnl_dereference(dev->qdisc),
2220                                 skb, tcm, cb, &t, s_t, true) < 0)
2221                 goto done;
2222
2223         dev_queue = dev_ingress_queue(dev);
2224         if (dev_queue &&
2225             tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
2226                                 &t, s_t, false) < 0)
2227                 goto done;
2228
2229 done:
2230         cb->args[0] = t;
2231
2232         dev_put(dev);
2233         return skb->len;
2234 }
2235
2236 #ifdef CONFIG_PROC_FS
2237 static int psched_show(struct seq_file *seq, void *v)
2238 {
2239         seq_printf(seq, "%08x %08x %08x %08x\n",
2240                    (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2241                    1000000,
2242                    (u32)NSEC_PER_SEC / hrtimer_resolution);
2243
2244         return 0;
2245 }
2246
2247 static int __net_init psched_net_init(struct net *net)
2248 {
2249         struct proc_dir_entry *e;
2250
2251         e = proc_create_single("psched", 0, net->proc_net, psched_show);
2252         if (e == NULL)
2253                 return -ENOMEM;
2254
2255         return 0;
2256 }
2257
2258 static void __net_exit psched_net_exit(struct net *net)
2259 {
2260         remove_proc_entry("psched", net->proc_net);
2261 }
2262 #else
2263 static int __net_init psched_net_init(struct net *net)
2264 {
2265         return 0;
2266 }
2267
2268 static void __net_exit psched_net_exit(struct net *net)
2269 {
2270 }
2271 #endif
2272
2273 static struct pernet_operations psched_net_ops = {
2274         .init = psched_net_init,
2275         .exit = psched_net_exit,
2276 };
2277
2278 static int __init pktsched_init(void)
2279 {
2280         int err;
2281
2282         err = register_pernet_subsys(&psched_net_ops);
2283         if (err) {
2284                 pr_err("pktsched_init: "
2285                        "cannot initialize per netns operations\n");
2286                 return err;
2287         }
2288
2289         register_qdisc(&pfifo_fast_ops);
2290         register_qdisc(&pfifo_qdisc_ops);
2291         register_qdisc(&bfifo_qdisc_ops);
2292         register_qdisc(&pfifo_head_drop_qdisc_ops);
2293         register_qdisc(&mq_qdisc_ops);
2294         register_qdisc(&noqueue_qdisc_ops);
2295
2296         rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
2297         rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
2298         rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
2299                       0);
2300         rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
2301         rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
2302         rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
2303                       0);
2304
2305         return 0;
2306 }
2307
2308 subsys_initcall(pktsched_init);