Merge tag 'scsi-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/jejb/scsi
[platform/kernel/linux-starfive.git] / net / sched / sch_api.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_api.c  Packet scheduler API.
4  *
5  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
6  *
7  * Fixes:
8  *
9  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
10  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
11  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
12  */
13
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/string.h>
18 #include <linux/errno.h>
19 #include <linux/skbuff.h>
20 #include <linux/init.h>
21 #include <linux/proc_fs.h>
22 #include <linux/seq_file.h>
23 #include <linux/kmod.h>
24 #include <linux/list.h>
25 #include <linux/hrtimer.h>
26 #include <linux/slab.h>
27 #include <linux/hashtable.h>
28
29 #include <net/net_namespace.h>
30 #include <net/sock.h>
31 #include <net/netlink.h>
32 #include <net/pkt_sched.h>
33 #include <net/pkt_cls.h>
34
35 #include <trace/events/qdisc.h>
36
37 /*
38
39    Short review.
40    -------------
41
42    This file consists of two interrelated parts:
43
44    1. queueing disciplines manager frontend.
45    2. traffic classes manager frontend.
46
47    Generally, queueing discipline ("qdisc") is a black box,
48    which is able to enqueue packets and to dequeue them (when
49    device is ready to send something) in order and at times
50    determined by algorithm hidden in it.
51
   Qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all the packets into "traffic classes",
     using "packet classifiers" (see cls_api.c)
56
   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them, and so on.

   The goal of the routines in this file is to translate
   information supplied by the user in the form of handles
   into a form more intelligible to the kernel, to perform some sanity
   checks and the part of the work that is common to all qdiscs,
   and to provide rtnetlink notifications.
65
66    All real intelligent work is done inside qdisc modules.
67
68
69
70    Every discipline has two major routines: enqueue and dequeue.
71
72    ---dequeue
73
   dequeue usually returns an skb to send. It is allowed to return NULL,
   but that does not mean that the queue is empty; it just means that
   the discipline does not want to send anything this time.
   The queue is really empty only if q->q.qlen == 0.
   For complicated disciplines with multiple queues, q->q is not
   a real packet queue; however, q->q.qlen must still be valid.
80
81    ---enqueue
82
   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code.
   NET_XMIT_DROP        - this packet was dropped
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN          - this packet was probably enqueued, but another one was dropped.
     Expected action: back off or ignore
90
91    Auxiliary routines:
92
93    ---peek
94
95    like dequeue but without removing a packet from the queue
96
97    ---reset
98
99    returns qdisc to initial state: purge all buffers, clear all
100    timers, counters (except for statistics) etc.
101
102    ---init
103
104    initializes newly created qdisc.
105
106    ---destroy
107
108    destroys resources allocated by init and during lifetime of qdisc.
109
110    ---change
111
112    changes qdisc parameters.
113  */
114
115 /* Protects list of registered TC modules. It is pure SMP lock. */
116 static DEFINE_RWLOCK(qdisc_mod_lock);
117
118
119 /************************************************
120  *      Queueing disciplines manipulation.      *
121  ************************************************/
122
123
124 /* The list of all installed queueing disciplines. */
125
126 static struct Qdisc_ops *qdisc_base;
127
128 /* Register/unregister queueing discipline */
129
130 int register_qdisc(struct Qdisc_ops *qops)
131 {
132         struct Qdisc_ops *q, **qp;
133         int rc = -EEXIST;
134
135         write_lock(&qdisc_mod_lock);
136         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
137                 if (!strcmp(qops->id, q->id))
138                         goto out;
139
140         if (qops->enqueue == NULL)
141                 qops->enqueue = noop_qdisc_ops.enqueue;
142         if (qops->peek == NULL) {
143                 if (qops->dequeue == NULL)
144                         qops->peek = noop_qdisc_ops.peek;
145                 else
146                         goto out_einval;
147         }
148         if (qops->dequeue == NULL)
149                 qops->dequeue = noop_qdisc_ops.dequeue;
150
151         if (qops->cl_ops) {
152                 const struct Qdisc_class_ops *cops = qops->cl_ops;
153
154                 if (!(cops->find && cops->walk && cops->leaf))
155                         goto out_einval;
156
157                 if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
158                         goto out_einval;
159         }
160
161         qops->next = NULL;
162         *qp = qops;
163         rc = 0;
164 out:
165         write_unlock(&qdisc_mod_lock);
166         return rc;
167
168 out_einval:
169         rc = -EINVAL;
170         goto out;
171 }
172 EXPORT_SYMBOL(register_qdisc);
173
174 int unregister_qdisc(struct Qdisc_ops *qops)
175 {
176         struct Qdisc_ops *q, **qp;
177         int err = -ENOENT;
178
179         write_lock(&qdisc_mod_lock);
180         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
181                 if (q == qops)
182                         break;
183         if (q) {
184                 *qp = q->next;
185                 q->next = NULL;
186                 err = 0;
187         }
188         write_unlock(&qdisc_mod_lock);
189         return err;
190 }
191 EXPORT_SYMBOL(unregister_qdisc);
192
193 /* Get default qdisc if not otherwise specified */
194 void qdisc_get_default(char *name, size_t len)
195 {
196         read_lock(&qdisc_mod_lock);
197         strlcpy(name, default_qdisc_ops->id, len);
198         read_unlock(&qdisc_mod_lock);
199 }
200
201 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
202 {
203         struct Qdisc_ops *q = NULL;
204
205         for (q = qdisc_base; q; q = q->next) {
206                 if (!strcmp(name, q->id)) {
207                         if (!try_module_get(q->owner))
208                                 q = NULL;
209                         break;
210                 }
211         }
212
213         return q;
214 }
215
216 /* Set new default qdisc to use */
217 int qdisc_set_default(const char *name)
218 {
219         const struct Qdisc_ops *ops;
220
221         if (!capable(CAP_NET_ADMIN))
222                 return -EPERM;
223
224         write_lock(&qdisc_mod_lock);
225         ops = qdisc_lookup_default(name);
226         if (!ops) {
227                 /* Not found, drop lock and try to load module */
228                 write_unlock(&qdisc_mod_lock);
229                 request_module("sch_%s", name);
230                 write_lock(&qdisc_mod_lock);
231
232                 ops = qdisc_lookup_default(name);
233         }
234
235         if (ops) {
236                 /* Set new default */
237                 module_put(default_qdisc_ops->owner);
238                 default_qdisc_ops = ops;
239         }
240         write_unlock(&qdisc_mod_lock);
241
242         return ops ? 0 : -ENOENT;
243 }
244
#ifdef CONFIG_NET_SCH_DEFAULT
/* Late initcall: install the default qdisc named by the kernel config. */
static int __init sch_default_qdisc(void)
{
        int err;

        err = qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
        return err;
}
late_initcall(sch_default_qdisc);
#endif
253
254 /* We know handle. Find qdisc among all qdisc's attached to device
255  * (root qdisc, all its children, children of children etc.)
256  * Note: caller either uses rtnl or rcu_read_lock()
257  */
258
259 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
260 {
261         struct Qdisc *q;
262
263         if (!qdisc_dev(root))
264                 return (root->handle == handle ? root : NULL);
265
266         if (!(root->flags & TCQ_F_BUILTIN) &&
267             root->handle == handle)
268                 return root;
269
270         hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle,
271                                    lockdep_rtnl_is_held()) {
272                 if (q->handle == handle)
273                         return q;
274         }
275         return NULL;
276 }
277
278 void qdisc_hash_add(struct Qdisc *q, bool invisible)
279 {
280         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
281                 ASSERT_RTNL();
282                 hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
283                 if (invisible)
284                         q->flags |= TCQ_F_INVISIBLE;
285         }
286 }
287 EXPORT_SYMBOL(qdisc_hash_add);
288
289 void qdisc_hash_del(struct Qdisc *q)
290 {
291         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
292                 ASSERT_RTNL();
293                 hash_del_rcu(&q->hash);
294         }
295 }
296 EXPORT_SYMBOL(qdisc_hash_del);
297
298 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
299 {
300         struct Qdisc *q;
301
302         if (!handle)
303                 return NULL;
304         q = qdisc_match_from_root(rtnl_dereference(dev->qdisc), handle);
305         if (q)
306                 goto out;
307
308         if (dev_ingress_queue(dev))
309                 q = qdisc_match_from_root(
310                         dev_ingress_queue(dev)->qdisc_sleeping,
311                         handle);
312 out:
313         return q;
314 }
315
316 struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
317 {
318         struct netdev_queue *nq;
319         struct Qdisc *q;
320
321         if (!handle)
322                 return NULL;
323         q = qdisc_match_from_root(rcu_dereference(dev->qdisc), handle);
324         if (q)
325                 goto out;
326
327         nq = dev_ingress_queue_rcu(dev);
328         if (nq)
329                 q = qdisc_match_from_root(nq->qdisc_sleeping, handle);
330 out:
331         return q;
332 }
333
334 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
335 {
336         unsigned long cl;
337         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
338
339         if (cops == NULL)
340                 return NULL;
341         cl = cops->find(p, classid);
342
343         if (cl == 0)
344                 return NULL;
345         return cops->leaf(p, cl);
346 }
347
348 /* Find queueing discipline by name */
349
350 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
351 {
352         struct Qdisc_ops *q = NULL;
353
354         if (kind) {
355                 read_lock(&qdisc_mod_lock);
356                 for (q = qdisc_base; q; q = q->next) {
357                         if (nla_strcmp(kind, q->id) == 0) {
358                                 if (!try_module_get(q->owner))
359                                         q = NULL;
360                                 break;
361                         }
362                 }
363                 read_unlock(&qdisc_mod_lock);
364         }
365         return q;
366 }
367
368 /* The linklayer setting were not transferred from iproute2, in older
369  * versions, and the rate tables lookup systems have been dropped in
370  * the kernel. To keep backward compatible with older iproute2 tc
371  * utils, we detect the linklayer setting by detecting if the rate
372  * table were modified.
373  *
374  * For linklayer ATM table entries, the rate table will be aligned to
375  * 48 bytes, thus some table entries will contain the same value.  The
376  * mpu (min packet unit) is also encoded into the old rate table, thus
377  * starting from the mpu, we find low and high table entries for
378  * mapping this cell.  If these entries contain the same value, when
379  * the rate tables have been modified for linklayer ATM.
380  *
381  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
382  * and then roundup to the next cell, calc the table entry one below,
383  * and compare.
384  */
385 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
386 {
387         int low       = roundup(r->mpu, 48);
388         int high      = roundup(low+1, 48);
389         int cell_low  = low >> r->cell_log;
390         int cell_high = (high >> r->cell_log) - 1;
391
392         /* rtab is too inaccurate at rates > 100Mbit/s */
393         if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
394                 pr_debug("TC linklayer: Giving up ATM detection\n");
395                 return TC_LINKLAYER_ETHERNET;
396         }
397
398         if ((cell_high > cell_low) && (cell_high < 256)
399             && (rtab[cell_low] == rtab[cell_high])) {
400                 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
401                          cell_low, cell_high, rtab[cell_high]);
402                 return TC_LINKLAYER_ATM;
403         }
404         return TC_LINKLAYER_ETHERNET;
405 }
406
407 static struct qdisc_rate_table *qdisc_rtab_list;
408
409 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
410                                         struct nlattr *tab,
411                                         struct netlink_ext_ack *extack)
412 {
413         struct qdisc_rate_table *rtab;
414
415         if (tab == NULL || r->rate == 0 ||
416             r->cell_log == 0 || r->cell_log >= 32 ||
417             nla_len(tab) != TC_RTAB_SIZE) {
418                 NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
419                 return NULL;
420         }
421
422         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
423                 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
424                     !memcmp(&rtab->data, nla_data(tab), 1024)) {
425                         rtab->refcnt++;
426                         return rtab;
427                 }
428         }
429
430         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
431         if (rtab) {
432                 rtab->rate = *r;
433                 rtab->refcnt = 1;
434                 memcpy(rtab->data, nla_data(tab), 1024);
435                 if (r->linklayer == TC_LINKLAYER_UNAWARE)
436                         r->linklayer = __detect_linklayer(r, rtab->data);
437                 rtab->next = qdisc_rtab_list;
438                 qdisc_rtab_list = rtab;
439         } else {
440                 NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
441         }
442         return rtab;
443 }
444 EXPORT_SYMBOL(qdisc_get_rtab);
445
446 void qdisc_put_rtab(struct qdisc_rate_table *tab)
447 {
448         struct qdisc_rate_table *rtab, **rtabp;
449
450         if (!tab || --tab->refcnt)
451                 return;
452
453         for (rtabp = &qdisc_rtab_list;
454              (rtab = *rtabp) != NULL;
455              rtabp = &rtab->next) {
456                 if (rtab == tab) {
457                         *rtabp = rtab->next;
458                         kfree(rtab);
459                         return;
460                 }
461         }
462 }
463 EXPORT_SYMBOL(qdisc_put_rtab);
464
465 static LIST_HEAD(qdisc_stab_list);
466
467 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
468         [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
469         [TCA_STAB_DATA] = { .type = NLA_BINARY },
470 };
471
472 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
473                                                struct netlink_ext_ack *extack)
474 {
475         struct nlattr *tb[TCA_STAB_MAX + 1];
476         struct qdisc_size_table *stab;
477         struct tc_sizespec *s;
478         unsigned int tsize = 0;
479         u16 *tab = NULL;
480         int err;
481
482         err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
483                                           extack);
484         if (err < 0)
485                 return ERR_PTR(err);
486         if (!tb[TCA_STAB_BASE]) {
487                 NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
488                 return ERR_PTR(-EINVAL);
489         }
490
491         s = nla_data(tb[TCA_STAB_BASE]);
492
493         if (s->tsize > 0) {
494                 if (!tb[TCA_STAB_DATA]) {
495                         NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
496                         return ERR_PTR(-EINVAL);
497                 }
498                 tab = nla_data(tb[TCA_STAB_DATA]);
499                 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
500         }
501
502         if (tsize != s->tsize || (!tab && tsize > 0)) {
503                 NL_SET_ERR_MSG(extack, "Invalid size of size table");
504                 return ERR_PTR(-EINVAL);
505         }
506
507         list_for_each_entry(stab, &qdisc_stab_list, list) {
508                 if (memcmp(&stab->szopts, s, sizeof(*s)))
509                         continue;
510                 if (tsize > 0 &&
511                     memcmp(stab->data, tab, flex_array_size(stab, data, tsize)))
512                         continue;
513                 stab->refcnt++;
514                 return stab;
515         }
516
517         if (s->size_log > STAB_SIZE_LOG_MAX ||
518             s->cell_log > STAB_SIZE_LOG_MAX) {
519                 NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table");
520                 return ERR_PTR(-EINVAL);
521         }
522
523         stab = kmalloc(struct_size(stab, data, tsize), GFP_KERNEL);
524         if (!stab)
525                 return ERR_PTR(-ENOMEM);
526
527         stab->refcnt = 1;
528         stab->szopts = *s;
529         if (tsize > 0)
530                 memcpy(stab->data, tab, flex_array_size(stab, data, tsize));
531
532         list_add_tail(&stab->list, &qdisc_stab_list);
533
534         return stab;
535 }
536
537 void qdisc_put_stab(struct qdisc_size_table *tab)
538 {
539         if (!tab)
540                 return;
541
542         if (--tab->refcnt == 0) {
543                 list_del(&tab->list);
544                 kfree_rcu(tab, rcu);
545         }
546 }
547 EXPORT_SYMBOL(qdisc_put_stab);
548
549 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
550 {
551         struct nlattr *nest;
552
553         nest = nla_nest_start_noflag(skb, TCA_STAB);
554         if (nest == NULL)
555                 goto nla_put_failure;
556         if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
557                 goto nla_put_failure;
558         nla_nest_end(skb, nest);
559
560         return skb->len;
561
562 nla_put_failure:
563         return -1;
564 }
565
566 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
567                                const struct qdisc_size_table *stab)
568 {
569         int pkt_len, slot;
570
571         pkt_len = skb->len + stab->szopts.overhead;
572         if (unlikely(!stab->szopts.tsize))
573                 goto out;
574
575         slot = pkt_len + stab->szopts.cell_align;
576         if (unlikely(slot < 0))
577                 slot = 0;
578
579         slot >>= stab->szopts.cell_log;
580         if (likely(slot < stab->szopts.tsize))
581                 pkt_len = stab->data[slot];
582         else
583                 pkt_len = stab->data[stab->szopts.tsize - 1] *
584                                 (slot / stab->szopts.tsize) +
585                                 stab->data[slot % stab->szopts.tsize];
586
587         pkt_len <<= stab->szopts.size_log;
588 out:
589         if (unlikely(pkt_len < 1))
590                 pkt_len = 1;
591         qdisc_skb_cb(skb)->pkt_len = pkt_len;
592 }
593 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
594
595 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
596 {
597         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
598                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
599                         txt, qdisc->ops->id, qdisc->handle >> 16);
600                 qdisc->flags |= TCQ_F_WARN_NONWC;
601         }
602 }
603 EXPORT_SYMBOL(qdisc_warn_nonwc);
604
605 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
606 {
607         struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
608                                                  timer);
609
610         rcu_read_lock();
611         __netif_schedule(qdisc_root(wd->qdisc));
612         rcu_read_unlock();
613
614         return HRTIMER_NORESTART;
615 }
616
617 void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
618                                  clockid_t clockid)
619 {
620         hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
621         wd->timer.function = qdisc_watchdog;
622         wd->qdisc = qdisc;
623 }
624 EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
625
626 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
627 {
628         qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
629 }
630 EXPORT_SYMBOL(qdisc_watchdog_init);
631
632 void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires,
633                                       u64 delta_ns)
634 {
635         if (test_bit(__QDISC_STATE_DEACTIVATED,
636                      &qdisc_root_sleeping(wd->qdisc)->state))
637                 return;
638
639         if (hrtimer_is_queued(&wd->timer)) {
640                 /* If timer is already set in [expires, expires + delta_ns],
641                  * do not reprogram it.
642                  */
643                 if (wd->last_expires - expires <= delta_ns)
644                         return;
645         }
646
647         wd->last_expires = expires;
648         hrtimer_start_range_ns(&wd->timer,
649                                ns_to_ktime(expires),
650                                delta_ns,
651                                HRTIMER_MODE_ABS_PINNED);
652 }
653 EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns);
654
655 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
656 {
657         hrtimer_cancel(&wd->timer);
658 }
659 EXPORT_SYMBOL(qdisc_watchdog_cancel);
660
661 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
662 {
663         struct hlist_head *h;
664         unsigned int i;
665
666         h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
667
668         if (h != NULL) {
669                 for (i = 0; i < n; i++)
670                         INIT_HLIST_HEAD(&h[i]);
671         }
672         return h;
673 }
674
675 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
676 {
677         struct Qdisc_class_common *cl;
678         struct hlist_node *next;
679         struct hlist_head *nhash, *ohash;
680         unsigned int nsize, nmask, osize;
681         unsigned int i, h;
682
683         /* Rehash when load factor exceeds 0.75 */
684         if (clhash->hashelems * 4 <= clhash->hashsize * 3)
685                 return;
686         nsize = clhash->hashsize * 2;
687         nmask = nsize - 1;
688         nhash = qdisc_class_hash_alloc(nsize);
689         if (nhash == NULL)
690                 return;
691
692         ohash = clhash->hash;
693         osize = clhash->hashsize;
694
695         sch_tree_lock(sch);
696         for (i = 0; i < osize; i++) {
697                 hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
698                         h = qdisc_class_hash(cl->classid, nmask);
699                         hlist_add_head(&cl->hnode, &nhash[h]);
700                 }
701         }
702         clhash->hash     = nhash;
703         clhash->hashsize = nsize;
704         clhash->hashmask = nmask;
705         sch_tree_unlock(sch);
706
707         kvfree(ohash);
708 }
709 EXPORT_SYMBOL(qdisc_class_hash_grow);
710
711 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
712 {
713         unsigned int size = 4;
714
715         clhash->hash = qdisc_class_hash_alloc(size);
716         if (!clhash->hash)
717                 return -ENOMEM;
718         clhash->hashsize  = size;
719         clhash->hashmask  = size - 1;
720         clhash->hashelems = 0;
721         return 0;
722 }
723 EXPORT_SYMBOL(qdisc_class_hash_init);
724
725 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
726 {
727         kvfree(clhash->hash);
728 }
729 EXPORT_SYMBOL(qdisc_class_hash_destroy);
730
731 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
732                              struct Qdisc_class_common *cl)
733 {
734         unsigned int h;
735
736         INIT_HLIST_NODE(&cl->hnode);
737         h = qdisc_class_hash(cl->classid, clhash->hashmask);
738         hlist_add_head(&cl->hnode, &clhash->hash[h]);
739         clhash->hashelems++;
740 }
741 EXPORT_SYMBOL(qdisc_class_hash_insert);
742
743 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
744                              struct Qdisc_class_common *cl)
745 {
746         hlist_del(&cl->hnode);
747         clhash->hashelems--;
748 }
749 EXPORT_SYMBOL(qdisc_class_hash_remove);
750
751 /* Allocate an unique handle from space managed by kernel
752  * Possible range is [8000-FFFF]:0000 (0x8000 values)
753  */
754 static u32 qdisc_alloc_handle(struct net_device *dev)
755 {
756         int i = 0x8000;
757         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
758
759         do {
760                 autohandle += TC_H_MAKE(0x10000U, 0);
761                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
762                         autohandle = TC_H_MAKE(0x80000000U, 0);
763                 if (!qdisc_lookup(dev, autohandle))
764                         return autohandle;
765                 cond_resched();
766         } while (--i > 0);
767
768         return 0;
769 }
770
771 void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
772 {
773         bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
774         const struct Qdisc_class_ops *cops;
775         unsigned long cl;
776         u32 parentid;
777         bool notify;
778         int drops;
779
780         if (n == 0 && len == 0)
781                 return;
782         drops = max_t(int, n, 0);
783         rcu_read_lock();
784         while ((parentid = sch->parent)) {
785                 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
786                         break;
787
788                 if (sch->flags & TCQ_F_NOPARENT)
789                         break;
790                 /* Notify parent qdisc only if child qdisc becomes empty.
791                  *
792                  * If child was empty even before update then backlog
793                  * counter is screwed and we skip notification because
794                  * parent class is already passive.
795                  *
796                  * If the original child was offloaded then it is allowed
797                  * to be seem as empty, so the parent is notified anyway.
798                  */
799                 notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
800                                                        !qdisc_is_offloaded);
801                 /* TODO: perform the search on a per txq basis */
802                 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
803                 if (sch == NULL) {
804                         WARN_ON_ONCE(parentid != TC_H_ROOT);
805                         break;
806                 }
807                 cops = sch->ops->cl_ops;
808                 if (notify && cops->qlen_notify) {
809                         cl = cops->find(sch, parentid);
810                         cops->qlen_notify(sch, cl);
811                 }
812                 sch->q.qlen -= n;
813                 sch->qstats.backlog -= len;
814                 __qdisc_qstats_drop(sch, drops);
815         }
816         rcu_read_unlock();
817 }
818 EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
819
820 int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
821                               void *type_data)
822 {
823         struct net_device *dev = qdisc_dev(sch);
824         int err;
825
826         sch->flags &= ~TCQ_F_OFFLOADED;
827         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
828                 return 0;
829
830         err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
831         if (err == -EOPNOTSUPP)
832                 return 0;
833
834         if (!err)
835                 sch->flags |= TCQ_F_OFFLOADED;
836
837         return err;
838 }
839 EXPORT_SYMBOL(qdisc_offload_dump_helper);
840
841 void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
842                                 struct Qdisc *new, struct Qdisc *old,
843                                 enum tc_setup_type type, void *type_data,
844                                 struct netlink_ext_ack *extack)
845 {
846         bool any_qdisc_is_offloaded;
847         int err;
848
849         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
850                 return;
851
852         err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
853
854         /* Don't report error if the graft is part of destroy operation. */
855         if (!err || !new || new == &noop_qdisc)
856                 return;
857
858         /* Don't report error if the parent, the old child and the new
859          * one are not offloaded.
860          */
861         any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
862         any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
863         any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;
864
865         if (any_qdisc_is_offloaded)
866                 NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
867 }
868 EXPORT_SYMBOL(qdisc_offload_graft_helper);
869
870 static void qdisc_offload_graft_root(struct net_device *dev,
871                                      struct Qdisc *new, struct Qdisc *old,
872                                      struct netlink_ext_ack *extack)
873 {
874         struct tc_root_qopt_offload graft_offload = {
875                 .command        = TC_ROOT_GRAFT,
876                 .handle         = new ? new->handle : 0,
877                 .ingress        = (new && new->flags & TCQ_F_INGRESS) ||
878                                   (old && old->flags & TCQ_F_INGRESS),
879         };
880
881         qdisc_offload_graft_helper(dev, NULL, new, old,
882                                    TC_SETUP_ROOT_QDISC, &graft_offload, extack);
883 }
884
/* Serialize one qdisc into a netlink message appended to "skb".
 *
 * The message starts with a struct tcmsg header (ifindex of the qdisc's
 * device, "clid" as parent, the qdisc handle, and the current refcount in
 * tcm_info).  It is followed by TCA_KIND, the optional ingress/egress block
 * indexes, whatever q->ops->dump() emits, TCA_HW_OFFLOAD, the optional size
 * table and finally the statistics.
 *
 * "portid", "seq", "flags" and "event" are passed straight to nlmsg_put().
 *
 * Returns skb->len on success.  On any failure the skb is trimmed back to
 * the tail position it had on entry and -1 is returned.
 */
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
                         u32 portid, u32 seq, u16 flags, int event)
{
        struct gnet_stats_basic_sync __percpu *cpu_bstats = NULL;
        struct gnet_stats_queue __percpu *cpu_qstats = NULL;
        struct tcmsg *tcm;
        struct nlmsghdr  *nlh;
        /* Tail on entry: used both to trim on error and to compute the length. */
        unsigned char *b = skb_tail_pointer(skb);
        struct gnet_dump d;
        struct qdisc_size_table *stab;
        u32 block_index;
        __u32 qlen;

        cond_resched();
        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
        if (!nlh)
                goto out_nlmsg_trim;
        tcm = nlmsg_data(nlh);
        tcm->tcm_family = AF_UNSPEC;
        tcm->tcm__pad1 = 0;
        tcm->tcm__pad2 = 0;
        tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
        tcm->tcm_parent = clid;
        tcm->tcm_handle = q->handle;
        tcm->tcm_info = refcount_read(&q->refcnt);
        if (nla_put_string(skb, TCA_KIND, q->ops->id))
                goto nla_put_failure;
        /* Block indexes are only reported when the qdisc exposes a getter
         * and the returned index is non-zero.
         */
        if (q->ops->ingress_block_get) {
                block_index = q->ops->ingress_block_get(q);
                if (block_index &&
                    nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
                        goto nla_put_failure;
        }
        if (q->ops->egress_block_get) {
                block_index = q->ops->egress_block_get(q);
                if (block_index &&
                    nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
                        goto nla_put_failure;
        }
        /* Qdisc-specific attributes; a negative return aborts the message. */
        if (q->ops->dump && q->ops->dump(q, skb) < 0)
                goto nla_put_failure;
        if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
                goto nla_put_failure;
        qlen = qdisc_qlen_sum(q);

        stab = rtnl_dereference(q->stab);
        if (stab && qdisc_dump_stab(skb, stab) < 0)
                goto nla_put_failure;

        if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
                                         NULL, &d, TCA_PAD) < 0)
                goto nla_put_failure;

        if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
                goto nla_put_failure;

        /* Per-CPU counters are passed on only for qdiscs that use them;
         * otherwise the copy helpers receive NULL for them.
         */
        if (qdisc_is_percpu_stats(q)) {
                cpu_bstats = q->cpu_bstats;
                cpu_qstats = q->cpu_qstats;
        }

        if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats, true) < 0 ||
            gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
            gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
                goto nla_put_failure;

        if (gnet_stats_finish_copy(&d) < 0)
                goto nla_put_failure;

        /* The message length covers everything appended since entry. */
        nlh->nlmsg_len = skb_tail_pointer(skb) - b;
        return skb->len;

out_nlmsg_trim:
nla_put_failure:
        /* Drop the partially built message so the skb is left as it was. */
        nlmsg_trim(skb, b);
        return -1;
}
962
963 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
964 {
965         if (q->flags & TCQ_F_BUILTIN)
966                 return true;
967         if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
968                 return true;
969
970         return false;
971 }
972
973 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
974                         struct nlmsghdr *n, u32 clid,
975                         struct Qdisc *old, struct Qdisc *new)
976 {
977         struct sk_buff *skb;
978         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
979
980         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
981         if (!skb)
982                 return -ENOBUFS;
983
984         if (old && !tc_qdisc_dump_ignore(old, false)) {
985                 if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
986                                   0, RTM_DELQDISC) < 0)
987                         goto err_out;
988         }
989         if (new && !tc_qdisc_dump_ignore(new, false)) {
990                 if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
991                                   old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
992                         goto err_out;
993         }
994
995         if (skb->len)
996                 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
997                                       n->nlmsg_flags & NLM_F_ECHO);
998
999 err_out:
1000         kfree_skb(skb);
1001         return -EINVAL;
1002 }
1003
1004 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
1005                                struct nlmsghdr *n, u32 clid,
1006                                struct Qdisc *old, struct Qdisc *new)
1007 {
1008         if (new || old)
1009                 qdisc_notify(net, skb, n, clid, old, new);
1010
1011         if (old)
1012                 qdisc_put(old);
1013 }
1014
1015 static void qdisc_clear_nolock(struct Qdisc *sch)
1016 {
1017         sch->flags &= ~TCQ_F_NOLOCK;
1018         if (!(sch->flags & TCQ_F_CPUSTATS))
1019                 return;
1020
1021         free_percpu(sch->cpu_bstats);
1022         free_percpu(sch->cpu_qstats);
1023         sch->cpu_bstats = NULL;
1024         sch->cpu_qstats = NULL;
1025         sch->flags &= ~TCQ_F_CPUSTATS;
1026 }
1027
1028 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
1029  * to device "dev".
1030  *
1031  * When appropriate send a netlink notification using 'skb'
1032  * and "n".
1033  *
1034  * On success, destroy old qdisc.
1035  */
1036
1037 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
1038                        struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
1039                        struct Qdisc *new, struct Qdisc *old,
1040                        struct netlink_ext_ack *extack)
1041 {
1042         struct Qdisc *q = old;
1043         struct net *net = dev_net(dev);
1044
1045         if (parent == NULL) {
1046                 unsigned int i, num_q, ingress;
1047
1048                 ingress = 0;
1049                 num_q = dev->num_tx_queues;
1050                 if ((q && q->flags & TCQ_F_INGRESS) ||
1051                     (new && new->flags & TCQ_F_INGRESS)) {
1052                         num_q = 1;
1053                         ingress = 1;
1054                         if (!dev_ingress_queue(dev)) {
1055                                 NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
1056                                 return -ENOENT;
1057                         }
1058                 }
1059
1060                 if (dev->flags & IFF_UP)
1061                         dev_deactivate(dev);
1062
1063                 qdisc_offload_graft_root(dev, new, old, extack);
1064
1065                 if (new && new->ops->attach && !ingress)
1066                         goto skip;
1067
1068                 for (i = 0; i < num_q; i++) {
1069                         struct netdev_queue *dev_queue = dev_ingress_queue(dev);
1070
1071                         if (!ingress)
1072                                 dev_queue = netdev_get_tx_queue(dev, i);
1073
1074                         old = dev_graft_qdisc(dev_queue, new);
1075                         if (new && i > 0)
1076                                 qdisc_refcount_inc(new);
1077
1078                         if (!ingress)
1079                                 qdisc_put(old);
1080                 }
1081
1082 skip:
1083                 if (!ingress) {
1084                         notify_and_destroy(net, skb, n, classid,
1085                                            rtnl_dereference(dev->qdisc), new);
1086                         if (new && !new->ops->attach)
1087                                 qdisc_refcount_inc(new);
1088                         rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc);
1089
1090                         if (new && new->ops->attach)
1091                                 new->ops->attach(new);
1092                 } else {
1093                         notify_and_destroy(net, skb, n, classid, old, new);
1094                 }
1095
1096                 if (dev->flags & IFF_UP)
1097                         dev_activate(dev);
1098         } else {
1099                 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
1100                 unsigned long cl;
1101                 int err;
1102
1103                 /* Only support running class lockless if parent is lockless */
1104                 if (new && (new->flags & TCQ_F_NOLOCK) && !(parent->flags & TCQ_F_NOLOCK))
1105                         qdisc_clear_nolock(new);
1106
1107                 if (!cops || !cops->graft)
1108                         return -EOPNOTSUPP;
1109
1110                 cl = cops->find(parent, classid);
1111                 if (!cl) {
1112                         NL_SET_ERR_MSG(extack, "Specified class not found");
1113                         return -ENOENT;
1114                 }
1115
1116                 err = cops->graft(parent, cl, new, &old, extack);
1117                 if (err)
1118                         return err;
1119                 notify_and_destroy(net, skb, n, classid, old, new);
1120         }
1121         return 0;
1122 }
1123
1124 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1125                                    struct netlink_ext_ack *extack)
1126 {
1127         u32 block_index;
1128
1129         if (tca[TCA_INGRESS_BLOCK]) {
1130                 block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1131
1132                 if (!block_index) {
1133                         NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1134                         return -EINVAL;
1135                 }
1136                 if (!sch->ops->ingress_block_set) {
1137                         NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1138                         return -EOPNOTSUPP;
1139                 }
1140                 sch->ops->ingress_block_set(sch, block_index);
1141         }
1142         if (tca[TCA_EGRESS_BLOCK]) {
1143                 block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1144
1145                 if (!block_index) {
1146                         NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1147                         return -EINVAL;
1148                 }
1149                 if (!sch->ops->egress_block_set) {
1150                         NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1151                         return -EOPNOTSUPP;
1152                 }
1153                 sch->ops->egress_block_set(sch, block_index);
1154         }
1155         return 0;
1156 }
1157
/*
   Allocate and initialize new qdisc.

   The qdisc kind is taken from tca[TCA_KIND] and its options from
   tca[TCA_OPTIONS]; TCA_STAB, TCA_RATE and the block index attributes
   are applied when present.

   Returns the new qdisc on success.  On failure returns NULL and stores
   a negative errno in *errp; -EAGAIN means a module was loaded and the
   caller has to replay the request.
 */

static struct Qdisc *qdisc_create(struct net_device *dev,
                                  struct netdev_queue *dev_queue,
                                  struct Qdisc *p, u32 parent, u32 handle,
                                  struct nlattr **tca, int *errp,
                                  struct netlink_ext_ack *extack)
{
        int err;
        struct nlattr *kind = tca[TCA_KIND];
        struct Qdisc *sch;
        struct Qdisc_ops *ops;
        struct qdisc_size_table *stab;

        ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
        if (ops == NULL && kind != NULL) {
                char name[IFNAMSIZ];
                if (nla_strscpy(name, kind, IFNAMSIZ) >= 0) {
                        /* We dropped the RTNL semaphore in order to
                         * perform the module load.  So, even if we
                         * succeeded in loading the module we have to
                         * tell the caller to replay the request.  We
                         * indicate this using -EAGAIN.
                         * We replay the request because the device may
                         * go away in the mean time.
                         */
                        rtnl_unlock();
                        request_module("sch_%s", name);
                        rtnl_lock();
                        ops = qdisc_lookup_ops(kind);
                        if (ops != NULL) {
                                /* We will try again qdisc_lookup_ops,
                                 * so don't keep a reference.
                                 */
                                module_put(ops->owner);
                                err = -EAGAIN;
                                goto err_out;
                        }
                }
        }
#endif

        err = -ENOENT;
        if (!ops) {
                NL_SET_ERR_MSG(extack, "Specified qdisc kind is unknown");
                goto err_out;
        }

        sch = qdisc_alloc(dev_queue, ops, extack);
        if (IS_ERR(sch)) {
                err = PTR_ERR(sch);
                goto err_out2;
        }

        sch->parent = parent;

        if (handle == TC_H_INGRESS) {
                sch->flags |= TCQ_F_INGRESS;
                handle = TC_H_MAKE(TC_H_INGRESS, 0);
        } else {
                /* No handle requested: pick a free one for this device. */
                if (handle == 0) {
                        handle = qdisc_alloc_handle(dev);
                        if (handle == 0) {
                                NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
                                err = -ENOSPC;
                                goto err_out3;
                        }
                }
                if (!netif_is_multiqueue(dev))
                        sch->flags |= TCQ_F_ONETXQUEUE;
        }

        sch->handle = handle;

        /* This exists to stay backward compatible with a userspace
         * loophole, which allowed userspace to get the IFF_NO_QUEUE
         * facility on older kernels by setting tx_queue_len=0 (prior
         * to qdisc init), and then forgetting to reinit tx_queue_len
         * before attaching a qdisc again.
         */
        if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
                dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
                netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
        }

        err = qdisc_block_indexes_set(sch, tca, extack);
        if (err)
                goto err_out3;

        if (ops->init) {
                err = ops->init(sch, tca[TCA_OPTIONS], extack);
                if (err != 0)
                        goto err_out5;
        }

        if (tca[TCA_STAB]) {
                stab = qdisc_get_stab(tca[TCA_STAB], extack);
                if (IS_ERR(stab)) {
                        err = PTR_ERR(stab);
                        goto err_out4;
                }
                rcu_assign_pointer(sch->stab, stab);
        }
        if (tca[TCA_RATE]) {
                /* Rate estimators are refused on TCQ_F_MQROOT qdiscs. */
                err = -EOPNOTSUPP;
                if (sch->flags & TCQ_F_MQROOT) {
                        NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
                        goto err_out4;
                }

                err = gen_new_estimator(&sch->bstats,
                                        sch->cpu_bstats,
                                        &sch->rate_est,
                                        NULL,
                                        true,
                                        tca[TCA_RATE]);
                if (err) {
                        NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
                        goto err_out4;
                }
        }

        qdisc_hash_add(sch, false);
        trace_qdisc_create(ops, dev, parent);

        return sch;

        /* Error unwinding.  The labels fall through from top to bottom;
         * err_out4 sits after the return and jumps back to err_out3.
         */
err_out5:
        /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
        if (ops->destroy)
                ops->destroy(sch);
err_out3:
        /* Release the device reference and the qdisc memory. */
        dev_put_track(dev, &sch->dev_tracker);
        qdisc_free(sch);
err_out2:
        /* Drop the module reference held for "ops". */
        module_put(ops->owner);
err_out:
        *errp = err;
        return NULL;

err_out4:
        /*
         * Any broken qdiscs that would require a ops->reset() here?
         * The qdisc was never in action so it shouldn't be necessary.
         */
        qdisc_put_stab(rtnl_dereference(sch->stab));
        if (ops->destroy)
                ops->destroy(sch);
        goto err_out3;
}
1313
1314 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
1315                         struct netlink_ext_ack *extack)
1316 {
1317         struct qdisc_size_table *ostab, *stab = NULL;
1318         int err = 0;
1319
1320         if (tca[TCA_OPTIONS]) {
1321                 if (!sch->ops->change) {
1322                         NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
1323                         return -EINVAL;
1324                 }
1325                 if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1326                         NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
1327                         return -EOPNOTSUPP;
1328                 }
1329                 err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
1330                 if (err)
1331                         return err;
1332         }
1333
1334         if (tca[TCA_STAB]) {
1335                 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1336                 if (IS_ERR(stab))
1337                         return PTR_ERR(stab);
1338         }
1339
1340         ostab = rtnl_dereference(sch->stab);
1341         rcu_assign_pointer(sch->stab, stab);
1342         qdisc_put_stab(ostab);
1343
1344         if (tca[TCA_RATE]) {
1345                 /* NB: ignores errors from replace_estimator
1346                    because change can't be undone. */
1347                 if (sch->flags & TCQ_F_MQROOT)
1348                         goto out;
1349                 gen_replace_estimator(&sch->bstats,
1350                                       sch->cpu_bstats,
1351                                       &sch->rate_est,
1352                                       NULL,
1353                                       true,
1354                                       tca[TCA_RATE]);
1355         }
1356 out:
1357         return 0;
1358 }
1359
/* State carried through the class walk done by check_loop().  The walker
 * must stay the first member: check_loop_fn() casts the walker pointer it
 * receives back to this structure.
 */
struct check_loop_arg {
        struct qdisc_walker     w;      /* walker handed to ->walk() */
        struct Qdisc            *p;     /* qdisc that must not show up as a leaf */
        int                     depth;  /* current nesting level of the search */
};
1365
1366 static int check_loop_fn(struct Qdisc *q, unsigned long cl,
1367                          struct qdisc_walker *w);
1368
1369 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1370 {
1371         struct check_loop_arg   arg;
1372
1373         if (q->ops->cl_ops == NULL)
1374                 return 0;
1375
1376         arg.w.stop = arg.w.skip = arg.w.count = 0;
1377         arg.w.fn = check_loop_fn;
1378         arg.depth = depth;
1379         arg.p = p;
1380         q->ops->cl_ops->walk(q, &arg.w);
1381         return arg.w.stop ? -ELOOP : 0;
1382 }
1383
1384 static int
1385 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1386 {
1387         struct Qdisc *leaf;
1388         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1389         struct check_loop_arg *arg = (struct check_loop_arg *)w;
1390
1391         leaf = cops->leaf(q, cl);
1392         if (leaf) {
1393                 if (leaf == arg->p || arg->depth > 7)
1394                         return -ELOOP;
1395                 return check_loop(leaf, arg->p, arg->depth + 1);
1396         }
1397         return 0;
1398 }
1399
/* Netlink attribute policy for the TCA_* attributes listed below; it is
 * passed to nlmsg_parse_deprecated() by the qdisc request handlers in this
 * file.  Attributes without an entry get the zero-initialized default.
 */
const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
        [TCA_KIND]              = { .type = NLA_STRING },
        [TCA_RATE]              = { .type = NLA_BINARY,
                                    .len = sizeof(struct tc_estimator) },
        [TCA_STAB]              = { .type = NLA_NESTED },
        [TCA_DUMP_INVISIBLE]    = { .type = NLA_FLAG },
        [TCA_CHAIN]             = { .type = NLA_U32 },
        [TCA_INGRESS_BLOCK]     = { .type = NLA_U32 },
        [TCA_EGRESS_BLOCK]      = { .type = NLA_U32 },
};
1410
/*
 * Delete/get qdisc.
 *
 * The qdisc is located either through its parent (tcm_parent) or, when no
 * parent is given, through its handle (tcm_handle).  For RTM_DELQDISC the
 * qdisc is removed by grafting NULL in its place; for any other message
 * type a notification describing the qdisc is sent.
 *
 * Any message type other than RTM_GETQDISC requires CAP_NET_ADMIN.
 *
 * Returns 0 on success or a negative errno.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
                        struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct tcmsg *tcm = nlmsg_data(n);
        struct nlattr *tca[TCA_MAX + 1];
        struct net_device *dev;
        u32 clid;
        struct Qdisc *q = NULL;
        struct Qdisc *p = NULL;
        int err;

        if ((n->nlmsg_type != RTM_GETQDISC) &&
            !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
                return -EPERM;

        err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
                                     rtm_tca_policy, extack);
        if (err < 0)
                return err;

        dev = __dev_get_by_index(net, tcm->tcm_ifindex);
        if (!dev)
                return -ENODEV;

        clid = tcm->tcm_parent;
        if (clid) {
                /* Lookup by parent: a class of another qdisc, the ingress
                 * queue, or the device root.
                 */
                if (clid != TC_H_ROOT) {
                        if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
                                p = qdisc_lookup(dev, TC_H_MAJ(clid));
                                if (!p) {
                                        NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
                                        return -ENOENT;
                                }
                                q = qdisc_leaf(p, clid);
                        } else if (dev_ingress_queue(dev)) {
                                q = dev_ingress_queue(dev)->qdisc_sleeping;
                        }
                } else {
                        q = rtnl_dereference(dev->qdisc);
                }
                if (!q) {
                        NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
                        return -ENOENT;
                }

                /* A handle, when supplied, must match the qdisc found. */
                if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
                        NL_SET_ERR_MSG(extack, "Invalid handle");
                        return -EINVAL;
                }
        } else {
                /* No parent given: look the qdisc up by handle. */
                q = qdisc_lookup(dev, tcm->tcm_handle);
                if (!q) {
                        NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
                        return -ENOENT;
                }
        }

        /* A kind, when supplied, must match the qdisc found. */
        if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
                NL_SET_ERR_MSG(extack, "Invalid qdisc name");
                return -EINVAL;
        }

        if (n->nlmsg_type == RTM_DELQDISC) {
                if (!clid) {
                        NL_SET_ERR_MSG(extack, "Classid cannot be zero");
                        return -EINVAL;
                }
                if (q->handle == 0) {
                        NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
                        return -ENOENT;
                }
                /* Deleting means grafting "no qdisc" where q is attached. */
                err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
                if (err != 0)
                        return err;
        } else {
                /* The result of the notification is not checked here. */
                qdisc_notify(net, skb, n, clid, NULL, q);
        }
        return 0;
}
1495
/*
 * Create/change qdisc.
 *
 * Depending on the request and on what is currently attached, this either
 * changes the parameters of an existing qdisc (qdisc_change()), grafts an
 * already existing qdisc at a new place, or creates a new qdisc and grafts
 * it.  The caller needs CAP_NET_ADMIN.
 *
 * If qdisc_create() reports -EAGAIN, the whole request is parsed and
 * processed again from the "replay" label.
 *
 * Returns 0 on success or a negative errno.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
                           struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct tcmsg *tcm;
        struct nlattr *tca[TCA_MAX + 1];
        struct net_device *dev;
        u32 clid;
        struct Qdisc *q, *p;
        int err;

        if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
                return -EPERM;

replay:
        /* Reinit, just in case something touches this. */
        err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
                                     rtm_tca_policy, extack);
        if (err < 0)
                return err;

        tcm = nlmsg_data(n);
        clid = tcm->tcm_parent;
        q = p = NULL;

        dev = __dev_get_by_index(net, tcm->tcm_ifindex);
        if (!dev)
                return -ENODEV;


        if (clid) {
                /* Find what is currently attached at the requested parent. */
                if (clid != TC_H_ROOT) {
                        if (clid != TC_H_INGRESS) {
                                p = qdisc_lookup(dev, TC_H_MAJ(clid));
                                if (!p) {
                                        NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
                                        return -ENOENT;
                                }
                                q = qdisc_leaf(p, clid);
                        } else if (dev_ingress_queue_create(dev)) {
                                q = dev_ingress_queue(dev)->qdisc_sleeping;
                        }
                } else {
                        q = rtnl_dereference(dev->qdisc);
                }

                /* It may be default qdisc, ignore it */
                if (q && q->handle == 0)
                        q = NULL;

                if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
                        if (tcm->tcm_handle) {
                                /* A handle was given and it is not the one
                                 * attached here: graft the qdisc owning that
                                 * handle, or create it if it does not exist.
                                 */
                                if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
                                        NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
                                        return -EEXIST;
                                }
                                if (TC_H_MIN(tcm->tcm_handle)) {
                                        NL_SET_ERR_MSG(extack, "Invalid minor handle");
                                        return -EINVAL;
                                }
                                q = qdisc_lookup(dev, tcm->tcm_handle);
                                if (!q)
                                        goto create_n_graft;
                                if (n->nlmsg_flags & NLM_F_EXCL) {
                                        NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
                                        return -EEXIST;
                                }
                                if (tca[TCA_KIND] &&
                                    nla_strcmp(tca[TCA_KIND], q->ops->id)) {
                                        NL_SET_ERR_MSG(extack, "Invalid qdisc name");
                                        return -EINVAL;
                                }
                                if (q == p ||
                                    (p && check_loop(q, p, 0))) {
                                        NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
                                        return -ELOOP;
                                }
                                /* Reference for the new attachment point. */
                                qdisc_refcount_inc(q);
                                goto graft;
                        } else {
                                if (!q)
                                        goto create_n_graft;

                                /* This magic test requires explanation.
                                 *
                                 *   We know that some child q is already
                                 *   attached to this parent and have a choice:
                                 *   either to change it or to create/graft a new one.
                                 *
                                 *   1. We are allowed to create/graft only
                                 *   if CREATE and REPLACE flags are set.
                                 *
                                 *   2. If EXCL is set, the requestor wanted to say
                                 *   that qdisc tcm_handle is not expected
                                 *   to exist, so we choose create/graft too.
                                 *
                                 *   3. The last case is when no flags are set.
                                 *   Alas, it is sort of a hole in the API, we
                                 *   cannot decide what to do unambiguously.
                                 *   For now we select create/graft if the
                                 *   user gave a KIND which does not match the existing one.
                                 */
                                if ((n->nlmsg_flags & NLM_F_CREATE) &&
                                    (n->nlmsg_flags & NLM_F_REPLACE) &&
                                    ((n->nlmsg_flags & NLM_F_EXCL) ||
                                     (tca[TCA_KIND] &&
                                      nla_strcmp(tca[TCA_KIND], q->ops->id))))
                                        goto create_n_graft;
                        }
                }
        } else {
                /* No parent given: the qdisc is identified by handle only. */
                if (!tcm->tcm_handle) {
                        NL_SET_ERR_MSG(extack, "Handle cannot be zero");
                        return -EINVAL;
                }
                q = qdisc_lookup(dev, tcm->tcm_handle);
        }

        /* Change qdisc parameters */
        if (!q) {
                NL_SET_ERR_MSG(extack, "Specified qdisc not found");
                return -ENOENT;
        }
        if (n->nlmsg_flags & NLM_F_EXCL) {
                NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
                return -EEXIST;
        }
        if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
                NL_SET_ERR_MSG(extack, "Invalid qdisc name");
                return -EINVAL;
        }
        err = qdisc_change(q, tca, extack);
        if (err == 0)
                qdisc_notify(net, skb, n, clid, NULL, q);
        return err;

create_n_graft:
        if (!(n->nlmsg_flags & NLM_F_CREATE)) {
                NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
                return -ENOENT;
        }
        if (clid == TC_H_INGRESS) {
                /* For ingress, tcm_parent is passed as both parent and handle. */
                if (dev_ingress_queue(dev)) {
                        q = qdisc_create(dev, dev_ingress_queue(dev), p,
                                         tcm->tcm_parent, tcm->tcm_parent,
                                         tca, &err, extack);
                } else {
                        NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
                        err = -ENOENT;
                }
        } else {
                struct netdev_queue *dev_queue;

                /* Pick the tx queue: ask the parent's class ops if it can
                 * choose, else use the parent's own queue, else queue 0.
                 */
                if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
                        dev_queue = p->ops->cl_ops->select_queue(p, tcm);
                else if (p)
                        dev_queue = p->dev_queue;
                else
                        dev_queue = netdev_get_tx_queue(dev, 0);

                q = qdisc_create(dev, dev_queue, p,
                                 tcm->tcm_parent, tcm->tcm_handle,
                                 tca, &err, extack);
        }
        if (q == NULL) {
                /* -EAGAIN from qdisc_create(): start over from parsing. */
                if (err == -EAGAIN)
                        goto replay;
                return err;
        }

graft:
        err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
        if (err) {
                /* Grafting failed: give back the reference held on q. */
                if (q)
                        qdisc_put(q);
                return err;
        }

        return 0;
}
1680
1681 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1682                               struct netlink_callback *cb,
1683                               int *q_idx_p, int s_q_idx, bool recur,
1684                               bool dump_invisible)
1685 {
1686         int ret = 0, q_idx = *q_idx_p;
1687         struct Qdisc *q;
1688         int b;
1689
1690         if (!root)
1691                 return 0;
1692
1693         q = root;
1694         if (q_idx < s_q_idx) {
1695                 q_idx++;
1696         } else {
1697                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1698                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1699                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1700                                   RTM_NEWQDISC) <= 0)
1701                         goto done;
1702                 q_idx++;
1703         }
1704
1705         /* If dumping singletons, there is no qdisc_dev(root) and the singleton
1706          * itself has already been dumped.
1707          *
1708          * If we've already dumped the top-level (ingress) qdisc above and the global
1709          * qdisc hashtable, we don't want to hit it again
1710          */
1711         if (!qdisc_dev(root) || !recur)
1712                 goto out;
1713
1714         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1715                 if (q_idx < s_q_idx) {
1716                         q_idx++;
1717                         continue;
1718                 }
1719                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1720                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1721                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1722                                   RTM_NEWQDISC) <= 0)
1723                         goto done;
1724                 q_idx++;
1725         }
1726
1727 out:
1728         *q_idx_p = q_idx;
1729         return ret;
1730 done:
1731         ret = -1;
1732         goto out;
1733 }
1734
1735 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1736 {
1737         struct net *net = sock_net(skb->sk);
1738         int idx, q_idx;
1739         int s_idx, s_q_idx;
1740         struct net_device *dev;
1741         const struct nlmsghdr *nlh = cb->nlh;
1742         struct nlattr *tca[TCA_MAX + 1];
1743         int err;
1744
1745         s_idx = cb->args[0];
1746         s_q_idx = q_idx = cb->args[1];
1747
1748         idx = 0;
1749         ASSERT_RTNL();
1750
1751         err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
1752                                      rtm_tca_policy, cb->extack);
1753         if (err < 0)
1754                 return err;
1755
1756         for_each_netdev(net, dev) {
1757                 struct netdev_queue *dev_queue;
1758
1759                 if (idx < s_idx)
1760                         goto cont;
1761                 if (idx > s_idx)
1762                         s_q_idx = 0;
1763                 q_idx = 0;
1764
1765                 if (tc_dump_qdisc_root(rtnl_dereference(dev->qdisc),
1766                                        skb, cb, &q_idx, s_q_idx,
1767                                        true, tca[TCA_DUMP_INVISIBLE]) < 0)
1768                         goto done;
1769
1770                 dev_queue = dev_ingress_queue(dev);
1771                 if (dev_queue &&
1772                     tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1773                                        &q_idx, s_q_idx, false,
1774                                        tca[TCA_DUMP_INVISIBLE]) < 0)
1775                         goto done;
1776
1777 cont:
1778                 idx++;
1779         }
1780
1781 done:
1782         cb->args[0] = idx;
1783         cb->args[1] = q_idx;
1784
1785         return skb->len;
1786 }
1787
1788
1789
1790 /************************************************
1791  *      Traffic classes manipulation.           *
1792  ************************************************/
1793
1794 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1795                           unsigned long cl,
1796                           u32 portid, u32 seq, u16 flags, int event)
1797 {
1798         struct tcmsg *tcm;
1799         struct nlmsghdr  *nlh;
1800         unsigned char *b = skb_tail_pointer(skb);
1801         struct gnet_dump d;
1802         const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1803
1804         cond_resched();
1805         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1806         if (!nlh)
1807                 goto out_nlmsg_trim;
1808         tcm = nlmsg_data(nlh);
1809         tcm->tcm_family = AF_UNSPEC;
1810         tcm->tcm__pad1 = 0;
1811         tcm->tcm__pad2 = 0;
1812         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1813         tcm->tcm_parent = q->handle;
1814         tcm->tcm_handle = q->handle;
1815         tcm->tcm_info = 0;
1816         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1817                 goto nla_put_failure;
1818         if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1819                 goto nla_put_failure;
1820
1821         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1822                                          NULL, &d, TCA_PAD) < 0)
1823                 goto nla_put_failure;
1824
1825         if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1826                 goto nla_put_failure;
1827
1828         if (gnet_stats_finish_copy(&d) < 0)
1829                 goto nla_put_failure;
1830
1831         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1832         return skb->len;
1833
1834 out_nlmsg_trim:
1835 nla_put_failure:
1836         nlmsg_trim(skb, b);
1837         return -1;
1838 }
1839
1840 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1841                          struct nlmsghdr *n, struct Qdisc *q,
1842                          unsigned long cl, int event)
1843 {
1844         struct sk_buff *skb;
1845         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1846
1847         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1848         if (!skb)
1849                 return -ENOBUFS;
1850
1851         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1852                 kfree_skb(skb);
1853                 return -EINVAL;
1854         }
1855
1856         return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1857                               n->nlmsg_flags & NLM_F_ECHO);
1858 }
1859
1860 static int tclass_del_notify(struct net *net,
1861                              const struct Qdisc_class_ops *cops,
1862                              struct sk_buff *oskb, struct nlmsghdr *n,
1863                              struct Qdisc *q, unsigned long cl,
1864                              struct netlink_ext_ack *extack)
1865 {
1866         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1867         struct sk_buff *skb;
1868         int err = 0;
1869
1870         if (!cops->delete)
1871                 return -EOPNOTSUPP;
1872
1873         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1874         if (!skb)
1875                 return -ENOBUFS;
1876
1877         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
1878                            RTM_DELTCLASS) < 0) {
1879                 kfree_skb(skb);
1880                 return -EINVAL;
1881         }
1882
1883         err = cops->delete(q, cl, extack);
1884         if (err) {
1885                 kfree_skb(skb);
1886                 return err;
1887         }
1888
1889         err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1890                              n->nlmsg_flags & NLM_F_ECHO);
1891         return err;
1892 }
1893
1894 #ifdef CONFIG_NET_CLS
1895
1896 struct tcf_bind_args {
1897         struct tcf_walker w;
1898         unsigned long base;
1899         unsigned long cl;
1900         u32 classid;
1901 };
1902
1903 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
1904 {
1905         struct tcf_bind_args *a = (void *)arg;
1906
1907         if (tp->ops->bind_class) {
1908                 struct Qdisc *q = tcf_block_q(tp->chain->block);
1909
1910                 sch_tree_lock(q);
1911                 tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
1912                 sch_tree_unlock(q);
1913         }
1914         return 0;
1915 }
1916
1917 struct tc_bind_class_args {
1918         struct qdisc_walker w;
1919         unsigned long new_cl;
1920         u32 portid;
1921         u32 clid;
1922 };
1923
1924 static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl,
1925                                 struct qdisc_walker *w)
1926 {
1927         struct tc_bind_class_args *a = (struct tc_bind_class_args *)w;
1928         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1929         struct tcf_block *block;
1930         struct tcf_chain *chain;
1931
1932         block = cops->tcf_block(q, cl, NULL);
1933         if (!block)
1934                 return 0;
1935         for (chain = tcf_get_next_chain(block, NULL);
1936              chain;
1937              chain = tcf_get_next_chain(block, chain)) {
1938                 struct tcf_proto *tp;
1939
1940                 for (tp = tcf_get_next_proto(chain, NULL);
1941                      tp; tp = tcf_get_next_proto(chain, tp)) {
1942                         struct tcf_bind_args arg = {};
1943
1944                         arg.w.fn = tcf_node_bind;
1945                         arg.classid = a->clid;
1946                         arg.base = cl;
1947                         arg.cl = a->new_cl;
1948                         tp->ops->walk(tp, &arg.w, true);
1949                 }
1950         }
1951
1952         return 0;
1953 }
1954
1955 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1956                            unsigned long new_cl)
1957 {
1958         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1959         struct tc_bind_class_args args = {};
1960
1961         if (!cops->tcf_block)
1962                 return;
1963         args.portid = portid;
1964         args.clid = clid;
1965         args.new_cl = new_cl;
1966         args.w.fn = tc_bind_class_walker;
1967         q->ops->cl_ops->walk(q, &args.w);
1968 }
1969
1970 #else
1971
1972 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1973                            unsigned long new_cl)
1974 {
1975 }
1976
1977 #endif
1978
1979 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
1980                          struct netlink_ext_ack *extack)
1981 {
1982         struct net *net = sock_net(skb->sk);
1983         struct tcmsg *tcm = nlmsg_data(n);
1984         struct nlattr *tca[TCA_MAX + 1];
1985         struct net_device *dev;
1986         struct Qdisc *q = NULL;
1987         const struct Qdisc_class_ops *cops;
1988         unsigned long cl = 0;
1989         unsigned long new_cl;
1990         u32 portid;
1991         u32 clid;
1992         u32 qid;
1993         int err;
1994
1995         if ((n->nlmsg_type != RTM_GETTCLASS) &&
1996             !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1997                 return -EPERM;
1998
1999         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
2000                                      rtm_tca_policy, extack);
2001         if (err < 0)
2002                 return err;
2003
2004         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
2005         if (!dev)
2006                 return -ENODEV;
2007
2008         /*
2009            parent == TC_H_UNSPEC - unspecified parent.
2010            parent == TC_H_ROOT   - class is root, which has no parent.
2011            parent == X:0         - parent is root class.
2012            parent == X:Y         - parent is a node in hierarchy.
2013            parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
2014
2015            handle == 0:0         - generate handle from kernel pool.
2016            handle == 0:Y         - class is X:Y, where X:0 is qdisc.
2017            handle == X:Y         - clear.
2018            handle == X:0         - root class.
2019          */
2020
2021         /* Step 1. Determine qdisc handle X:0 */
2022
2023         portid = tcm->tcm_parent;
2024         clid = tcm->tcm_handle;
2025         qid = TC_H_MAJ(clid);
2026
2027         if (portid != TC_H_ROOT) {
2028                 u32 qid1 = TC_H_MAJ(portid);
2029
2030                 if (qid && qid1) {
2031                         /* If both majors are known, they must be identical. */
2032                         if (qid != qid1)
2033                                 return -EINVAL;
2034                 } else if (qid1) {
2035                         qid = qid1;
2036                 } else if (qid == 0)
2037                         qid = rtnl_dereference(dev->qdisc)->handle;
2038
2039                 /* Now qid is genuine qdisc handle consistent
2040                  * both with parent and child.
2041                  *
2042                  * TC_H_MAJ(portid) still may be unspecified, complete it now.
2043                  */
2044                 if (portid)
2045                         portid = TC_H_MAKE(qid, portid);
2046         } else {
2047                 if (qid == 0)
2048                         qid = rtnl_dereference(dev->qdisc)->handle;
2049         }
2050
2051         /* OK. Locate qdisc */
2052         q = qdisc_lookup(dev, qid);
2053         if (!q)
2054                 return -ENOENT;
2055
2056         /* An check that it supports classes */
2057         cops = q->ops->cl_ops;
2058         if (cops == NULL)
2059                 return -EINVAL;
2060
2061         /* Now try to get class */
2062         if (clid == 0) {
2063                 if (portid == TC_H_ROOT)
2064                         clid = qid;
2065         } else
2066                 clid = TC_H_MAKE(qid, clid);
2067
2068         if (clid)
2069                 cl = cops->find(q, clid);
2070
2071         if (cl == 0) {
2072                 err = -ENOENT;
2073                 if (n->nlmsg_type != RTM_NEWTCLASS ||
2074                     !(n->nlmsg_flags & NLM_F_CREATE))
2075                         goto out;
2076         } else {
2077                 switch (n->nlmsg_type) {
2078                 case RTM_NEWTCLASS:
2079                         err = -EEXIST;
2080                         if (n->nlmsg_flags & NLM_F_EXCL)
2081                                 goto out;
2082                         break;
2083                 case RTM_DELTCLASS:
2084                         err = tclass_del_notify(net, cops, skb, n, q, cl, extack);
2085                         /* Unbind the class with flilters with 0 */
2086                         tc_bind_tclass(q, portid, clid, 0);
2087                         goto out;
2088                 case RTM_GETTCLASS:
2089                         err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
2090                         goto out;
2091                 default:
2092                         err = -EINVAL;
2093                         goto out;
2094                 }
2095         }
2096
2097         if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
2098                 NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
2099                 return -EOPNOTSUPP;
2100         }
2101
2102         new_cl = cl;
2103         err = -EOPNOTSUPP;
2104         if (cops->change)
2105                 err = cops->change(q, clid, portid, tca, &new_cl, extack);
2106         if (err == 0) {
2107                 tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
2108                 /* We just create a new class, need to do reverse binding. */
2109                 if (cl != new_cl)
2110                         tc_bind_tclass(q, portid, clid, new_cl);
2111         }
2112 out:
2113         return err;
2114 }
2115
2116 struct qdisc_dump_args {
2117         struct qdisc_walker     w;
2118         struct sk_buff          *skb;
2119         struct netlink_callback *cb;
2120 };
2121
2122 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
2123                             struct qdisc_walker *arg)
2124 {
2125         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
2126
2127         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2128                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
2129                               RTM_NEWTCLASS);
2130 }
2131
2132 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
2133                                 struct tcmsg *tcm, struct netlink_callback *cb,
2134                                 int *t_p, int s_t)
2135 {
2136         struct qdisc_dump_args arg;
2137
2138         if (tc_qdisc_dump_ignore(q, false) ||
2139             *t_p < s_t || !q->ops->cl_ops ||
2140             (tcm->tcm_parent &&
2141              TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
2142                 (*t_p)++;
2143                 return 0;
2144         }
2145         if (*t_p > s_t)
2146                 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
2147         arg.w.fn = qdisc_class_dump;
2148         arg.skb = skb;
2149         arg.cb = cb;
2150         arg.w.stop  = 0;
2151         arg.w.skip = cb->args[1];
2152         arg.w.count = 0;
2153         q->ops->cl_ops->walk(q, &arg.w);
2154         cb->args[1] = arg.w.count;
2155         if (arg.w.stop)
2156                 return -1;
2157         (*t_p)++;
2158         return 0;
2159 }
2160
2161 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
2162                                struct tcmsg *tcm, struct netlink_callback *cb,
2163                                int *t_p, int s_t, bool recur)
2164 {
2165         struct Qdisc *q;
2166         int b;
2167
2168         if (!root)
2169                 return 0;
2170
2171         if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
2172                 return -1;
2173
2174         if (!qdisc_dev(root) || !recur)
2175                 return 0;
2176
2177         if (tcm->tcm_parent) {
2178                 q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
2179                 if (q && q != root &&
2180                     tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2181                         return -1;
2182                 return 0;
2183         }
2184         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
2185                 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2186                         return -1;
2187         }
2188
2189         return 0;
2190 }
2191
2192 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
2193 {
2194         struct tcmsg *tcm = nlmsg_data(cb->nlh);
2195         struct net *net = sock_net(skb->sk);
2196         struct netdev_queue *dev_queue;
2197         struct net_device *dev;
2198         int t, s_t;
2199
2200         if (nlmsg_len(cb->nlh) < sizeof(*tcm))
2201                 return 0;
2202         dev = dev_get_by_index(net, tcm->tcm_ifindex);
2203         if (!dev)
2204                 return 0;
2205
2206         s_t = cb->args[0];
2207         t = 0;
2208
2209         if (tc_dump_tclass_root(rtnl_dereference(dev->qdisc),
2210                                 skb, tcm, cb, &t, s_t, true) < 0)
2211                 goto done;
2212
2213         dev_queue = dev_ingress_queue(dev);
2214         if (dev_queue &&
2215             tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
2216                                 &t, s_t, false) < 0)
2217                 goto done;
2218
2219 done:
2220         cb->args[0] = t;
2221
2222         dev_put(dev);
2223         return skb->len;
2224 }
2225
2226 #ifdef CONFIG_PROC_FS
2227 static int psched_show(struct seq_file *seq, void *v)
2228 {
2229         seq_printf(seq, "%08x %08x %08x %08x\n",
2230                    (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2231                    1000000,
2232                    (u32)NSEC_PER_SEC / hrtimer_resolution);
2233
2234         return 0;
2235 }
2236
2237 static int __net_init psched_net_init(struct net *net)
2238 {
2239         struct proc_dir_entry *e;
2240
2241         e = proc_create_single("psched", 0, net->proc_net, psched_show);
2242         if (e == NULL)
2243                 return -ENOMEM;
2244
2245         return 0;
2246 }
2247
2248 static void __net_exit psched_net_exit(struct net *net)
2249 {
2250         remove_proc_entry("psched", net->proc_net);
2251 }
2252 #else
2253 static int __net_init psched_net_init(struct net *net)
2254 {
2255         return 0;
2256 }
2257
2258 static void __net_exit psched_net_exit(struct net *net)
2259 {
2260 }
2261 #endif
2262
2263 static struct pernet_operations psched_net_ops = {
2264         .init = psched_net_init,
2265         .exit = psched_net_exit,
2266 };
2267
2268 static int __init pktsched_init(void)
2269 {
2270         int err;
2271
2272         err = register_pernet_subsys(&psched_net_ops);
2273         if (err) {
2274                 pr_err("pktsched_init: "
2275                        "cannot initialize per netns operations\n");
2276                 return err;
2277         }
2278
2279         register_qdisc(&pfifo_fast_ops);
2280         register_qdisc(&pfifo_qdisc_ops);
2281         register_qdisc(&bfifo_qdisc_ops);
2282         register_qdisc(&pfifo_head_drop_qdisc_ops);
2283         register_qdisc(&mq_qdisc_ops);
2284         register_qdisc(&noqueue_qdisc_ops);
2285
2286         rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
2287         rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
2288         rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
2289                       0);
2290         rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
2291         rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
2292         rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
2293                       0);
2294
2295         return 0;
2296 }
2297
2298 subsys_initcall(pktsched_init);