// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/sch_cbq.c  Class-Based Queueing discipline.
 *
 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>


/*      Class-Based Queueing (CBQ) algorithm.
        =======================================

        Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
                 Management Models for Packet Networks",
                 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995

                 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995

                 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
                 Parameters", 1996

                 [4] Sally Floyd and Michael Speer, "Experimental Results
                 for Class-Based Queueing", 1998, not published.

        -----------------------------------------------------------------------

        The algorithm skeleton was taken from the NS simulator cbq.cc.
        If someone wants to check this code against the LBL version,
        they should take into account that ONLY the skeleton was borrowed;
        the implementation is different. Particularly:

        --- The WRR algorithm is different. Our version looks more
        reasonable (I hope) and works when quanta are allowed to be
        less than the MTU, which is always the case when real-time classes
        have small rates. Note that the statement in [3] is
        incomplete: delay may actually be estimated even if the class
        per-round allotment is less than the MTU. Namely, if the per-round
        allotment is W*r_i, and r_1+...+r_k = r < 1

        delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B

        In the worst case we have an IntServ estimate with D = W*r + k*MTU
        and C = MTU*r. The proof (if correct at all) is trivial.


        --- It seems that cbq-2.0 is not very accurate. At least, I cannot
        interpret some places, which look like wrong translations
        from NS. Anyone is advised to find these differences
        and explain to me why I am wrong 8).

        --- Linux has no EOI event, so we cannot estimate true class
        idle time. The workaround is to treat the next dequeue event
        as a sign that the previous packet has finished. This is wrong
        because of internal device queueing, but on a permanently loaded
        link it is true. Moreover, combined with the clock integrator,
        this scheme looks very close to an ideal solution.  */
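
/*      A worked instance of the delay bound above, with illustrative numbers
        that are not part of the original text (and reading [x] as the
        integer ceiling): take k = 2 classes with r_1 = 0.5 and r_2 = 0.25
        (so r = 0.75 < 1), MTU = 1500 bytes and W = 2000, so the per-round
        allotment of class 1 is W*r_1 = 1000 bytes, i.e. below the MTU. Then

        delay_1 <= ([1500/1000]*2000*0.75 + 2000*0.75 + 2*1500)/B
                 = (2*1500 + 1500 + 3000)/B = 7500/B

        so the bound stays finite even though the allotment is below the MTU,
        which is exactly the point made against [3].  */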

struct cbq_sched_data;


struct cbq_class {
        struct Qdisc_class_common common;
        struct cbq_class        *next_alive;    /* next class with backlog in this priority band */

/* Parameters */
        unsigned char           priority;       /* class priority */
        unsigned char           priority2;      /* priority to be used after overlimit */
        unsigned char           ewma_log;       /* time constant for idle time calculation */

        u32                     defmap;

        /* Link-sharing scheduler parameters */
        long                    maxidle;        /* Class parameters: see below. */
        long                    offtime;
        long                    minidle;
        u32                     avpkt;
        struct qdisc_rate_table *R_tab;

        /* General scheduler (WRR) parameters */
        long                    allot;
        long                    quantum;        /* Allotment per WRR round */
        long                    weight;         /* Relative allotment: see below */

        struct Qdisc            *qdisc;         /* Ptr to CBQ discipline */
        struct cbq_class        *split;         /* Ptr to split node */
        struct cbq_class        *share;         /* Ptr to LS parent in the class tree */
        struct cbq_class        *tparent;       /* Ptr to tree parent in the class tree */
        struct cbq_class        *borrow;        /* NULL if class is bandwidth limited;
                                                   parent otherwise */
        struct cbq_class        *sibling;       /* Sibling chain */
        struct cbq_class        *children;      /* Pointer to children chain */

        struct Qdisc            *q;             /* Elementary queueing discipline */


/* Variables */
        unsigned char           cpriority;      /* Effective priority */
        unsigned char           delayed;
        unsigned char           level;          /* level of the class in hierarchy:
                                                   0 for leaf classes, and maximal
                                                   level of children + 1 for nodes.
                                                 */

        psched_time_t           last;           /* Last end of service */
        psched_time_t           undertime;
        long                    avgidle;
        long                    deficit;        /* Saved deficit for WRR */
        psched_time_t           penalized;
        struct gnet_stats_basic_packed bstats;
        struct gnet_stats_queue qstats;
        struct net_rate_estimator __rcu *rate_est;
        struct tc_cbq_xstats    xstats;

        struct tcf_proto __rcu  *filter_list;
        struct tcf_block        *block;

        int                     filters;

        struct cbq_class        *defaults[TC_PRIO_MAX + 1];
};

struct cbq_sched_data {
        struct Qdisc_class_hash clhash;                 /* Hash table of all classes */
        int                     nclasses[TC_CBQ_MAXPRIO + 1];
        unsigned int            quanta[TC_CBQ_MAXPRIO + 1];

        struct cbq_class        link;

        unsigned int            activemask;
        struct cbq_class        *active[TC_CBQ_MAXPRIO + 1];    /* List of all classes
                                                                   with backlog */

#ifdef CONFIG_NET_CLS_ACT
        struct cbq_class        *rx_class;
#endif
        struct cbq_class        *tx_class;
        struct cbq_class        *tx_borrowed;
        int                     tx_len;
        psched_time_t           now;            /* Cached timestamp */
        unsigned int            pmask;

        struct hrtimer          delay_timer;
        struct qdisc_watchdog   watchdog;       /* Watchdog timer,
                                                   started when CBQ has
                                                   backlog, but cannot
                                                   transmit just now */
        psched_tdiff_t          wd_expires;
        int                     toplevel;
        u32                     hgenerator;
};


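/* L2T ("length to time"): convert a packet length into its transmission
 * time at this class's configured rate, in psched ticks, via the rate
 * table supplied from userspace.
 */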
#define L2T(cl, len)    qdisc_l2t((cl)->R_tab, len)

static inline struct cbq_class *
cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
{
        struct Qdisc_class_common *clc;

        clc = qdisc_class_find(&q->clhash, classid);
        if (clc == NULL)
                return NULL;
        return container_of(clc, struct cbq_class, common);
}

#ifdef CONFIG_NET_CLS_ACT

static struct cbq_class *
cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
{
        struct cbq_class *cl;

        for (cl = this->tparent; cl; cl = cl->tparent) {
                struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];

                if (new != NULL && new != this)
                        return new;
        }
        return NULL;
}

#endif

/* Classify packet. The procedure is pretty complicated, but
 * it allows us to combine link sharing and priority scheduling
 * transparently.
 *
 * Namely, you can put link-sharing rules (e.g. route based) at the root
 * of CBQ, so that it resolves to split nodes. Then packets are classified
 * by logical priority, or a more specific classifier may be attached
 * to the split node.
 */
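
/* An illustrative userspace setup (a sketch, not from the original source;
 * the interface name and numbers are invented) that exercises the
 * split/defmap path below:
 *
 *   tc qdisc add dev eth0 root handle 1: cbq bandwidth 100Mbit \
 *           avpkt 1000 cell 8
 *   tc class add dev eth0 parent 1: classid 1:1 cbq bandwidth 100Mbit \
 *           rate 6Mbit allot 1514 prio 5 avpkt 1000 bounded
 *   tc class add dev eth0 parent 1:1 classid 1:10 cbq bandwidth 100Mbit \
 *           rate 1Mbit allot 1514 prio 1 avpkt 1000 \
 *           split 1:0 defmap c0/ff
 *
 * With "split 1:0 defmap c0/ff", packets reaching the split node whose
 * logical priority bit is set in the defmap default to class 1:10,
 * matching the defaults[] lookup in cbq_classify().
 */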

static struct cbq_class *
cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
{
        struct cbq_sched_data *q = qdisc_priv(sch);
        struct cbq_class *head = &q->link;
        struct cbq_class **defmap;
        struct cbq_class *cl = NULL;
        u32 prio = skb->priority;
        struct tcf_proto *fl;
        struct tcf_result res;

        /*
         *  Step 1. If skb->priority points to one of our classes, use it.
         */
        if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
            (cl = cbq_class_lookup(q, prio)) != NULL)
                return cl;

        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
        for (;;) {
                int result = 0;
                defmap = head->defaults;

                fl = rcu_dereference_bh(head->filter_list);
                /*
                 * Step 2+n. Apply classifier.
                 */
                result = tcf_classify(skb, NULL, fl, &res, true);
                if (!fl || result < 0)
                        goto fallback;
                if (result == TC_ACT_SHOT)
                        return NULL;

                cl = (void *)res.class;
                if (!cl) {
                        if (TC_H_MAJ(res.classid))
                                cl = cbq_class_lookup(q, res.classid);
                        else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
                                cl = defmap[TC_PRIO_BESTEFFORT];

                        if (cl == NULL)
                                goto fallback;
                }
                if (cl->level >= head->level)
                        goto fallback;
#ifdef CONFIG_NET_CLS_ACT
                switch (result) {
                case TC_ACT_QUEUED:
                case TC_ACT_STOLEN:
                case TC_ACT_TRAP:
                        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
                        fallthrough;
                case TC_ACT_RECLASSIFY:
                        return cbq_reclassify(skb, cl);
                }
#endif
                if (cl->level == 0)
                        return cl;

                /*
                 * Step 3+n. If the classifier selected a link-sharing class,
                 *         apply the agency-specific classifier.
                 *         Repeat this procedure until we hit a leaf node.
                 */
                head = cl;
        }

fallback:
        cl = head;

        /*
         * Step 4. No success...
         */
        if (TC_H_MAJ(prio) == 0 &&
            !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
            !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
                return head;

        return cl;
}

/*
 * A packet has just been enqueued on an empty class.
 * cbq_activate_class adds it to the tail of the active class list
 * of its priority band.
 */
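/*
 * The active list of a band is circular and singly linked via next_alive:
 * q->active[prio] points at the current tail, so tail->next_alive is the
 * head from which dequeueing rounds start.
 */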

static inline void cbq_activate_class(struct cbq_class *cl)
{
        struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
        int prio = cl->cpriority;
        struct cbq_class *cl_tail;

        cl_tail = q->active[prio];
        q->active[prio] = cl;

        if (cl_tail != NULL) {
                cl->next_alive = cl_tail->next_alive;
                cl_tail->next_alive = cl;
        } else {
                cl->next_alive = cl;
                q->activemask |= (1<<prio);
        }
}

/*
 * Unlink class from the active chain.
 * Note that this same procedure is done directly in cbq_dequeue*
 * during the round-robin procedure.
 */

static void cbq_deactivate_class(struct cbq_class *this)
{
        struct cbq_sched_data *q = qdisc_priv(this->qdisc);
        int prio = this->cpriority;
        struct cbq_class *cl;
        struct cbq_class *cl_prev = q->active[prio];

        do {
                cl = cl_prev->next_alive;
                if (cl == this) {
                        cl_prev->next_alive = cl->next_alive;
                        cl->next_alive = NULL;

                        if (cl == q->active[prio]) {
                                q->active[prio] = cl_prev;
                                if (cl == q->active[prio]) {
                                        q->active[prio] = NULL;
                                        q->activemask &= ~(1<<prio);
                                        return;
                                }
                        }
                        return;
                }
        } while ((cl_prev = cl) != q->active[prio]);
}

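/*
 * q->toplevel is the level of the highest class currently allowed to act
 * as a borrowing ancestor (the "toplevel" cutoff of [1]). On enqueue, it
 * is lowered to the level of the first underlimit ancestor of the target
 * class, so borrowing that the cutoff had inhibited can resume.
 */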
static void
cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
{
        int toplevel = q->toplevel;

        if (toplevel > cl->level) {
                psched_time_t now = psched_get_time();

                do {
                        if (cl->undertime < now) {
                                q->toplevel = cl->level;
                                return;
                        }
                } while ((cl = cl->borrow) != NULL && toplevel > cl->level);
        }
}

static int
cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
            struct sk_buff **to_free)
{
        struct cbq_sched_data *q = qdisc_priv(sch);
        int ret;
        struct cbq_class *cl = cbq_classify(skb, sch, &ret);

#ifdef CONFIG_NET_CLS_ACT
        q->rx_class = cl;
#endif
        if (cl == NULL) {
                if (ret & __NET_XMIT_BYPASS)
                        qdisc_qstats_drop(sch);
                __qdisc_drop(skb, to_free);
                return ret;
        }

        ret = qdisc_enqueue(skb, cl->q, to_free);
        if (ret == NET_XMIT_SUCCESS) {
                sch->q.qlen++;
                cbq_mark_toplevel(q, cl);
                if (!cl->next_alive)
                        cbq_activate_class(cl);
                return ret;
        }

        if (net_xmit_drop_count(ret)) {
                qdisc_qstats_drop(sch);
                cbq_mark_toplevel(q, cl);
                cl->qstats.drops++;
        }
        return ret;
}

/* Overlimit action: penalize the leaf class by adding offtime */
static void cbq_overlimit(struct cbq_class *cl)
{
        struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
        psched_tdiff_t delay = cl->undertime - q->now;

        if (!cl->delayed) {
                delay += cl->offtime;

                /*
                 * The class goes to sleep, so it will have no
                 * chance to accumulate avgidle. Let's forgive it 8)
                 *
                 * BTW cbq-2.0 has a bug in this
                 * place; apparently they forgot to shift it by cl->ewma_log.
                 */
                if (cl->avgidle < 0)
                        delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
                if (cl->avgidle < cl->minidle)
                        cl->avgidle = cl->minidle;
                if (delay <= 0)
                        delay = 1;
                cl->undertime = q->now + delay;

                cl->xstats.overactions++;
                cl->delayed = 1;
        }
        if (q->wd_expires == 0 || q->wd_expires > delay)
                q->wd_expires = delay;

        /* Dirty work! We must schedule wakeups based on the
         * real available rate, rather than the leaf rate,
         * which may be tiny (even zero).
         */
        if (q->toplevel == TC_CBQ_MAXLEVEL) {
                struct cbq_class *b;
                psched_tdiff_t base_delay = q->wd_expires;

                for (b = cl->borrow; b; b = b->borrow) {
                        delay = b->undertime - q->now;
                        if (delay < base_delay) {
                                if (delay <= 0)
                                        delay = 1;
                                base_delay = delay;
                        }
                }

                q->wd_expires = base_delay;
        }
}
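
/* Illustrative arithmetic for the forgiveness term above (numbers invented
 * for the example): with avgidle = -96 and ewma_log = 5 (W = 1/32), the
 * sleep delay is reduced by 96 - (96 >> 5) = 96 - 3 = 93 ticks, leaving
 * only a W-scaled share (3 ticks) of the accumulated idle debt in the
 * penalty.
 */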

static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
                                       psched_time_t now)
{
        struct cbq_class *cl;
        struct cbq_class *cl_prev = q->active[prio];
        psched_time_t sched = now;

        if (cl_prev == NULL)
                return 0;

        do {
                cl = cl_prev->next_alive;
                if (now - cl->penalized > 0) {
                        cl_prev->next_alive = cl->next_alive;
                        cl->next_alive = NULL;
                        cl->cpriority = cl->priority;
                        cl->delayed = 0;
                        cbq_activate_class(cl);

                        if (cl == q->active[prio]) {
                                q->active[prio] = cl_prev;
                                if (cl == q->active[prio]) {
                                        q->active[prio] = NULL;
                                        return 0;
                                }
                        }

                        cl = cl_prev->next_alive;
                } else if (sched - cl->penalized > 0)
                        sched = cl->penalized;
        } while ((cl_prev = cl) != q->active[prio]);

        return sched - now;
}

static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
{
        struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
                                                delay_timer);
        struct Qdisc *sch = q->watchdog.qdisc;
        psched_time_t now;
        psched_tdiff_t delay = 0;
        unsigned int pmask;

        now = psched_get_time();

        pmask = q->pmask;
        q->pmask = 0;

        while (pmask) {
                int prio = ffz(~pmask);
                psched_tdiff_t tmp;

                pmask &= ~(1<<prio);

                tmp = cbq_undelay_prio(q, prio, now);
                if (tmp > 0) {
                        q->pmask |= 1<<prio;
                        if (tmp < delay || delay == 0)
                                delay = tmp;
                }
        }

        if (delay) {
                ktime_t time;

                time = 0;
                time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
                hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED);
        }

        __netif_schedule(qdisc_root(sch));
        return HRTIMER_NORESTART;
}

/*
 * This is a mission-critical procedure.
 *
 * We "regenerate" the toplevel cutoff if the transmitting class
 * has backlog and is not regulated. This is not part of the
 * original CBQ description, but looks more reasonable.
 * Probably, it is wrong. This question needs further investigation.
 */

static inline void
cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
                    struct cbq_class *borrowed)
{
        if (cl && q->toplevel >= borrowed->level) {
                if (cl->q->q.qlen > 1) {
                        do {
                                if (borrowed->undertime == PSCHED_PASTPERFECT) {
                                        q->toplevel = borrowed->level;
                                        return;
                                }
                        } while ((borrowed = borrowed->borrow) != NULL);
                }
#if 0
        /* It is not necessary now. Uncommenting it
           will save CPU cycles, but decrease fairness.
         */
                q->toplevel = TC_CBQ_MAXLEVEL;
#endif
        }
}

static void
cbq_update(struct cbq_sched_data *q)
{
        struct cbq_class *this = q->tx_class;
        struct cbq_class *cl = this;
        int len = q->tx_len;
        psched_time_t now;

        q->tx_class = NULL;
        /* Time integrator. We calculate EOS time
         * by adding the expected packet transmission time.
         */
        now = q->now + L2T(&q->link, len);

        for ( ; cl; cl = cl->share) {
                long avgidle = cl->avgidle;
                long idle;

                cl->bstats.packets++;
                cl->bstats.bytes += len;

                /*
                 * (now - last) is the total time between packet right edges.
                 * (last_pktlen/rate) is the "virtual" busy time, so that
                 *
                 *      idle = (now - last) - last_pktlen/rate
                 */

                idle = now - cl->last;
                if ((unsigned long)idle > 128*1024*1024) {
                        avgidle = cl->maxidle;
                } else {
                        idle -= L2T(cl, len);

                /* true_avgidle := (1-W)*true_avgidle + W*idle,
                 * where W=2^{-ewma_log}. But cl->avgidle is scaled:
                 * cl->avgidle == true_avgidle/W,
                 * hence:
                 */
                        avgidle += idle - (avgidle>>cl->ewma_log);
                }

                if (avgidle <= 0) {
                        /* Overlimit or at-limit */

                        if (avgidle < cl->minidle)
                                avgidle = cl->minidle;

                        cl->avgidle = avgidle;

                        /* Calculate the expected time when this class
                         * will be allowed to send.
                         * That will occur when:
                         * (1-W)*true_avgidle + W*delay = 0, i.e.
                         * idle = (1/W - 1)*(-true_avgidle)
                         * or
                         * idle = (1 - W)*(-cl->avgidle);
                         */
                        idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);

                        /*
                         * That is not all.
                         * To maintain the rate allocated to the class,
                         * we add to undertime the virtual clock time
                         * necessary to complete the transmitted packet.
                         * (len/phys_bandwidth has already passed
                         * by the moment cbq_update is called.)
                         */

                        idle -= L2T(&q->link, len);
                        idle += L2T(cl, len);

                        cl->undertime = now + idle;
                } else {
                        /* Underlimit */

                        cl->undertime = PSCHED_PASTPERFECT;
                        if (avgidle > cl->maxidle)
                                cl->avgidle = cl->maxidle;
                        else
                                cl->avgidle = avgidle;
                }
                if ((s64)(now - cl->last) > 0)
                        cl->last = now;
        }

        cbq_update_toplevel(q, this, q->tx_borrowed);
}
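
/* A small numeric sketch of the scaled EWMA above (values invented for
 * illustration): with ewma_log = 5, i.e. W = 1/32, and cl->avgidle = 320
 * (true_avgidle = 320*W = 10), a sample idle = 64 gives
 *
 *      avgidle += 64 - (320 >> 5), i.e. 320 + 64 - 10 = 374
 *
 * which corresponds to true_avgidle = 374/32 = 11.6875, exactly
 * (1-W)*10 + W*64 = 9.6875 + 2.
 */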

static inline struct cbq_class *
cbq_under_limit(struct cbq_class *cl)
{
        struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
        struct cbq_class *this_cl = cl;

        if (cl->tparent == NULL)
                return cl;

        if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
                cl->delayed = 0;
                return cl;
        }

        do {
                /* This is a very suspicious place. Currently the overlimit
                 * action is generated for non-bounded classes
                 * only if the link is completely congested.
                 * Though this agrees with the ancestor-only paradigm,
                 * it looks very stupid. In particular,
                 * it means that this chunk of code will either
                 * never be called or will result in strong amplification
                 * of burstiness. Dangerous, silly, and yet
                 * no other solution exists.
                 */
                cl = cl->borrow;
                if (!cl) {
                        this_cl->qstats.overlimits++;
                        cbq_overlimit(this_cl);
                        return NULL;
                }
                if (cl->level > q->toplevel)
                        return NULL;
        } while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);

        cl->delayed = 0;
        return cl;
}

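/*
 * One deficit-round-robin pass over the active classes of a priority band:
 * a class may send while its deficit is positive; when the deficit is
 * exhausted it is recharged by cl->quantum and the turn passes on. Empty
 * or penalized classes are unlinked from the circular chain on the way.
 */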
static inline struct sk_buff *
cbq_dequeue_prio(struct Qdisc *sch, int prio)
{
        struct cbq_sched_data *q = qdisc_priv(sch);
        struct cbq_class *cl_tail, *cl_prev, *cl;
        struct sk_buff *skb;
        int deficit;

        cl_tail = cl_prev = q->active[prio];
        cl = cl_prev->next_alive;

        do {
                deficit = 0;

                /* Start round */
                do {
                        struct cbq_class *borrow = cl;

                        if (cl->q->q.qlen &&
                            (borrow = cbq_under_limit(cl)) == NULL)
                                goto skip_class;

                        if (cl->deficit <= 0) {
                                /* Class exhausted its allotment per
                                 * this round. Switch to the next one.
                                 */
                                deficit = 1;
                                cl->deficit += cl->quantum;
                                goto next_class;
                        }

                        skb = cl->q->dequeue(cl->q);

                        /* Class did not give us any skb :-(
                         * It could occur even if cl->q->q.qlen != 0,
                         * e.g. if cl->q == "tbf"
                         */
                        if (skb == NULL)
                                goto skip_class;

                        cl->deficit -= qdisc_pkt_len(skb);
                        q->tx_class = cl;
                        q->tx_borrowed = borrow;
                        if (borrow != cl) {
#ifndef CBQ_XSTATS_BORROWS_BYTES
                                borrow->xstats.borrows++;
                                cl->xstats.borrows++;
#else
                                borrow->xstats.borrows += qdisc_pkt_len(skb);
                                cl->xstats.borrows += qdisc_pkt_len(skb);
#endif
                        }
                        q->tx_len = qdisc_pkt_len(skb);

                        if (cl->deficit <= 0) {
                                q->active[prio] = cl;
                                cl = cl->next_alive;
                                cl->deficit += cl->quantum;
                        }
                        return skb;

skip_class:
                        if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
                                /* Class is empty or penalized.
                                 * Unlink it from the active chain.
                                 */
                                cl_prev->next_alive = cl->next_alive;
                                cl->next_alive = NULL;

                                /* Did cl_tail point to it? */
                                if (cl == cl_tail) {
                                        /* Repair it! */
                                        cl_tail = cl_prev;

                                        /* Was it the last class in this band? */
                                        if (cl == cl_tail) {
                                                /* Kill the band! */
                                                q->active[prio] = NULL;
                                                q->activemask &= ~(1<<prio);
                                                if (cl->q->q.qlen)
                                                        cbq_activate_class(cl);
                                                return NULL;
                                        }

                                        q->active[prio] = cl_tail;
                                }
                                if (cl->q->q.qlen)
                                        cbq_activate_class(cl);

                                cl = cl_prev;
                        }

next_class:
                        cl_prev = cl;
                        cl = cl->next_alive;
                } while (cl_prev != cl_tail);
        } while (deficit);

        q->active[prio] = cl_prev;

        return NULL;
}

static inline struct sk_buff *
cbq_dequeue_1(struct Qdisc *sch)
{
        struct cbq_sched_data *q = qdisc_priv(sch);
        struct sk_buff *skb;
        unsigned int activemask;

        activemask = q->activemask & 0xFF;
        while (activemask) {
                int prio = ffz(~activemask);
                activemask &= ~(1<<prio);
                skb = cbq_dequeue_prio(sch, prio);
                if (skb)
                        return skb;
        }
        return NULL;
}

static struct sk_buff *
cbq_dequeue(struct Qdisc *sch)
{
        struct sk_buff *skb;
        struct cbq_sched_data *q = qdisc_priv(sch);
        psched_time_t now;

        now = psched_get_time();

        if (q->tx_class)
                cbq_update(q);

        q->now = now;

        for (;;) {
                q->wd_expires = 0;

                skb = cbq_dequeue_1(sch);
                if (skb) {
                        qdisc_bstats_update(sch, skb);
                        sch->q.qlen--;
                        return skb;
                }

                /* All the classes are overlimit.
                 *
                 * This is possible if:
                 *
                 * 1. The scheduler is empty.
                 * 2. The toplevel cutoff inhibited borrowing.
                 * 3. The root class is overlimit.
                 *
                 * Reset the 2nd and 3rd conditions and retry.
                 *
                 * Note that NS and cbq-2.0 are buggy here: peeking at
                 * an arbitrary class is appropriate for ancestor-only
                 * sharing, but not for the toplevel algorithm.
                 *
                 * Our version is better but slower, because it requires
                 * two passes; that is unavoidable with top-level sharing.
                 */

                if (q->toplevel == TC_CBQ_MAXLEVEL &&
                    q->link.undertime == PSCHED_PASTPERFECT)
                        break;

                q->toplevel = TC_CBQ_MAXLEVEL;
                q->link.undertime = PSCHED_PASTPERFECT;
        }

        /* No packets in the scheduler, or nobody wants to give them to us :-(
         * Sigh... start the watchdog timer in the latter case.
         */

        if (sch->q.qlen) {
                qdisc_qstats_overlimit(sch);
                if (q->wd_expires)
                        qdisc_watchdog_schedule(&q->watchdog,
                                                now + q->wd_expires);
        }
        return NULL;
}

/* CBQ class maintenance routines */

static void cbq_adjust_levels(struct cbq_class *this)
{
        if (this == NULL)
                return;

        do {
                int level = 0;
                struct cbq_class *cl;

                cl = this->children;
                if (cl) {
                        do {
                                if (cl->level > level)
                                        level = cl->level;
                        } while ((cl = cl->sibling) != this->children);
                }
                this->level = level + 1;
        } while ((this = this->tparent) != NULL);
}

static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
{
        struct cbq_class *cl;
        unsigned int h;

        if (q->quanta[prio] == 0)
                return;

        for (h = 0; h < q->clhash.hashsize; h++) {
                hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
                        /* BUGGGG... Beware! This expression suffers from
                         * arithmetic overflows!
                         */
                        if (cl->priority == prio) {
                                cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
                                        q->quanta[prio];
                        }
                        if (cl->quantum <= 0 ||
                            cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) {
                                pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n",
                                        cl->common.classid, cl->quantum);
                                cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
                        }
                }
        }
}

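/*
 * defmap is a bitmask indexed by logical priority (TC_PRIO_*): bit i set
 * on a class means it volunteers to be the default destination for
 * priority i under its split node. cbq_sync_defmap() below rebuilds the
 * split node's defaults[] from these masks, preferring the lowest-level
 * matching class.
 */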
static void cbq_sync_defmap(struct cbq_class *cl)
{
        struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
        struct cbq_class *split = cl->split;
        unsigned int h;
        int i;

        if (split == NULL)
                return;

        for (i = 0; i <= TC_PRIO_MAX; i++) {
                if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
                        split->defaults[i] = NULL;
        }

        for (i = 0; i <= TC_PRIO_MAX; i++) {
                int level = split->level;

                if (split->defaults[i])
                        continue;

                for (h = 0; h < q->clhash.hashsize; h++) {
                        struct cbq_class *c;

                        hlist_for_each_entry(c, &q->clhash.hash[h],
                                             common.hnode) {
                                if (c->split == split && c->level < level &&
                                    c->defmap & (1<<i)) {
                                        split->defaults[i] = c;
                                        level = c->level;
                                }
                        }
                }
        }
}

static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
{
        struct cbq_class *split = NULL;

        if (splitid == 0) {
                split = cl->split;
                if (!split)
                        return;
                splitid = split->common.classid;
        }

        if (split == NULL || split->common.classid != splitid) {
                for (split = cl->tparent; split; split = split->tparent)
                        if (split->common.classid == splitid)
                                break;
        }

        if (split == NULL)
                return;

        if (cl->split != split) {
                cl->defmap = 0;
                cbq_sync_defmap(cl);
                cl->split = split;
                cl->defmap = def & mask;
        } else
                cl->defmap = (cl->defmap & ~mask) | (def & mask);

        cbq_sync_defmap(cl);
}

static void cbq_unlink_class(struct cbq_class *this)
{
        struct cbq_class *cl, **clp;
        struct cbq_sched_data *q = qdisc_priv(this->qdisc);

        qdisc_class_hash_remove(&q->clhash, &this->common);

        if (this->tparent) {
                clp = &this->sibling;
                cl = *clp;
                do {
                        if (cl == this) {
                                *clp = cl->sibling;
                                break;
                        }
                        clp = &cl->sibling;
                } while ((cl = *clp) != this->sibling);

                if (this->tparent->children == this) {
                        this->tparent->children = this->sibling;
                        if (this->sibling == this)
                                this->tparent->children = NULL;
                }
        } else {
                WARN_ON(this->sibling != this);
        }
}

static void cbq_link_class(struct cbq_class *this)
{
        struct cbq_sched_data *q = qdisc_priv(this->qdisc);
        struct cbq_class *parent = this->tparent;

        this->sibling = this;
        qdisc_class_hash_insert(&q->clhash, &this->common);

        if (parent == NULL)
                return;

        if (parent->children == NULL) {
                parent->children = this;
        } else {
                this->sibling = parent->children->sibling;
                parent->children->sibling = this;
        }
}

static void
cbq_reset(struct Qdisc *sch)
{
        struct cbq_sched_data *q = qdisc_priv(sch);
        struct cbq_class *cl;
        int prio;
        unsigned int h;

        q->activemask = 0;
        q->pmask = 0;
        q->tx_class = NULL;
        q->tx_borrowed = NULL;
        qdisc_watchdog_cancel(&q->watchdog);
        hrtimer_cancel(&q->delay_timer);
        q->toplevel = TC_CBQ_MAXLEVEL;
        q->now = psched_get_time();

        for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
                q->active[prio] = NULL;

        for (h = 0; h < q->clhash.hashsize; h++) {
                hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
                        qdisc_reset(cl->q);

                        cl->next_alive = NULL;
                        cl->undertime = PSCHED_PASTPERFECT;
                        cl->avgidle = cl->maxidle;
                        cl->deficit = cl->quantum;
                        cl->cpriority = cl->priority;
                }
        }
}


static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
{
        if (lss->change & TCF_CBQ_LSS_FLAGS) {
                cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
                cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
        }
        if (lss->change & TCF_CBQ_LSS_EWMA)
                cl->ewma_log = lss->ewma_log;
        if (lss->change & TCF_CBQ_LSS_AVPKT)
                cl->avpkt = lss->avpkt;
        if (lss->change & TCF_CBQ_LSS_MINIDLE)
                cl->minidle = -(long)lss->minidle;
        if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
                cl->maxidle = lss->maxidle;
                cl->avgidle = lss->maxidle;
        }
        if (lss->change & TCF_CBQ_LSS_OFFTIME)
                cl->offtime = lss->offtime;
        return 0;
}

static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
{
        q->nclasses[cl->priority]--;
        q->quanta[cl->priority] -= cl->weight;
        cbq_normalize_quanta(q, cl->priority);
}

static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
{
        q->nclasses[cl->priority]++;
        q->quanta[cl->priority] += cl->weight;
        cbq_normalize_quanta(q, cl->priority);
}

static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
{
        struct cbq_sched_data *q = qdisc_priv(cl->qdisc);

        if (wrr->allot)
                cl->allot = wrr->allot;
        if (wrr->weight)
                cl->weight = wrr->weight;
        if (wrr->priority) {
                cl->priority = wrr->priority - 1;
                cl->cpriority = cl->priority;
                if (cl->priority >= cl->priority2)
                        cl->priority2 = TC_CBQ_MAXPRIO - 1;
        }

        cbq_addprio(q, cl);
        return 0;
}

static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
{
        cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
        return 0;
}

static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
        [TCA_CBQ_LSSOPT]        = { .len = sizeof(struct tc_cbq_lssopt) },
        [TCA_CBQ_WRROPT]        = { .len = sizeof(struct tc_cbq_wrropt) },
        [TCA_CBQ_FOPT]          = { .len = sizeof(struct tc_cbq_fopt) },
        [TCA_CBQ_OVL_STRATEGY]  = { .len = sizeof(struct tc_cbq_ovl) },
        [TCA_CBQ_RATE]          = { .len = sizeof(struct tc_ratespec) },
        [TCA_CBQ_RTAB]          = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
        [TCA_CBQ_POLICE]        = { .len = sizeof(struct tc_cbq_police) },
};

static int cbq_opt_parse(struct nlattr *tb[TCA_CBQ_MAX + 1],
                         struct nlattr *opt,
                         struct netlink_ext_ack *extack)
{
        int err;

        if (!opt) {
                NL_SET_ERR_MSG(extack, "CBQ options are required for this operation");
                return -EINVAL;
        }

        err = nla_parse_nested_deprecated(tb, TCA_CBQ_MAX, opt,
                                          cbq_policy, extack);
        if (err < 0)
                return err;

        if (tb[TCA_CBQ_WRROPT]) {
                const struct tc_cbq_wrropt *wrr = nla_data(tb[TCA_CBQ_WRROPT]);

                if (wrr->priority > TC_CBQ_MAXPRIO) {
                        NL_SET_ERR_MSG(extack, "priority is bigger than TC_CBQ_MAXPRIO");
                        err = -EINVAL;
                }
        }
        return err;
}

static int cbq_init(struct Qdisc *sch, struct nlattr *opt,
                    struct netlink_ext_ack *extack)
{
        struct cbq_sched_data *q = qdisc_priv(sch);
        struct nlattr *tb[TCA_CBQ_MAX + 1];
        struct tc_ratespec *r;
        int err;

        qdisc_watchdog_init(&q->watchdog, sch);
        hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
        q->delay_timer.function = cbq_undelay;

        err = cbq_opt_parse(tb, opt, extack);
        if (err < 0)
                return err;

        if (!tb[TCA_CBQ_RTAB] || !tb[TCA_CBQ_RATE]) {
                NL_SET_ERR_MSG(extack, "Rate specification missing or incomplete");
                return -EINVAL;
        }

        r = nla_data(tb[TCA_CBQ_RATE]);

        q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB], extack);
        if (!q->link.R_tab)
                return -EINVAL;

        err = tcf_block_get(&q->link.block, &q->link.filter_list, sch, extack);
        if (err)
                goto put_rtab;

        err = qdisc_class_hash_init(&q->clhash);
        if (err < 0)
                goto put_block;

        q->link.sibling = &q->link;
        q->link.common.classid = sch->handle;
        q->link.qdisc = sch;
        q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
                                      sch->handle, NULL);
        if (!q->link.q)
                q->link.q = &noop_qdisc;
        else
                qdisc_hash_add(q->link.q, true);

        q->link.priority = TC_CBQ_MAXPRIO - 1;
        q->link.priority2 = TC_CBQ_MAXPRIO - 1;
        q->link.cpriority = TC_CBQ_MAXPRIO - 1;
        q->link.allot = psched_mtu(qdisc_dev(sch));
        q->link.quantum = q->link.allot;
        q->link.weight = q->link.R_tab->rate.rate;

        q->link.ewma_log = TC_CBQ_DEF_EWMA;
        q->link.avpkt = q->link.allot/2;
        q->link.minidle = -0x7FFFFFFF;

        q->toplevel = TC_CBQ_MAXLEVEL;
        q->now = psched_get_time();

        cbq_link_class(&q->link);

        if (tb[TCA_CBQ_LSSOPT])
                cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));

        cbq_addprio(q, &q->link);
        return 0;

put_block:
        tcf_block_put(q->link.block);

put_rtab:
        qdisc_put_rtab(q->link.R_tab);
        return err;
}

static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
{
        unsigned char *b = skb_tail_pointer(skb);

        if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
                goto nla_put_failure;
        return skb->len;

nla_put_failure:
        nlmsg_trim(skb, b);
        return -1;
}

static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
{
        unsigned char *b = skb_tail_pointer(skb);
        struct tc_cbq_lssopt opt;

        opt.flags = 0;
        if (cl->borrow == NULL)
                opt.flags |= TCF_CBQ_LSS_BOUNDED;
        if (cl->share == NULL)
                opt.flags |= TCF_CBQ_LSS_ISOLATED;
        opt.ewma_log = cl->ewma_log;
        opt.level = cl->level;
        opt.avpkt = cl->avpkt;
        opt.maxidle = cl->maxidle;
        opt.minidle = (u32)(-cl->minidle);
        opt.offtime = cl->offtime;
        opt.change = ~0;
        if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
                goto nla_put_failure;
        return skb->len;

nla_put_failure:
        nlmsg_trim(skb, b);
        return -1;
}

static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
{
        unsigned char *b = skb_tail_pointer(skb);
        struct tc_cbq_wrropt opt;

        memset(&opt, 0, sizeof(opt));
        opt.flags = 0;
        opt.allot = cl->allot;
        opt.priority = cl->priority + 1;
        opt.cpriority = cl->cpriority + 1;
        opt.weight = cl->weight;
        if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
                goto nla_put_failure;
        return skb->len;

nla_put_failure:
        nlmsg_trim(skb, b);
        return -1;
}

static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
{
        unsigned char *b = skb_tail_pointer(skb);
        struct tc_cbq_fopt opt;

        if (cl->split || cl->defmap) {
                opt.split = cl->split ? cl->split->common.classid : 0;
                opt.defmap = cl->defmap;
                opt.defchange = ~0;
                if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
                        goto nla_put_failure;
        }
        return skb->len;

nla_put_failure:
        nlmsg_trim(skb, b);
        return -1;
}

static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
{
        if (cbq_dump_lss(skb, cl) < 0 ||
            cbq_dump_rate(skb, cl) < 0 ||
            cbq_dump_wrr(skb, cl) < 0 ||
            cbq_dump_fopt(skb, cl) < 0)
                return -1;
        return 0;
}

static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
        struct cbq_sched_data *q = qdisc_priv(sch);
        struct nlattr *nest;

        nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
        if (nest == NULL)
                goto nla_put_failure;
        if (cbq_dump_attr(skb, &q->link) < 0)
                goto nla_put_failure;
        return nla_nest_end(skb, nest);

nla_put_failure:
        nla_nest_cancel(skb, nest);
        return -1;
}

static int
cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
        struct cbq_sched_data *q = qdisc_priv(sch);

        q->link.xstats.avgidle = q->link.avgidle;
        return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
}

static int
cbq_dump_class(struct Qdisc *sch, unsigned long arg,
               struct sk_buff *skb, struct tcmsg *tcm)
{
        struct cbq_class *cl = (struct cbq_class *)arg;
        struct nlattr *nest;

        if (cl->tparent)
                tcm->tcm_parent = cl->tparent->common.classid;
        else
                tcm->tcm_parent = TC_H_ROOT;
        tcm->tcm_handle = cl->common.classid;
        tcm->tcm_info = cl->q->handle;

        nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
        if (nest == NULL)
                goto nla_put_failure;
        if (cbq_dump_attr(skb, cl) < 0)
                goto nla_put_failure;
        return nla_nest_end(skb, nest);

nla_put_failure:
        nla_nest_cancel(skb, nest);
        return -1;
}

static int
cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
        struct gnet_dump *d)
{
        struct cbq_sched_data *q = qdisc_priv(sch);
        struct cbq_class *cl = (struct cbq_class *)arg;
        __u32 qlen;

        cl->xstats.avgidle = cl->avgidle;
        cl->xstats.undertime = 0;
        qdisc_qstats_qlen_backlog(cl->q, &qlen, &cl->qstats.backlog);

        if (cl->undertime != PSCHED_PASTPERFECT)
                cl->xstats.undertime = cl->undertime - q->now;

        if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
                                  d, NULL, &cl->bstats) < 0 ||
            gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
            gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0)
                return -1;

        return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
}

static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
                     struct Qdisc **old, struct netlink_ext_ack *extack)
{
        struct cbq_class *cl = (struct cbq_class *)arg;

        if (new == NULL) {
                new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
                                        cl->common.classid, extack);
                if (new == NULL)
                        return -ENOBUFS;
        }

        *old = qdisc_replace(sch, new, &cl->q);
        return 0;
}

static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
{
        struct cbq_class *cl = (struct cbq_class *)arg;

        return cl->q;
}

static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
{
        struct cbq_class *cl = (struct cbq_class *)arg;

        cbq_deactivate_class(cl);
}

static unsigned long cbq_find(struct Qdisc *sch, u32 classid)
{
        struct cbq_sched_data *q = qdisc_priv(sch);

        return (unsigned long)cbq_class_lookup(q, classid);
}

static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
{
        struct cbq_sched_data *q = qdisc_priv(sch);

        WARN_ON(cl->filters);

        tcf_block_put(cl->block);
        qdisc_put(cl->q);
        qdisc_put_rtab(cl->R_tab);
        gen_kill_estimator(&cl->rate_est);
        if (cl != &q->link)
                kfree(cl);
}

static void cbq_destroy(struct Qdisc *sch)
{
        struct cbq_sched_data *q = qdisc_priv(sch);
        struct hlist_node *next;
        struct cbq_class *cl;
        unsigned int h;

#ifdef CONFIG_NET_CLS_ACT
        q->rx_class = NULL;
#endif
        /*
         * Filters must be destroyed first because we don't destroy the
         * classes from root to leaves, which means that filters can still
         * be bound to classes which have been destroyed already. --TGR '04
         */
1461         for (h = 0; h < q->clhash.hashsize; h++) {
1462                 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1463                         tcf_block_put(cl->block);
1464                         cl->block = NULL;
1465                 }
1466         }
1467         for (h = 0; h < q->clhash.hashsize; h++) {
1468                 hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
1469                                           common.hnode)
1470                         cbq_destroy_class(sch, cl);
1471         }
1472         qdisc_class_hash_destroy(&q->clhash);
1473 }
1474
1475 static int
1476 cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
1477                  unsigned long *arg, struct netlink_ext_ack *extack)
1478 {
1479         int err;
1480         struct cbq_sched_data *q = qdisc_priv(sch);
1481         struct cbq_class *cl = (struct cbq_class *)*arg;
1482         struct nlattr *opt = tca[TCA_OPTIONS];
1483         struct nlattr *tb[TCA_CBQ_MAX + 1];
1484         struct cbq_class *parent;
1485         struct qdisc_rate_table *rtab = NULL;
1486
1487         err = cbq_opt_parse(tb, opt, extack);
1488         if (err < 0)
1489                 return err;
1490
1491         if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE]) {
1492                 NL_SET_ERR_MSG(extack, "Neither overlimit strategy nor policing attributes can be used for changing class params");
1493                 return -EOPNOTSUPP;
1494         }
1495
1496         if (cl) {
1497                 /* Check parent */
1498                 if (parentid) {
1499                         if (cl->tparent &&
1500                             cl->tparent->common.classid != parentid) {
1501                                 NL_SET_ERR_MSG(extack, "Invalid parent id");
1502                                 return -EINVAL;
1503                         }
1504                         if (!cl->tparent && parentid != TC_H_ROOT) {
1505                                 NL_SET_ERR_MSG(extack, "Parent must be root");
1506                                 return -EINVAL;
1507                         }
1508                 }
1509
1510                 if (tb[TCA_CBQ_RATE]) {
1511                         rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
1512                                               tb[TCA_CBQ_RTAB], extack);
1513                         if (rtab == NULL)
1514                                 return -EINVAL;
1515                 }
1516
1517                 if (tca[TCA_RATE]) {
1518                         err = gen_replace_estimator(&cl->bstats, NULL,
1519                                                     &cl->rate_est,
1520                                                     NULL,
1521                                                     qdisc_root_sleeping_running(sch),
1522                                                     tca[TCA_RATE]);
1523                         if (err) {
1524                                 NL_SET_ERR_MSG(extack, "Failed to replace specified rate estimator");
1525                                 qdisc_put_rtab(rtab);
1526                                 return err;
1527                         }
1528                 }
1529
1530                 /* Change class parameters */
1531                 sch_tree_lock(sch);
1532
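                     /* Take the class off the active list while its parameters change; it is re-activated below if its queue is still non-empty. */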
1533                 if (cl->next_alive)
1534                         cbq_deactivate_class(cl);
1535
1536                 if (rtab) {
1537                         qdisc_put_rtab(cl->R_tab);
1538                         cl->R_tab = rtab;
1539                 }
1540
1541                 if (tb[TCA_CBQ_LSSOPT])
1542                         cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1543
1544                 if (tb[TCA_CBQ_WRROPT]) {
1545                         cbq_rmprio(q, cl);
1546                         cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1547                 }
1548
1549                 if (tb[TCA_CBQ_FOPT])
1550                         cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1551
1552                 if (cl->q->q.qlen)
1553                         cbq_activate_class(cl);
1554
1555                 sch_tree_unlock(sch);
1556
1557                 return 0;
1558         }
1559
1560         if (parentid == TC_H_ROOT)
1561                 return -EINVAL;
1562
1563         if (!tb[TCA_CBQ_WRROPT] || !tb[TCA_CBQ_RATE] || !tb[TCA_CBQ_LSSOPT]) {
1564                 NL_SET_ERR_MSG(extack, "All of the following attributes MUST be specified: WRR, rate and link sharing");
1565                 return -EINVAL;
1566         }
1567
1568         rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB],
1569                               extack);
1570         if (rtab == NULL)
1571                 return -EINVAL;
1572
1573         if (classid) {
1574                 err = -EINVAL;
1575                 if (TC_H_MAJ(classid ^ sch->handle) ||
1576                     cbq_class_lookup(q, classid)) {
1577                         NL_SET_ERR_MSG(extack, "Class already exists or classid's major does not match qdisc handle");
1578                         goto failure;
1579                 }
1580         } else {
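                     /* No classid supplied: generate one by picking an unused minor in the 0x8001-0xFFFF range. */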
1581                 int i;
1582                 classid = TC_H_MAKE(sch->handle, 0x8000);
1583
1584                 for (i = 0; i < 0x8000; i++) {
1585                         if (++q->hgenerator >= 0x8000)
1586                                 q->hgenerator = 1;
1587                         if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1588                                 break;
1589                 }
1590                 err = -ENOSR;
1591                 if (i >= 0x8000) {
1592                         NL_SET_ERR_MSG(extack, "Unable to generate classid");
1593                         goto failure;
1594                 }
1595                 classid = classid|q->hgenerator;
1596         }
1597
1598         parent = &q->link;
1599         if (parentid) {
1600                 parent = cbq_class_lookup(q, parentid);
1601                 err = -EINVAL;
1602                 if (!parent) {
1603                         NL_SET_ERR_MSG(extack, "Specified parent class not found");
1604                         goto failure;
1605                 }
1606         }
1607
1608         err = -ENOBUFS;
1609         cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1610         if (cl == NULL)
1611                 goto failure;
1612
1613         err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
1614         if (err) {
1615                 kfree(cl);
1616                 goto failure;
1617         }
1618
1619         if (tca[TCA_RATE]) {
1620                 err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
1621                                         NULL,
1622                                         qdisc_root_sleeping_running(sch),
1623                                         tca[TCA_RATE]);
1624                 if (err) {
1625                         NL_SET_ERR_MSG(extack, "Couldn't create new estimator");
1626                         tcf_block_put(cl->block);
1627                         kfree(cl);
1628                         goto failure;
1629                 }
1630         }
1631
1632         cl->R_tab = rtab;
1633         rtab = NULL;
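             /* Create a default pfifo leaf; fall back to noop_qdisc on failure so cl->q is never NULL. */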
1634         cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid,
1635                                   NULL);
1636         if (!cl->q)
1637                 cl->q = &noop_qdisc;
1638         else
1639                 qdisc_hash_add(cl->q, true);
1640
1641         cl->common.classid = classid;
1642         cl->tparent = parent;
1643         cl->qdisc = sch;
1644         cl->allot = parent->allot;
1645         cl->quantum = cl->allot;
1646         cl->weight = cl->R_tab->rate.rate;
1647
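             /* Link the new class into the hierarchy under the qdisc tree lock, inheriting any unset parameters from the root (link) class. */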
1648         sch_tree_lock(sch);
1649         cbq_link_class(cl);
1650         cl->borrow = cl->tparent;
1651         if (cl->tparent != &q->link)
1652                 cl->share = cl->tparent;
1653         cbq_adjust_levels(parent);
1654         cl->minidle = -0x7FFFFFFF;
1655         cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1656         cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1657         if (cl->ewma_log == 0)
1658                 cl->ewma_log = q->link.ewma_log;
1659         if (cl->maxidle == 0)
1660                 cl->maxidle = q->link.maxidle;
1661         if (cl->avpkt == 0)
1662                 cl->avpkt = q->link.avpkt;
1663         if (tb[TCA_CBQ_FOPT])
1664                 cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1665         sch_tree_unlock(sch);
1666
1667         qdisc_class_hash_grow(sch, &q->clhash);
1668
1669         *arg = (unsigned long)cl;
1670         return 0;
1671
1672 failure:
1673         qdisc_put_rtab(rtab);
1674         return err;
1675 }
1676
1677 static int cbq_delete(struct Qdisc *sch, unsigned long arg,
1678                       struct netlink_ext_ack *extack)
1679 {
1680         struct cbq_sched_data *q = qdisc_priv(sch);
1681         struct cbq_class *cl = (struct cbq_class *)arg;
1682
1683         if (cl->filters || cl->children || cl == &q->link)
1684                 return -EBUSY;
1685
1686         sch_tree_lock(sch);
1687
1688         qdisc_purge_queue(cl->q);
1689
1690         if (cl->next_alive)
1691                 cbq_deactivate_class(cl);
1692
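             /* Clear any cached transmit/borrow pointers to this class so dequeue cannot reference it once it is freed. */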
1693         if (q->tx_borrowed == cl)
1694                 q->tx_borrowed = q->tx_class;
1695         if (q->tx_class == cl) {
1696                 q->tx_class = NULL;
1697                 q->tx_borrowed = NULL;
1698         }
1699 #ifdef CONFIG_NET_CLS_ACT
1700         if (q->rx_class == cl)
1701                 q->rx_class = NULL;
1702 #endif
1703
1704         cbq_unlink_class(cl);
1705         cbq_adjust_levels(cl->tparent);
1706         cl->defmap = 0;
1707         cbq_sync_defmap(cl);
1708
1709         cbq_rmprio(q, cl);
1710         sch_tree_unlock(sch);
1711
1712         cbq_destroy_class(sch, cl);
1713         return 0;
1714 }
1715
1716 static struct tcf_block *cbq_tcf_block(struct Qdisc *sch, unsigned long arg,
1717                                        struct netlink_ext_ack *extack)
1718 {
1719         struct cbq_sched_data *q = qdisc_priv(sch);
1720         struct cbq_class *cl = (struct cbq_class *)arg;
1721
1722         if (cl == NULL)
1723                 cl = &q->link;
1724
1725         return cl->block;
1726 }
1727
1728 static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
1729                                      u32 classid)
1730 {
1731         struct cbq_sched_data *q = qdisc_priv(sch);
1732         struct cbq_class *p = (struct cbq_class *)parent;
1733         struct cbq_class *cl = cbq_class_lookup(q, classid);
1734
1735         if (cl) {
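                     /* Refuse to bind a filter to a class that is not strictly below its parent in the hierarchy. */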
1736                 if (p && p->level <= cl->level)
1737                         return 0;
1738                 cl->filters++;
1739                 return (unsigned long)cl;
1740         }
1741         return 0;
1742 }
1743
1744 static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
1745 {
1746         struct cbq_class *cl = (struct cbq_class *)arg;
1747
1748         cl->filters--;
1749 }
1750
1751 static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1752 {
1753         struct cbq_sched_data *q = qdisc_priv(sch);
1754         struct cbq_class *cl;
1755         unsigned int h;
1756
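             /* arg->skip and arg->count implement the dump cursor: skip classes already reported, stop when the callback says so. */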
1757         if (arg->stop)
1758                 return;
1759
1760         for (h = 0; h < q->clhash.hashsize; h++) {
1761                 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1762                         if (arg->count < arg->skip) {
1763                                 arg->count++;
1764                                 continue;
1765                         }
1766                         if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1767                                 arg->stop = 1;
1768                                 return;
1769                         }
1770                         arg->count++;
1771                 }
1772         }
1773 }
1774
1775 static const struct Qdisc_class_ops cbq_class_ops = {
1776         .graft          =       cbq_graft,
1777         .leaf           =       cbq_leaf,
1778         .qlen_notify    =       cbq_qlen_notify,
1779         .find           =       cbq_find,
1780         .change         =       cbq_change_class,
1781         .delete         =       cbq_delete,
1782         .walk           =       cbq_walk,
1783         .tcf_block      =       cbq_tcf_block,
1784         .bind_tcf       =       cbq_bind_filter,
1785         .unbind_tcf     =       cbq_unbind_filter,
1786         .dump           =       cbq_dump_class,
1787         .dump_stats     =       cbq_dump_class_stats,
1788 };
1789
1790 static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
1791         .next           =       NULL,
1792         .cl_ops         =       &cbq_class_ops,
1793         .id             =       "cbq",
1794         .priv_size      =       sizeof(struct cbq_sched_data),
1795         .enqueue        =       cbq_enqueue,
1796         .dequeue        =       cbq_dequeue,
1797         .peek           =       qdisc_peek_dequeued,
1798         .init           =       cbq_init,
1799         .reset          =       cbq_reset,
1800         .destroy        =       cbq_destroy,
1801         .change         =       NULL,
1802         .dump           =       cbq_dump,
1803         .dump_stats     =       cbq_dump_stats,
1804         .owner          =       THIS_MODULE,
1805 };
1806
1807 static int __init cbq_module_init(void)
1808 {
1809         return register_qdisc(&cbq_qdisc_ops);
1810 }
1811 static void __exit cbq_module_exit(void)
1812 {
1813         unregister_qdisc(&cbq_qdisc_ops);
1814 }
1815 module_init(cbq_module_init)
1816 module_exit(cbq_module_exit)
1817 MODULE_LICENSE("GPL");