1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2
3 /* COMMON Applications Kept Enhanced (CAKE) discipline
4  *
5  * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
6  * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
7  * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
8  * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
9  * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
10  * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au>
11  *
12  * The CAKE Principles:
13  *                 (or, how to have your cake and eat it too)
14  *
15  * This is a combination of several shaping, AQM and FQ techniques into one
16  * easy-to-use package:
17  *
18  * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
19  *   equipment and bloated MACs.  This operates in deficit mode (as in sch_fq),
20  *   eliminating the need for any sort of burst parameter (eg. token bucket
21  *   depth).  Burst support is limited to that necessary to overcome scheduling
22  *   latency.
23  *
24  * - A Diffserv-aware priority queue, giving more priority to certain classes,
25  *   up to a specified fraction of bandwidth.  Above that bandwidth threshold,
26  *   the priority is reduced to avoid starving other tins.
27  *
28  * - Each priority tin has a separate Flow Queue system, to isolate traffic
29  *   flows from each other.  This prevents a burst on one flow from increasing
30  *   the delay to another.  Flows are distributed to queues using a
31  *   set-associative hash function.
32  *
33  * - Each queue is actively managed by Cobalt, which is a combination of the
34  *   Codel and Blue AQM algorithms.  This serves flows fairly, and signals
35  *   congestion early via ECN (if available) and/or packet drops, to keep
36  *   latency low.  The codel parameters are auto-tuned based on the bandwidth
37  *   setting, as is necessary at low bandwidths.
38  *
39  * The configuration parameters are kept deliberately simple for ease of use.
40  * Everything has sane defaults.  Complete generality of configuration is *not*
41  * a goal.
42  *
43  * The priority queue operates according to a weighted DRR scheme, combined with
44  * a bandwidth tracker which reuses the shaper logic to detect which side of the
45  * bandwidth sharing threshold the tin is operating on.  This determines whether a
46  * priority-based weight (high) or a bandwidth-based weight (low) is used for
47  * that tin in the current pass.
48  *
49  * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
50  * granted us permission to leverage.
51  */
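/* For orientation, a typical way to instantiate this qdisc from userspace
 * (interface name and rate are illustrative only) is:
 *
 *   tc qdisc replace dev eth0 root cake bandwidth 20mbit diffserv4 nat
 *
 * where each keyword corresponds to one of the TCA_CAKE_* netlink
 * attributes handled later in this file.
 */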
52
53 #include <linux/module.h>
54 #include <linux/types.h>
55 #include <linux/kernel.h>
56 #include <linux/jiffies.h>
57 #include <linux/string.h>
58 #include <linux/in.h>
59 #include <linux/errno.h>
60 #include <linux/init.h>
61 #include <linux/skbuff.h>
62 #include <linux/jhash.h>
63 #include <linux/slab.h>
64 #include <linux/vmalloc.h>
65 #include <linux/reciprocal_div.h>
66 #include <net/netlink.h>
67 #include <linux/if_vlan.h>
68 #include <net/pkt_sched.h>
69 #include <net/pkt_cls.h>
70 #include <net/tcp.h>
71 #include <net/flow_dissector.h>
72
73 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
74 #include <net/netfilter/nf_conntrack_core.h>
75 #endif
76
77 #define CAKE_SET_WAYS (8)
78 #define CAKE_MAX_TINS (8)
79 #define CAKE_QUEUES (1024)
80 #define CAKE_FLOW_MASK 63
81 #define CAKE_FLOW_NAT_FLAG 64
82
83 /* struct cobalt_params - contains codel and blue parameters
84  * @interval:   codel initial drop rate
85  * @target:     maximum persistent sojourn time & blue update rate
86  * @mtu_time:   serialisation delay of maximum-size packet
87  * @p_inc:      increment of blue drop probability (0.32 fxp)
88  * @p_dec:      decrement of blue drop probability (0.32 fxp)
89  */
90 struct cobalt_params {
91         u64     interval;
92         u64     target;
93         u64     mtu_time;
94         u32     p_inc;
95         u32     p_dec;
96 };
97
98 /* struct cobalt_vars - contains codel and blue variables
99  * @count:              codel dropping frequency
100  * @rec_inv_sqrt:       reciprocal value of sqrt(count) >> 1
101  * @drop_next:          time to drop next packet, or when we dropped last
102  * @blue_timer:         Blue time to next drop
103  * @p_drop:             BLUE drop probability (0.32 fxp)
104  * @dropping:           set if in dropping state
105  * @ecn_marked:         set if marked
106  */
107 struct cobalt_vars {
108         u32     count;
109         u32     rec_inv_sqrt;
110         ktime_t drop_next;
111         ktime_t blue_timer;
112         u32     p_drop;
113         bool    dropping;
114         bool    ecn_marked;
115 };
116
117 enum {
118         CAKE_SET_NONE = 0,
119         CAKE_SET_SPARSE,
120         CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */
121         CAKE_SET_BULK,
122         CAKE_SET_DECAYING
123 };
124
125 struct cake_flow {
126         /* this stuff is all needed per-flow at dequeue time */
127         struct sk_buff    *head;
128         struct sk_buff    *tail;
129         struct list_head  flowchain;
130         s32               deficit;
131         u32               dropped;
132         struct cobalt_vars cvars;
133         u16               srchost; /* index into cake_host table */
134         u16               dsthost;
135         u8                set;
136 }; /* please try to keep this structure <= 64 bytes */
137
138 struct cake_host {
139         u32 srchost_tag;
140         u32 dsthost_tag;
141         u16 srchost_bulk_flow_count;
142         u16 dsthost_bulk_flow_count;
143 };
144
145 struct cake_heap_entry {
146         u16 t:3, b:10;
147 };
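/* Each heap entry packs a tin index (3 bits, CAKE_MAX_TINS == 8) and a
 * queue index (10 bits, CAKE_QUEUES == 1024) into one u16; the overflow
 * heap below holds one such entry per flow queue in every tin.
 */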
148
149 struct cake_tin_data {
150         struct cake_flow flows[CAKE_QUEUES];
151         u32     backlogs[CAKE_QUEUES];
152         u32     tags[CAKE_QUEUES]; /* for set association */
153         u16     overflow_idx[CAKE_QUEUES];
154         struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
155         u16     flow_quantum;
156
157         struct cobalt_params cparams;
158         u32     drop_overlimit;
159         u16     bulk_flow_count;
160         u16     sparse_flow_count;
161         u16     decaying_flow_count;
162         u16     unresponsive_flow_count;
163
164         u32     max_skblen;
165
166         struct list_head new_flows;
167         struct list_head old_flows;
168         struct list_head decaying_flows;
169
170         /* time_next = time_this + ((len * rate_ns) >> rate_shft) */
171         ktime_t time_next_packet;
172         u64     tin_rate_ns;
173         u64     tin_rate_bps;
174         u16     tin_rate_shft;
175
176         u16     tin_quantum;
177         s32     tin_deficit;
178         u32     tin_backlog;
179         u32     tin_dropped;
180         u32     tin_ecn_mark;
181
182         u32     packets;
183         u64     bytes;
184
185         u32     ack_drops;
186
187         /* moving averages */
188         u64 avge_delay;
189         u64 peak_delay;
190         u64 base_delay;
191
192         /* hash function stats */
193         u32     way_directs;
194         u32     way_hits;
195         u32     way_misses;
196         u32     way_collisions;
197 }; /* number of tins is small, so size of this struct doesn't matter much */
198
199 struct cake_sched_data {
200         struct tcf_proto __rcu *filter_list; /* optional external classifier */
201         struct tcf_block *block;
202         struct cake_tin_data *tins;
203
204         struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];
205         u16             overflow_timeout;
206
207         u16             tin_cnt;
208         u8              tin_mode;
209         u8              flow_mode;
210         u8              ack_filter;
211         u8              atm_mode;
212
213         u32             fwmark_mask;
214         u16             fwmark_shft;
215
216         /* time_next = time_this + ((len * rate_ns) >> rate_shft) */
217         u16             rate_shft;
218         ktime_t         time_next_packet;
219         ktime_t         failsafe_next_packet;
220         u64             rate_ns;
221         u64             rate_bps;
222         u16             rate_flags;
223         s16             rate_overhead;
224         u16             rate_mpu;
225         u64             interval;
226         u64             target;
227
228         /* resource tracking */
229         u32             buffer_used;
230         u32             buffer_max_used;
231         u32             buffer_limit;
232         u32             buffer_config_limit;
233
234         /* indices for dequeue */
235         u16             cur_tin;
236         u16             cur_flow;
237
238         struct qdisc_watchdog watchdog;
239         const u8        *tin_index;
240         const u8        *tin_order;
241
242         /* bandwidth capacity estimate */
243         ktime_t         last_packet_time;
244         ktime_t         avg_window_begin;
245         u64             avg_packet_interval;
246         u64             avg_window_bytes;
247         u64             avg_peak_bandwidth;
248         ktime_t         last_reconfig_time;
249
250         /* packet length stats */
251         u32             avg_netoff;
252         u16             max_netlen;
253         u16             max_adjlen;
254         u16             min_netlen;
255         u16             min_adjlen;
256 };
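/* Worked example for the rate formula above (numbers illustrative only):
 * rate_ns / 2^rate_shft is, in effect, the serialisation cost in
 * nanoseconds per byte.  At 20 Mbit/s (2.5 Mbyte/s) that cost is 400 ns per
 * byte, so a 1500-byte packet advances time_next_packet by
 * 1500 * 400 ns = 600 us.
 */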
257
258 enum {
259         CAKE_FLAG_OVERHEAD         = BIT(0),
260         CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
261         CAKE_FLAG_INGRESS          = BIT(2),
262         CAKE_FLAG_WASH             = BIT(3),
263         CAKE_FLAG_SPLIT_GSO        = BIT(4)
264 };
265
266 /* COBALT operates the Codel and BLUE algorithms in parallel, in order to
267  * obtain the best features of each.  Codel is excellent on flows which
268  * respond to congestion signals in a TCP-like way.  BLUE is more effective on
269  * unresponsive flows.
270  */
271
272 struct cobalt_skb_cb {
273         ktime_t enqueue_time;
274         u32     adjusted_len;
275 };
276
277 static u64 us_to_ns(u64 us)
278 {
279         return us * NSEC_PER_USEC;
280 }
281
282 static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
283 {
284         qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
285         return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data;
286 }
287
288 static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb)
289 {
290         return get_cobalt_cb(skb)->enqueue_time;
291 }
292
293 static void cobalt_set_enqueue_time(struct sk_buff *skb,
294                                     ktime_t now)
295 {
296         get_cobalt_cb(skb)->enqueue_time = now;
297 }
298
299 static u16 quantum_div[CAKE_QUEUES + 1] = {0};
300
301 /* Diffserv lookup tables */
302
303 static const u8 precedence[] = {
304         0, 0, 0, 0, 0, 0, 0, 0,
305         1, 1, 1, 1, 1, 1, 1, 1,
306         2, 2, 2, 2, 2, 2, 2, 2,
307         3, 3, 3, 3, 3, 3, 3, 3,
308         4, 4, 4, 4, 4, 4, 4, 4,
309         5, 5, 5, 5, 5, 5, 5, 5,
310         6, 6, 6, 6, 6, 6, 6, 6,
311         7, 7, 7, 7, 7, 7, 7, 7,
312 };
313
314 static const u8 diffserv8[] = {
315         2, 0, 1, 2, 4, 2, 2, 2,
316         1, 2, 1, 2, 1, 2, 1, 2,
317         5, 2, 4, 2, 4, 2, 4, 2,
318         3, 2, 3, 2, 3, 2, 3, 2,
319         6, 2, 3, 2, 3, 2, 3, 2,
320         6, 2, 2, 2, 6, 2, 6, 2,
321         7, 2, 2, 2, 2, 2, 2, 2,
322         7, 2, 2, 2, 2, 2, 2, 2,
323 };
324
325 static const u8 diffserv4[] = {
326         0, 1, 0, 0, 2, 0, 0, 0,
327         1, 0, 0, 0, 0, 0, 0, 0,
328         2, 0, 2, 0, 2, 0, 2, 0,
329         2, 0, 2, 0, 2, 0, 2, 0,
330         3, 0, 2, 0, 2, 0, 2, 0,
331         3, 0, 0, 0, 3, 0, 3, 0,
332         3, 0, 0, 0, 0, 0, 0, 0,
333         3, 0, 0, 0, 0, 0, 0, 0,
334 };
335
336 static const u8 diffserv3[] = {
337         0, 1, 0, 0, 2, 0, 0, 0,
338         1, 0, 0, 0, 0, 0, 0, 0,
339         0, 0, 0, 0, 0, 0, 0, 0,
340         0, 0, 0, 0, 0, 0, 0, 0,
341         0, 0, 0, 0, 0, 0, 0, 0,
342         0, 0, 0, 0, 2, 0, 2, 0,
343         2, 0, 0, 0, 0, 0, 0, 0,
344         2, 0, 0, 0, 0, 0, 0, 0,
345 };
346
347 static const u8 besteffort[] = {
348         0, 0, 0, 0, 0, 0, 0, 0,
349         0, 0, 0, 0, 0, 0, 0, 0,
350         0, 0, 0, 0, 0, 0, 0, 0,
351         0, 0, 0, 0, 0, 0, 0, 0,
352         0, 0, 0, 0, 0, 0, 0, 0,
353         0, 0, 0, 0, 0, 0, 0, 0,
354         0, 0, 0, 0, 0, 0, 0, 0,
355         0, 0, 0, 0, 0, 0, 0, 0,
356 };
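/* The tables above are indexed by DSCP (0..63) and return the tin number
 * used by the selected preset.  With diffserv4[], for example, EF (46)
 * lands in tin 3, CS1 (8) in tin 1 and the default CS0 (0) in tin 0, i.e.
 * the Voice, Bulk and Best Effort tins in the naming used by the cake
 * documentation.
 */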
357
358 /* tin priority order for stats dumping */
359
360 static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
361 static const u8 bulk_order[] = {1, 0, 2, 3};
362
363 #define REC_INV_SQRT_CACHE (16)
364 static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0};
365
366 /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
367  * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
368  *
369  * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
370  */
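/* Numeric sketch of one step (values illustrative): for count == 4 the true
 * 1/sqrt(count) is 0.5, i.e. 0x80000000 in Q0.32.  Starting from an
 * estimate of 0.515625 (0x84000000), one application of the update above
 * yields ~0.4993, already within a fraction of a percent of the exact
 * value.
 */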
371
372 static void cobalt_newton_step(struct cobalt_vars *vars)
373 {
374         u32 invsqrt, invsqrt2;
375         u64 val;
376
377         invsqrt = vars->rec_inv_sqrt;
378         invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
379         val = (3LL << 32) - ((u64)vars->count * invsqrt2);
380
381         val >>= 2; /* avoid overflow in following multiply */
382         val = (val * invsqrt) >> (32 - 2 + 1);
383
384         vars->rec_inv_sqrt = val;
385 }
386
387 static void cobalt_invsqrt(struct cobalt_vars *vars)
388 {
389         if (vars->count < REC_INV_SQRT_CACHE)
390                 vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count];
391         else
392                 cobalt_newton_step(vars);
393 }
394
395 /* There is a big difference in timing between the accurate values placed in
396  * the cache and the approximations given by a single Newton step for small
397  * count values, particularly when stepping from count 1 to 2 or vice versa.
398  * Above 16, a single Newton step gives sufficient accuracy in either
399  * direction, given the precision stored.
400  *
401  * The magnitude of the error when stepping up to count 2 is such as to give
402  * the value that *should* have been produced at count 4.
403  */
404
405 static void cobalt_cache_init(void)
406 {
407         struct cobalt_vars v;
408
409         memset(&v, 0, sizeof(v));
410         v.rec_inv_sqrt = ~0U;
411         cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt;
412
413         for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) {
414                 cobalt_newton_step(&v);
415                 cobalt_newton_step(&v);
416                 cobalt_newton_step(&v);
417                 cobalt_newton_step(&v);
418
419                 cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt;
420         }
421 }
422
423 static void cobalt_vars_init(struct cobalt_vars *vars)
424 {
425         memset(vars, 0, sizeof(*vars));
426
427         if (!cobalt_rec_inv_sqrt_cache[0]) {
428                 cobalt_cache_init();
429                 cobalt_rec_inv_sqrt_cache[0] = ~0;
430         }
431 }
432
433 /* CoDel control_law is t + interval/sqrt(count)
434  * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
435  * both sqrt() and divide operation.
436  */
437 static ktime_t cobalt_control(ktime_t t,
438                               u64 interval,
439                               u32 rec_inv_sqrt)
440 {
441         return ktime_add_ns(t, reciprocal_scale(interval,
442                                                 rec_inv_sqrt));
443 }
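/* Illustrative numbers for the control law: with interval = 100 ms and
 * count = 4, rec_inv_sqrt is roughly 0x80000000 (0.5 in Q0.32), so
 * reciprocal_scale() contributes ~50 ms and the next signal is scheduled
 * half an interval after the previous one.
 */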
444
445 /* Call this when a packet had to be dropped due to queue overflow.  Returns
446  * true if the BLUE state was quiescent before but active after this call.
447  */
448 static bool cobalt_queue_full(struct cobalt_vars *vars,
449                               struct cobalt_params *p,
450                               ktime_t now)
451 {
452         bool up = false;
453
454         if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
455                 up = !vars->p_drop;
456                 vars->p_drop += p->p_inc;
457                 if (vars->p_drop < p->p_inc)
458                         vars->p_drop = ~0;
459                 vars->blue_timer = now;
460         }
461         vars->dropping = true;
462         vars->drop_next = now;
463         if (!vars->count)
464                 vars->count = 1;
465
466         return up;
467 }
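/* p_drop and p_inc are Q0.32 fixed point, so an increment of 1 << 24
 * (an illustrative value; the actual p_inc is configured elsewhere in this
 * file) raises the drop probability by 1/256 per overflow event, while the
 * wrap check above clamps it at ~0U, i.e. a probability of ~1.0.
 */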
468
469 /* Call this when the queue was serviced but turned out to be empty.  Returns
470  * true if the BLUE state was active before but quiescent after this call.
471  */
472 static bool cobalt_queue_empty(struct cobalt_vars *vars,
473                                struct cobalt_params *p,
474                                ktime_t now)
475 {
476         bool down = false;
477
478         if (vars->p_drop &&
479             ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
480                 if (vars->p_drop < p->p_dec)
481                         vars->p_drop = 0;
482                 else
483                         vars->p_drop -= p->p_dec;
484                 vars->blue_timer = now;
485                 down = !vars->p_drop;
486         }
487         vars->dropping = false;
488
489         if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
490                 vars->count--;
491                 cobalt_invsqrt(vars);
492                 vars->drop_next = cobalt_control(vars->drop_next,
493                                                  p->interval,
494                                                  vars->rec_inv_sqrt);
495         }
496
497         return down;
498 }
499
500 /* Call this with a freshly dequeued packet for possible congestion marking.
501  * Returns true as an instruction to drop the packet, false for delivery.
502  */
503 static bool cobalt_should_drop(struct cobalt_vars *vars,
504                                struct cobalt_params *p,
505                                ktime_t now,
506                                struct sk_buff *skb,
507                                u32 bulk_flows)
508 {
509         bool next_due, over_target, drop = false;
510         ktime_t schedule;
511         u64 sojourn;
512
513 /* The 'schedule' variable records, in its sign, whether 'now' is before or
514  * after 'drop_next'.  This allows 'drop_next' to be updated before the next
515  * scheduling decision is actually branched, without destroying that
516  * information.  Similarly, the first 'schedule' value calculated is preserved
517  * in the boolean 'next_due'.
518  *
519  * As for 'drop_next', we take advantage of the fact that 'interval' is both
520  * the delay between first exceeding 'target' and the first signalling event,
521  * *and* the scaling factor for the signalling frequency.  It's therefore very
522  * natural to use a single mechanism for both purposes, and eliminates a
523  * significant amount of reference Codel's spaghetti code.  To help with this,
524  * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close
525  * as possible to 1.0 in fixed-point.
526  */
527
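        /* Concrete reading of the over_target test below (thresholds
         * illustrative): with target = 5 ms, mtu_time = 1 ms and three bulk
         * flows, a packet is over target only once its sojourn time exceeds
         * max(5 ms, 2 * 3 * 1 ms, 4 * 1 ms) = 6 ms.
         */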
528         sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
529         schedule = ktime_sub(now, vars->drop_next);
530         over_target = sojourn > p->target &&
531                       sojourn > p->mtu_time * bulk_flows * 2 &&
532                       sojourn > p->mtu_time * 4;
533         next_due = vars->count && ktime_to_ns(schedule) >= 0;
534
535         vars->ecn_marked = false;
536
537         if (over_target) {
538                 if (!vars->dropping) {
539                         vars->dropping = true;
540                         vars->drop_next = cobalt_control(now,
541                                                          p->interval,
542                                                          vars->rec_inv_sqrt);
543                 }
544                 if (!vars->count)
545                         vars->count = 1;
546         } else if (vars->dropping) {
547                 vars->dropping = false;
548         }
549
550         if (next_due && vars->dropping) {
551                 /* Use ECN mark if possible, otherwise drop */
552                 drop = !(vars->ecn_marked = INET_ECN_set_ce(skb));
553
554                 vars->count++;
555                 if (!vars->count)
556                         vars->count--;
557                 cobalt_invsqrt(vars);
558                 vars->drop_next = cobalt_control(vars->drop_next,
559                                                  p->interval,
560                                                  vars->rec_inv_sqrt);
561                 schedule = ktime_sub(now, vars->drop_next);
562         } else {
563                 while (next_due) {
564                         vars->count--;
565                         cobalt_invsqrt(vars);
566                         vars->drop_next = cobalt_control(vars->drop_next,
567                                                          p->interval,
568                                                          vars->rec_inv_sqrt);
569                         schedule = ktime_sub(now, vars->drop_next);
570                         next_due = vars->count && ktime_to_ns(schedule) >= 0;
571                 }
572         }
573
574         /* Simple BLUE implementation.  Lack of ECN is deliberate. */
575         if (vars->p_drop)
576                 drop |= (get_random_u32() < vars->p_drop);
577
578         /* Overload the drop_next field as an activity timeout */
579         if (!vars->count)
580                 vars->drop_next = ktime_add_ns(now, p->interval);
581         else if (ktime_to_ns(schedule) > 0 && !drop)
582                 vars->drop_next = now;
583
584         return drop;
585 }
586
587 static bool cake_update_flowkeys(struct flow_keys *keys,
588                                  const struct sk_buff *skb)
589 {
590 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
591         struct nf_conntrack_tuple tuple = {};
592         bool rev = !skb->_nfct, upd = false;
593         __be32 ip;
594
595         if (skb_protocol(skb, true) != htons(ETH_P_IP))
596                 return false;
597
598         if (!nf_ct_get_tuple_skb(&tuple, skb))
599                 return false;
600
601         ip = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
602         if (ip != keys->addrs.v4addrs.src) {
603                 keys->addrs.v4addrs.src = ip;
604                 upd = true;
605         }
606         ip = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;
607         if (ip != keys->addrs.v4addrs.dst) {
608                 keys->addrs.v4addrs.dst = ip;
609                 upd = true;
610         }
611
612         if (keys->ports.ports) {
613                 __be16 port;
614
615                 port = rev ? tuple.dst.u.all : tuple.src.u.all;
616                 if (port != keys->ports.src) {
617                         keys->ports.src = port;
618                         upd = true;
619                 }
620                 port = rev ? tuple.src.u.all : tuple.dst.u.all;
621                 if (port != keys->ports.dst) {
622                         keys->ports.dst = port;
623                         upd = true;
624                 }
625         }
626         return upd;
627 #else
628         return false;
629 #endif
630 }
631
632 /* Cake has several subtle multiple bit settings. In these cases you
633  *  would be matching triple isolate mode as well.
634  */
635
636 static bool cake_dsrc(int flow_mode)
637 {
638         return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC;
639 }
640
641 static bool cake_ddst(int flow_mode)
642 {
643         return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST;
644 }
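/* Example of the multi-bit matching referred to above: in the UAPI
 * flow-mode encoding (CAKE_FLOW_DUAL_SRC == 5, CAKE_FLOW_DUAL_DST == 6,
 * CAKE_FLOW_TRIPLE == 7), cake_dsrc() is true for both dual-srchost and
 * triple-isolate modes, and cake_ddst() for both dual-dsthost and
 * triple-isolate.
 */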
645
646 static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
647                      int flow_mode, u16 flow_override, u16 host_override)
648 {
649         bool hash_flows = (!flow_override && !!(flow_mode & CAKE_FLOW_FLOWS));
650         bool hash_hosts = (!host_override && !!(flow_mode & CAKE_FLOW_HOSTS));
651         bool nat_enabled = !!(flow_mode & CAKE_FLOW_NAT_FLAG);
652         u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0;
653         u16 reduced_hash, srchost_idx, dsthost_idx;
654         struct flow_keys keys, host_keys;
655         bool use_skbhash = skb->l4_hash;
656
657         if (unlikely(flow_mode == CAKE_FLOW_NONE))
658                 return 0;
659
660         /* If both overrides are set, or we can use the SKB hash and nat mode is
661          * disabled, we can skip packet dissection entirely. If nat mode is
662          * enabled there's another check below after doing the conntrack lookup.
663          */
664         if ((!hash_flows || (use_skbhash && !nat_enabled)) && !hash_hosts)
665                 goto skip_hash;
666
667         skb_flow_dissect_flow_keys(skb, &keys,
668                                    FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
669
670         /* Don't use the SKB hash if we change the lookup keys from conntrack */
671         if (nat_enabled && cake_update_flowkeys(&keys, skb))
672                 use_skbhash = false;
673
674         /* If we can still use the SKB hash and don't need the host hash, we can
675          * skip the rest of the hashing procedure
676          */
677         if (use_skbhash && !hash_hosts)
678                 goto skip_hash;
679
680         /* flow_hash_from_keys() sorts the addresses by value, so we have
681          * to preserve their order in a separate data structure to treat
682          * src and dst host addresses as independently selectable.
683          */
684         host_keys = keys;
685         host_keys.ports.ports     = 0;
686         host_keys.basic.ip_proto  = 0;
687         host_keys.keyid.keyid     = 0;
688         host_keys.tags.flow_label = 0;
689
690         switch (host_keys.control.addr_type) {
691         case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
692                 host_keys.addrs.v4addrs.src = 0;
693                 dsthost_hash = flow_hash_from_keys(&host_keys);
694                 host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
695                 host_keys.addrs.v4addrs.dst = 0;
696                 srchost_hash = flow_hash_from_keys(&host_keys);
697                 break;
698
699         case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
700                 memset(&host_keys.addrs.v6addrs.src, 0,
701                        sizeof(host_keys.addrs.v6addrs.src));
702                 dsthost_hash = flow_hash_from_keys(&host_keys);
703                 host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
704                 memset(&host_keys.addrs.v6addrs.dst, 0,
705                        sizeof(host_keys.addrs.v6addrs.dst));
706                 srchost_hash = flow_hash_from_keys(&host_keys);
707                 break;
708
709         default:
710                 dsthost_hash = 0;
711                 srchost_hash = 0;
712         }
713
714         /* This *must* be after the above switch, since as a
715          * side-effect it sorts the src and dst addresses.
716          */
717         if (hash_flows && !use_skbhash)
718                 flow_hash = flow_hash_from_keys(&keys);
719
720 skip_hash:
721         if (flow_override)
722                 flow_hash = flow_override - 1;
723         else if (use_skbhash && (flow_mode & CAKE_FLOW_FLOWS))
724                 flow_hash = skb->hash;
725         if (host_override) {
726                 dsthost_hash = host_override - 1;
727                 srchost_hash = host_override - 1;
728         }
729
730         if (!(flow_mode & CAKE_FLOW_FLOWS)) {
731                 if (flow_mode & CAKE_FLOW_SRC_IP)
732                         flow_hash ^= srchost_hash;
733
734                 if (flow_mode & CAKE_FLOW_DST_IP)
735                         flow_hash ^= dsthost_hash;
736         }
737
738         reduced_hash = flow_hash % CAKE_QUEUES;
739
740         /* set-associative hashing */
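        /* Worked example of the probing below (hash value made up): with
         * CAKE_QUEUES == 1024 and CAKE_SET_WAYS == 8 there are 128 sets of
         * eight queues; reduced_hash == 677 gives inner_hash == 5 and
         * outer_hash == 672, so queues 672..679 are searched starting at
         * index 677 and wrapping within the set.
         */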
741         /* fast path if no hash collision (direct lookup succeeds) */
742         if (likely(q->tags[reduced_hash] == flow_hash &&
743                    q->flows[reduced_hash].set)) {
744                 q->way_directs++;
745         } else {
746                 u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
747                 u32 outer_hash = reduced_hash - inner_hash;
748                 bool allocate_src = false;
749                 bool allocate_dst = false;
750                 u32 i, k;
751
752                 /* check if any active queue in the set is reserved for
753                  * this flow.
754                  */
755                 for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
756                      i++, k = (k + 1) % CAKE_SET_WAYS) {
757                         if (q->tags[outer_hash + k] == flow_hash) {
758                                 if (i)
759                                         q->way_hits++;
760
761                                 if (!q->flows[outer_hash + k].set) {
762                                         /* need to increment host refcnts */
763                                         allocate_src = cake_dsrc(flow_mode);
764                                         allocate_dst = cake_ddst(flow_mode);
765                                 }
766
767                                 goto found;
768                         }
769                 }
770
771                 /* no queue is reserved for this flow, look for an
772                  * empty one.
773                  */
774                 for (i = 0; i < CAKE_SET_WAYS;
775                          i++, k = (k + 1) % CAKE_SET_WAYS) {
776                         if (!q->flows[outer_hash + k].set) {
777                                 q->way_misses++;
778                                 allocate_src = cake_dsrc(flow_mode);
779                                 allocate_dst = cake_ddst(flow_mode);
780                                 goto found;
781                         }
782                 }
783
784                 /* With no empty queues, default to the original
785                  * queue, accept the collision, update the host tags.
786                  */
787                 q->way_collisions++;
788                 if (q->flows[outer_hash + k].set == CAKE_SET_BULK) {
789                         q->hosts[q->flows[reduced_hash].srchost].srchost_bulk_flow_count--;
790                         q->hosts[q->flows[reduced_hash].dsthost].dsthost_bulk_flow_count--;
791                 }
792                 allocate_src = cake_dsrc(flow_mode);
793                 allocate_dst = cake_ddst(flow_mode);
794 found:
795                 /* reserve queue for future packets in same flow */
796                 reduced_hash = outer_hash + k;
797                 q->tags[reduced_hash] = flow_hash;
798
799                 if (allocate_src) {
800                         srchost_idx = srchost_hash % CAKE_QUEUES;
801                         inner_hash = srchost_idx % CAKE_SET_WAYS;
802                         outer_hash = srchost_idx - inner_hash;
803                         for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
804                                 i++, k = (k + 1) % CAKE_SET_WAYS) {
805                                 if (q->hosts[outer_hash + k].srchost_tag ==
806                                     srchost_hash)
807                                         goto found_src;
808                         }
809                         for (i = 0; i < CAKE_SET_WAYS;
810                                 i++, k = (k + 1) % CAKE_SET_WAYS) {
811                                 if (!q->hosts[outer_hash + k].srchost_bulk_flow_count)
812                                         break;
813                         }
814                         q->hosts[outer_hash + k].srchost_tag = srchost_hash;
815 found_src:
816                         srchost_idx = outer_hash + k;
817                         if (q->flows[reduced_hash].set == CAKE_SET_BULK)
818                                 q->hosts[srchost_idx].srchost_bulk_flow_count++;
819                         q->flows[reduced_hash].srchost = srchost_idx;
820                 }
821
822                 if (allocate_dst) {
823                         dsthost_idx = dsthost_hash % CAKE_QUEUES;
824                         inner_hash = dsthost_idx % CAKE_SET_WAYS;
825                         outer_hash = dsthost_idx - inner_hash;
826                         for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
827                              i++, k = (k + 1) % CAKE_SET_WAYS) {
828                                 if (q->hosts[outer_hash + k].dsthost_tag ==
829                                     dsthost_hash)
830                                         goto found_dst;
831                         }
832                         for (i = 0; i < CAKE_SET_WAYS;
833                              i++, k = (k + 1) % CAKE_SET_WAYS) {
834                                 if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count)
835                                         break;
836                         }
837                         q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
838 found_dst:
839                         dsthost_idx = outer_hash + k;
840                         if (q->flows[reduced_hash].set == CAKE_SET_BULK)
841                                 q->hosts[dsthost_idx].dsthost_bulk_flow_count++;
842                         q->flows[reduced_hash].dsthost = dsthost_idx;
843                 }
844         }
845
846         return reduced_hash;
847 }
848
849 /* helper functions: might be changed when/if skbs use a standard list_head */
850 /* remove one skb from head of slot queue */
851
852 static struct sk_buff *dequeue_head(struct cake_flow *flow)
853 {
854         struct sk_buff *skb = flow->head;
855
856         if (skb) {
857                 flow->head = skb->next;
858                 skb_mark_not_on_list(skb);
859         }
860
861         return skb;
862 }
863
864 /* add skb to flow queue (tail add) */
865
866 static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
867 {
868         if (!flow->head)
869                 flow->head = skb;
870         else
871                 flow->tail->next = skb;
872         flow->tail = skb;
873         skb->next = NULL;
874 }
875
876 static struct iphdr *cake_get_iphdr(const struct sk_buff *skb,
877                                     struct ipv6hdr *buf)
878 {
879         unsigned int offset = skb_network_offset(skb);
880         struct iphdr *iph;
881
882         iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf);
883
884         if (!iph)
885                 return NULL;
886
887         if (iph->version == 4 && iph->protocol == IPPROTO_IPV6)
888                 return skb_header_pointer(skb, offset + iph->ihl * 4,
889                                           sizeof(struct ipv6hdr), buf);
890
891         else if (iph->version == 4)
892                 return iph;
893
894         else if (iph->version == 6)
895                 return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr),
896                                           buf);
897
898         return NULL;
899 }
900
901 static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb,
902                                       void *buf, unsigned int bufsize)
903 {
904         unsigned int offset = skb_network_offset(skb);
905         const struct ipv6hdr *ipv6h;
906         const struct tcphdr *tcph;
907         const struct iphdr *iph;
908         struct ipv6hdr _ipv6h;
909         struct tcphdr _tcph;
910
911         ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
912
913         if (!ipv6h)
914                 return NULL;
915
916         if (ipv6h->version == 4) {
917                 iph = (struct iphdr *)ipv6h;
918                 offset += iph->ihl * 4;
919
920                 /* special-case 6in4 tunnelling, as that is a common way to get
921                  * v6 connectivity in the home
922                  */
923                 if (iph->protocol == IPPROTO_IPV6) {
924                         ipv6h = skb_header_pointer(skb, offset,
925                                                    sizeof(_ipv6h), &_ipv6h);
926
927                         if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
928                                 return NULL;
929
930                         offset += sizeof(struct ipv6hdr);
931
932                 } else if (iph->protocol != IPPROTO_TCP) {
933                         return NULL;
934                 }
935
936         } else if (ipv6h->version == 6) {
937                 if (ipv6h->nexthdr != IPPROTO_TCP)
938                         return NULL;
939
940                 offset += sizeof(struct ipv6hdr);
941         } else {
942                 return NULL;
943         }
944
945         tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
946         if (!tcph || tcph->doff < 5)
947                 return NULL;
948
949         return skb_header_pointer(skb, offset,
950                                   min(__tcp_hdrlen(tcph), bufsize), buf);
951 }
952
953 static const void *cake_get_tcpopt(const struct tcphdr *tcph,
954                                    int code, int *oplen)
955 {
956         /* inspired by tcp_parse_options in tcp_input.c */
957         int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
958         const u8 *ptr = (const u8 *)(tcph + 1);
959
960         while (length > 0) {
961                 int opcode = *ptr++;
962                 int opsize;
963
964                 if (opcode == TCPOPT_EOL)
965                         break;
966                 if (opcode == TCPOPT_NOP) {
967                         length--;
968                         continue;
969                 }
970                 if (length < 2)
971                         break;
972                 opsize = *ptr++;
973                 if (opsize < 2 || opsize > length)
974                         break;
975
976                 if (opcode == code) {
977                         *oplen = opsize;
978                         return ptr;
979                 }
980
981                 ptr += opsize - 2;
982                 length -= opsize;
983         }
984
985         return NULL;
986 }
987
988 /* Compare two SACK sequences. A sequence is considered greater if it SACKs more
989  * bytes than the other. In the case where both sequences ACK bytes that the
990  * other doesn't, A is considered greater. DSACKs in A also make A be
991  * considered greater.
992  *
993  * @return -1, 0 or 1 as normal compare functions
994  */
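/* For instance (sequence numbers invented): if A SACKs 1000-2000 and B
 * SACKs 1000-3000, every block in A is covered by B and B covers more
 * bytes, so 1 is returned; if A additionally SACKed 4000-5000, no block in
 * B would cover it and -1 is returned.
 */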
995 static int cake_tcph_sack_compare(const struct tcphdr *tcph_a,
996                                   const struct tcphdr *tcph_b)
997 {
998         const struct tcp_sack_block_wire *sack_a, *sack_b;
999         u32 ack_seq_a = ntohl(tcph_a->ack_seq);
1000         u32 bytes_a = 0, bytes_b = 0;
1001         int oplen_a, oplen_b;
1002         bool first = true;
1003
1004         sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a);
1005         sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b);
1006
1007         /* pointers point to option contents */
1008         oplen_a -= TCPOLEN_SACK_BASE;
1009         oplen_b -= TCPOLEN_SACK_BASE;
1010
1011         if (sack_a && oplen_a >= sizeof(*sack_a) &&
1012             (!sack_b || oplen_b < sizeof(*sack_b)))
1013                 return -1;
1014         else if (sack_b && oplen_b >= sizeof(*sack_b) &&
1015                  (!sack_a || oplen_a < sizeof(*sack_a)))
1016                 return 1;
1017         else if ((!sack_a || oplen_a < sizeof(*sack_a)) &&
1018                  (!sack_b || oplen_b < sizeof(*sack_b)))
1019                 return 0;
1020
1021         while (oplen_a >= sizeof(*sack_a)) {
1022                 const struct tcp_sack_block_wire *sack_tmp = sack_b;
1023                 u32 start_a = get_unaligned_be32(&sack_a->start_seq);
1024                 u32 end_a = get_unaligned_be32(&sack_a->end_seq);
1025                 int oplen_tmp = oplen_b;
1026                 bool found = false;
1027
1028                 /* DSACK; always considered greater to prevent dropping */
1029                 if (before(start_a, ack_seq_a))
1030                         return -1;
1031
1032                 bytes_a += end_a - start_a;
1033
1034                 while (oplen_tmp >= sizeof(*sack_tmp)) {
1035                         u32 start_b = get_unaligned_be32(&sack_tmp->start_seq);
1036                         u32 end_b = get_unaligned_be32(&sack_tmp->end_seq);
1037
1038                         /* first time through we count the total size */
1039                         if (first)
1040                                 bytes_b += end_b - start_b;
1041
1042                         if (!after(start_b, start_a) && !before(end_b, end_a)) {
1043                                 found = true;
1044                                 if (!first)
1045                                         break;
1046                         }
1047                         oplen_tmp -= sizeof(*sack_tmp);
1048                         sack_tmp++;
1049                 }
1050
1051                 if (!found)
1052                         return -1;
1053
1054                 oplen_a -= sizeof(*sack_a);
1055                 sack_a++;
1056                 first = false;
1057         }
1058
1059         /* If we made it this far, all ranges SACKed by A are covered by B, so
1060          * either the SACKs are equal, or B SACKs more bytes.
1061          */
1062         return bytes_b > bytes_a ? 1 : 0;
1063 }
1064
1065 static void cake_tcph_get_tstamp(const struct tcphdr *tcph,
1066                                  u32 *tsval, u32 *tsecr)
1067 {
1068         const u8 *ptr;
1069         int opsize;
1070
1071         ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize);
1072
1073         if (ptr && opsize == TCPOLEN_TIMESTAMP) {
1074                 *tsval = get_unaligned_be32(ptr);
1075                 *tsecr = get_unaligned_be32(ptr + 4);
1076         }
1077 }
1078
1079 static bool cake_tcph_may_drop(const struct tcphdr *tcph,
1080                                u32 tstamp_new, u32 tsecr_new)
1081 {
1082         /* inspired by tcp_parse_options in tcp_input.c */
1083         int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1084         const u8 *ptr = (const u8 *)(tcph + 1);
1085         u32 tstamp, tsecr;
1086
1087         /* 3 reserved flags must be unset to avoid future breakage
1088          * ACK must be set
1089          * ECE/CWR are handled separately
1090          * All other flags URG/PSH/RST/SYN/FIN must be unset
1091          * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero)
1092          * 0x00C00000 = CWR/ECE (handled separately)
1093          * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000
1094          */
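        /* For example, a bare ACK with doff == 5 has flag word 0x50100000;
         * masked with 0x0F3F0000 this leaves 0x00100000 == TCP_FLAG_ACK and
         * passes, whereas an ACK|PSH segment leaves 0x00180000 and is
         * rejected.
         */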
1095         if (((tcp_flag_word(tcph) &
1096               cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK))
1097                 return false;
1098
1099         while (length > 0) {
1100                 int opcode = *ptr++;
1101                 int opsize;
1102
1103                 if (opcode == TCPOPT_EOL)
1104                         break;
1105                 if (opcode == TCPOPT_NOP) {
1106                         length--;
1107                         continue;
1108                 }
1109                 if (length < 2)
1110                         break;
1111                 opsize = *ptr++;
1112                 if (opsize < 2 || opsize > length)
1113                         break;
1114
1115                 switch (opcode) {
1116                 case TCPOPT_MD5SIG: /* doesn't influence state */
1117                         break;
1118
1119                 case TCPOPT_SACK: /* stricter checking performed later */
1120                         if (opsize % 8 != 2)
1121                                 return false;
1122                         break;
1123
1124                 case TCPOPT_TIMESTAMP:
1125                         /* only drop timestamps lower than new */
1126                         if (opsize != TCPOLEN_TIMESTAMP)
1127                                 return false;
1128                         tstamp = get_unaligned_be32(ptr);
1129                         tsecr = get_unaligned_be32(ptr + 4);
1130                         if (after(tstamp, tstamp_new) ||
1131                             after(tsecr, tsecr_new))
1132                                 return false;
1133                         break;
1134
1135                 case TCPOPT_MSS:  /* these should only be set on SYN */
1136                 case TCPOPT_WINDOW:
1137                 case TCPOPT_SACK_PERM:
1138                 case TCPOPT_FASTOPEN:
1139                 case TCPOPT_EXP:
1140                 default: /* don't drop if any unknown options are present */
1141                         return false;
1142                 }
1143
1144                 ptr += opsize - 2;
1145                 length -= opsize;
1146         }
1147
1148         return true;
1149 }
1150
1151 static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
1152                                        struct cake_flow *flow)
1153 {
1154         bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE;
1155         struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL;
1156         struct sk_buff *skb_check, *skb_prev = NULL;
1157         const struct ipv6hdr *ipv6h, *ipv6h_check;
1158         unsigned char _tcph[64], _tcph_check[64];
1159         const struct tcphdr *tcph, *tcph_check;
1160         const struct iphdr *iph, *iph_check;
1161         struct ipv6hdr _iph, _iph_check;
1162         const struct sk_buff *skb;
1163         int seglen, num_found = 0;
1164         u32 tstamp = 0, tsecr = 0;
1165         __be32 elig_flags = 0;
1166         int sack_comp;
1167
1168         /* no other possible ACKs to filter */
1169         if (flow->head == flow->tail)
1170                 return NULL;
1171
1172         skb = flow->tail;
1173         tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph));
1174         iph = cake_get_iphdr(skb, &_iph);
1175         if (!tcph)
1176                 return NULL;
1177
1178         cake_tcph_get_tstamp(tcph, &tstamp, &tsecr);
1179
1180         /* the 'triggering' packet need only have the ACK flag set.
1181          * also check that SYN is not set, as there won't be any previous ACKs.
1182          */
1183         if ((tcp_flag_word(tcph) &
1184              (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK)
1185                 return NULL;
1186
1187         /* the 'triggering' ACK is at the tail of the queue, we have already
1188          * returned if it is the only packet in the flow. loop through the rest
1189          * of the queue looking for pure ACKs with the same 5-tuple as the
1190          * triggering one.
1191          */
1192         for (skb_check = flow->head;
1193              skb_check && skb_check != skb;
1194              skb_prev = skb_check, skb_check = skb_check->next) {
1195                 iph_check = cake_get_iphdr(skb_check, &_iph_check);
1196                 tcph_check = cake_get_tcphdr(skb_check, &_tcph_check,
1197                                              sizeof(_tcph_check));
1198
1199                 /* only TCP packets with matching 5-tuple are eligible, and only
1200                  * drop safe headers
1201                  */
1202                 if (!tcph_check || iph->version != iph_check->version ||
1203                     tcph_check->source != tcph->source ||
1204                     tcph_check->dest != tcph->dest)
1205                         continue;
1206
1207                 if (iph_check->version == 4) {
1208                         if (iph_check->saddr != iph->saddr ||
1209                             iph_check->daddr != iph->daddr)
1210                                 continue;
1211
1212                         seglen = ntohs(iph_check->tot_len) -
1213                                        (4 * iph_check->ihl);
1214                 } else if (iph_check->version == 6) {
1215                         ipv6h = (struct ipv6hdr *)iph;
1216                         ipv6h_check = (struct ipv6hdr *)iph_check;
1217
1218                         if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
1219                             ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
1220                                 continue;
1221
1222                         seglen = ntohs(ipv6h_check->payload_len);
1223                 } else {
1224                         WARN_ON(1);  /* shouldn't happen */
1225                         continue;
1226                 }
1227
1228                 /* If the ECE/CWR flags changed from the previous eligible
1229                  * packet in the same flow, we should no longer be dropping that
1230                  * previous packet as this would lose information.
1231                  */
1232                 if (elig_ack && (tcp_flag_word(tcph_check) &
1233                                  (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) {
1234                         elig_ack = NULL;
1235                         elig_ack_prev = NULL;
1236                         num_found--;
1237                 }
1238
1239                 /* Check TCP options and flags, don't drop ACKs with segment
1240                  * data, and don't drop ACKs with a higher cumulative ACK
1241                  * counter than the triggering packet. Check ACK seqno here to
1242                  * avoid parsing SACK options of packets we are going to exclude
1243                  * anyway.
1244                  */
1245                 if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) ||
1246                     (seglen - __tcp_hdrlen(tcph_check)) != 0 ||
1247                     after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq)))
1248                         continue;
1249
1250                 /* Check SACK options. The triggering packet must SACK more data
1251                  * than the ACK under consideration, or SACK the same range but
1252                  * have a larger cumulative ACK counter. The latter is a
1253                  * pathological case, but is contained in the following check
1254                  * anyway, just to be safe.
1255                  */
1256                 sack_comp = cake_tcph_sack_compare(tcph_check, tcph);
1257
1258                 if (sack_comp < 0 ||
1259                     (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) &&
1260                      sack_comp == 0))
1261                         continue;
1262
1263                 /* At this point we have found an eligible pure ACK to drop; if
1264                  * we are in aggressive mode, we are done. Otherwise, keep
1265                  * searching unless this is the second eligible ACK we
1266                  * found.
1267                  *
1268                  * Since we want to drop ACK closest to the head of the queue,
1269                  * save the first eligible ACK we find, even if we need to loop
1270                  * again.
1271                  */
1272                 if (!elig_ack) {
1273                         elig_ack = skb_check;
1274                         elig_ack_prev = skb_prev;
1275                         elig_flags = (tcp_flag_word(tcph_check)
1276                                       & (TCP_FLAG_ECE | TCP_FLAG_CWR));
1277                 }
1278
1279                 if (num_found++ > 0)
1280                         goto found;
1281         }
1282
1283         /* We made it through the queue without finding two eligible ACKs. If
1284          * we found a single eligible ACK we can drop it in aggressive mode if
1285          * we can guarantee that this does not interfere with ECN flag
1286          * information. We ensure this by dropping it only if the enqueued
1287          * packet is consecutive with the eligible ACK, and their flags match.
1288          */
1289         if (elig_ack && aggressive && elig_ack->next == skb &&
1290             (elig_flags == (tcp_flag_word(tcph) &
1291                             (TCP_FLAG_ECE | TCP_FLAG_CWR))))
1292                 goto found;
1293
1294         return NULL;
1295
1296 found:
1297         if (elig_ack_prev)
1298                 elig_ack_prev->next = elig_ack->next;
1299         else
1300                 flow->head = elig_ack->next;
1301
1302         skb_mark_not_on_list(elig_ack);
1303
1304         return elig_ack;
1305 }
1306
1307 static u64 cake_ewma(u64 avg, u64 sample, u32 shift)
1308 {
1309         avg -= avg >> shift;
1310         avg += sample >> shift;
1311         return avg;
1312 }
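/* cake_ewma() is an exponentially weighted moving average with weight
 * 2^-shift: with shift == 8 (as used for avg_netoff below) each sample
 * contributes 1/256 of its value, e.g. avg 10000 and sample 20000 give
 * 10000 - 39 + 78 = 10039.
 */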
1313
1314 static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off)
1315 {
1316         if (q->rate_flags & CAKE_FLAG_OVERHEAD)
1317                 len -= off;
1318
1319         if (q->max_netlen < len)
1320                 q->max_netlen = len;
1321         if (q->min_netlen > len)
1322                 q->min_netlen = len;
1323
1324         len += q->rate_overhead;
1325
1326         if (len < q->rate_mpu)
1327                 len = q->rate_mpu;
1328
1329         if (q->atm_mode == CAKE_ATM_ATM) {
1330                 len += 47;
1331                 len /= 48;
1332                 len *= 53;
1333         } else if (q->atm_mode == CAKE_ATM_PTM) {
1334                 /* Add one byte per 64 bytes or part thereof.
1335                  * This is conservative and easier to calculate than the
1336                  * precise value.
1337                  */
1338                 len += (len + 63) / 64;
1339         }
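        /* Illustrative numbers for the two compensations above: a
         * 1478-byte frame needs ceil(1478 / 48) = 31 ATM cells, i.e.
         * 31 * 53 = 1643 bytes on the wire, while in PTM mode a 1500-byte
         * frame becomes 1500 + (1500 + 63) / 64 = 1524 bytes.
         */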
1340
1341         if (q->max_adjlen < len)
1342                 q->max_adjlen = len;
1343         if (q->min_adjlen > len)
1344                 q->min_adjlen = len;
1345
1346         return len;
1347 }
1348
1349 static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
1350 {
1351         const struct skb_shared_info *shinfo = skb_shinfo(skb);
1352         unsigned int hdr_len, last_len = 0;
1353         u32 off = skb_network_offset(skb);
1354         u32 len = qdisc_pkt_len(skb);
1355         u16 segs = 1;
1356
1357         q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);
1358
1359         if (!shinfo->gso_size)
1360                 return cake_calc_overhead(q, len, off);
1361
1362         /* borrowed from qdisc_pkt_len_init() */
1363         hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
1364
1365         /* + transport layer */
1366         if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
1367                                                 SKB_GSO_TCPV6))) {
1368                 const struct tcphdr *th;
1369                 struct tcphdr _tcphdr;
1370
1371                 th = skb_header_pointer(skb, skb_transport_offset(skb),
1372                                         sizeof(_tcphdr), &_tcphdr);
1373                 if (likely(th))
1374                         hdr_len += __tcp_hdrlen(th);
1375         } else {
1376                 struct udphdr _udphdr;
1377
1378                 if (skb_header_pointer(skb, skb_transport_offset(skb),
1379                                        sizeof(_udphdr), &_udphdr))
1380                         hdr_len += sizeof(struct udphdr);
1381         }
1382
1383         if (unlikely(shinfo->gso_type & SKB_GSO_DODGY))
1384                 segs = DIV_ROUND_UP(skb->len - hdr_len,
1385                                     shinfo->gso_size);
1386         else
1387                 segs = shinfo->gso_segs;
1388
1389         len = shinfo->gso_size + hdr_len;
1390         last_len = skb->len - shinfo->gso_size * (segs - 1);
1391
1392         return (cake_calc_overhead(q, len, off) * (segs - 1) +
1393                 cake_calc_overhead(q, last_len, off));
1394 }
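/* Example with sizes chosen for illustration: a TCP GSO skb with
 * gso_segs == 5, gso_size == 1448 and hdr_len == 66 is charged as four
 * segments of 1514 bytes plus a final segment of skb->len - 4 * 1448 bytes,
 * each passed through cake_calc_overhead() above.
 */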
1395
1396 static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
1397 {
1398         struct cake_heap_entry ii = q->overflow_heap[i];
1399         struct cake_heap_entry jj = q->overflow_heap[j];
1400
1401         q->overflow_heap[i] = jj;
1402         q->overflow_heap[j] = ii;
1403
1404         q->tins[ii.t].overflow_idx[ii.b] = j;
1405         q->tins[jj.t].overflow_idx[jj.b] = i;
1406 }
1407
1408 static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
1409 {
1410         struct cake_heap_entry ii = q->overflow_heap[i];
1411
1412         return q->tins[ii.t].backlogs[ii.b];
1413 }
1414
1415 static void cake_heapify(struct cake_sched_data *q, u16 i)
1416 {
1417         static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
1418         u32 mb = cake_heap_get_backlog(q, i);
1419         u32 m = i;
1420
1421         while (m < a) {
1422                 u32 l = m + m + 1;
1423                 u32 r = l + 1;
1424
1425                 if (l < a) {
1426                         u32 lb = cake_heap_get_backlog(q, l);
1427
1428                         if (lb > mb) {
1429                                 m  = l;
1430                                 mb = lb;
1431                         }
1432                 }
1433
1434                 if (r < a) {
1435                         u32 rb = cake_heap_get_backlog(q, r);
1436
1437                         if (rb > mb) {
1438                                 m  = r;
1439                                 mb = rb;
1440                         }
1441                 }
1442
1443                 if (m != i) {
1444                         cake_heap_swap(q, i, m);
1445                         i = m;
1446                 } else {
1447                         break;
1448                 }
1449         }
1450 }
1451
1452 static void cake_heapify_up(struct cake_sched_data *q, u16 i)
1453 {
1454         while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
1455                 u16 p = (i - 1) >> 1;
1456                 u32 ib = cake_heap_get_backlog(q, i);
1457                 u32 pb = cake_heap_get_backlog(q, p);
1458
1459                 if (ib > pb) {
1460                         cake_heap_swap(q, i, p);
1461                         i = p;
1462                 } else {
1463                         break;
1464                 }
1465         }
1466 }
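/* The four helpers above maintain a binary max-heap over every (tin, flow)
 * pair, keyed on per-flow backlog, in the flat overflow_heap[] array using
 * the usual parent (i - 1) / 2, children 2i + 1 / 2i + 2 layout.  The heap
 * is only kept current while q->overflow_timeout is non-zero; otherwise
 * cake_drop() rebuilds it lazily the next time the buffer limit is hit.
 */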
1467
1468 static int cake_advance_shaper(struct cake_sched_data *q,
1469                                struct cake_tin_data *b,
1470                                struct sk_buff *skb,
1471                                ktime_t now, bool drop)
1472 {
1473         u32 len = get_cobalt_cb(skb)->adjusted_len;
1474
1475         /* charge packet bandwidth to this tin
1476          * and to the global shaper.
1477          */
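        /* Each charge is (len * rate_ns) >> rate_shft, i.e. len / rate in
         * nanoseconds kept in fixed point.  Illustrative example: at a
         * 12.5 Mbyte/s (100 Mbit/s) tin rate, a 1514 byte adjusted length
         * advances the tin clock by roughly 121 us; the failsafe clock is
         * charged 1.5x the global amount, but only for delivered packets.
         */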
1478         if (q->rate_ns) {
1479                 u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft;
1480                 u64 global_dur = (len * q->rate_ns) >> q->rate_shft;
1481                 u64 failsafe_dur = global_dur + (global_dur >> 1);
1482
1483                 if (ktime_before(b->time_next_packet, now))
1484                         b->time_next_packet = ktime_add_ns(b->time_next_packet,
1485                                                            tin_dur);
1486
1487                 else if (ktime_before(b->time_next_packet,
1488                                       ktime_add_ns(now, tin_dur)))
1489                         b->time_next_packet = ktime_add_ns(now, tin_dur);
1490
1491                 q->time_next_packet = ktime_add_ns(q->time_next_packet,
1492                                                    global_dur);
1493                 if (!drop)
1494                         q->failsafe_next_packet = \
1495                                 ktime_add_ns(q->failsafe_next_packet,
1496                                              failsafe_dur);
1497         }
1498         return len;
1499 }
1500
1501 static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
1502 {
1503         struct cake_sched_data *q = qdisc_priv(sch);
1504         ktime_t now = ktime_get();
1505         u32 idx = 0, tin = 0, len;
1506         struct cake_heap_entry qq;
1507         struct cake_tin_data *b;
1508         struct cake_flow *flow;
1509         struct sk_buff *skb;
1510
1511         if (!q->overflow_timeout) {
1512                 int i;
1513                 /* Build fresh max-heap */
1514                 for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--)
1515                         cake_heapify(q, i);
1516         }
1517         q->overflow_timeout = 65535;
1518
1519         /* select longest queue for pruning */
1520         qq  = q->overflow_heap[0];
1521         tin = qq.t;
1522         idx = qq.b;
1523
1524         b = &q->tins[tin];
1525         flow = &b->flows[idx];
1526         skb = dequeue_head(flow);
1527         if (unlikely(!skb)) {
1528                 /* heap has gone wrong, rebuild it next time */
1529                 q->overflow_timeout = 0;
1530                 return idx + (tin << 16);
1531         }
1532
1533         if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
1534                 b->unresponsive_flow_count++;
1535
1536         len = qdisc_pkt_len(skb);
1537         q->buffer_used      -= skb->truesize;
1538         b->backlogs[idx]    -= len;
1539         b->tin_backlog      -= len;
1540         sch->qstats.backlog -= len;
1541         qdisc_tree_reduce_backlog(sch, 1, len);
1542
1543         flow->dropped++;
1544         b->tin_dropped++;
1545         sch->qstats.drops++;
1546
1547         if (q->rate_flags & CAKE_FLAG_INGRESS)
1548                 cake_advance_shaper(q, b, skb, now, true);
1549
1550         __qdisc_drop(skb, to_free);
1551         sch->q.qlen--;
1552
1553         cake_heapify(q, 0);
1554
1555         return idx + (tin << 16);
1556 }
1557
1558 static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash)
1559 {
1560         const int offset = skb_network_offset(skb);
1561         u16 *buf, buf_;
1562         u8 dscp;
1563
1564         switch (skb_protocol(skb, true)) {
1565         case htons(ETH_P_IP):
1566                 buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1567                 if (unlikely(!buf))
1568                         return 0;
1569
1570                 /* ToS is in the second byte of iphdr */
1571                 dscp = ipv4_get_dsfield((struct iphdr *)buf) >> 2;
1572
1573                 if (wash && dscp) {
1574                         const int wlen = offset + sizeof(struct iphdr);
1575
1576                         if (!pskb_may_pull(skb, wlen) ||
1577                             skb_try_make_writable(skb, wlen))
1578                                 return 0;
1579
1580                         ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
1581                 }
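                /* Note: passing INET_ECN_MASK as the mask above preserves
                 * the two ECN bits, so washing clears only the DSCP field;
                 * the IPv6 branch below behaves the same way.
                 */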
1582
1583                 return dscp;
1584
1585         case htons(ETH_P_IPV6):
1586                 buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1587                 if (unlikely(!buf))
1588                         return 0;
1589
1590                 /* Traffic class is in the first and second bytes of ipv6hdr */
1591                 dscp = ipv6_get_dsfield((struct ipv6hdr *)buf) >> 2;
1592
1593                 if (wash && dscp) {
1594                         const int wlen = offset + sizeof(struct ipv6hdr);
1595
1596                         if (!pskb_may_pull(skb, wlen) ||
1597                             skb_try_make_writable(skb, wlen))
1598                                 return 0;
1599
1600                         ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
1601                 }
1602
1603                 return dscp;
1604
1605         case htons(ETH_P_ARP):
1606                 return 0x38;  /* CS7 - Net Control */
1607
1608         default:
1609                 /* If there is no Diffserv field, treat as best-effort */
1610                 return 0;
1611         }
1612 }
1613
1614 static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
1615                                              struct sk_buff *skb)
1616 {
1617         struct cake_sched_data *q = qdisc_priv(sch);
1618         u32 tin, mark;
1619         bool wash;
1620         u8 dscp;
1621
1622         /* Tin selection: Default to diffserv-based selection, allow overriding
1623          * using firewall marks or skb->priority. DSCP parsing is done early
1624          * when wash is enabled, and deferred otherwise to skip unneeded work.
1625          */
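        /* Hypothetical example: with fwmark_mask 0xff00, fwmark_shft is 8,
         * so a packet carrying skb->mark 0x0300 yields mark = 3 below and
         * is steered to the third entry in tin_order, bypassing DSCP.
         */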
1626         mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft;
1627         wash = !!(q->rate_flags & CAKE_FLAG_WASH);
1628         if (wash)
1629                 dscp = cake_handle_diffserv(skb, wash);
1630
1631         if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT)
1632                 tin = 0;
1633
1634         else if (mark && mark <= q->tin_cnt)
1635                 tin = q->tin_order[mark - 1];
1636
1637         else if (TC_H_MAJ(skb->priority) == sch->handle &&
1638                  TC_H_MIN(skb->priority) > 0 &&
1639                  TC_H_MIN(skb->priority) <= q->tin_cnt)
1640                 tin = q->tin_order[TC_H_MIN(skb->priority) - 1];
1641
1642         else {
1643                 if (!wash)
1644                         dscp = cake_handle_diffserv(skb, wash);
1645                 tin = q->tin_index[dscp];
1646
1647                 if (unlikely(tin >= q->tin_cnt))
1648                         tin = 0;
1649         }
1650
1651         return &q->tins[tin];
1652 }
1653
1654 static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
1655                          struct sk_buff *skb, int flow_mode, int *qerr)
1656 {
1657         struct cake_sched_data *q = qdisc_priv(sch);
1658         struct tcf_proto *filter;
1659         struct tcf_result res;
1660         u16 flow = 0, host = 0;
1661         int result;
1662
1663         filter = rcu_dereference_bh(q->filter_list);
1664         if (!filter)
1665                 goto hash;
1666
1667         *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
1668         result = tcf_classify(skb, NULL, filter, &res, false);
1669
1670         if (result >= 0) {
1671 #ifdef CONFIG_NET_CLS_ACT
1672                 switch (result) {
1673                 case TC_ACT_STOLEN:
1674                 case TC_ACT_QUEUED:
1675                 case TC_ACT_TRAP:
1676                         *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
1677                         fallthrough;
1678                 case TC_ACT_SHOT:
1679                         return 0;
1680                 }
1681 #endif
1682                 if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
1683                         flow = TC_H_MIN(res.classid);
1684                 if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16))
1685                         host = TC_H_MAJ(res.classid) >> 16;
1686         }
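        /* A classid returned by the filter acts as an override: the minor
         * number (1..CAKE_QUEUES) requests a specific flow queue and the
         * major number a specific host bucket; both are handed to
         * cake_hash(), which falls back to normal hashing when they are 0.
         */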
1687 hash:
1688         *t = cake_select_tin(sch, skb);
1689         return cake_hash(*t, skb, flow_mode, flow, host) + 1;
1690 }
1691
1692 static void cake_reconfigure(struct Qdisc *sch);
1693
1694 static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
1695                         struct sk_buff **to_free)
1696 {
1697         struct cake_sched_data *q = qdisc_priv(sch);
1698         int len = qdisc_pkt_len(skb);
1699         int ret;
1700         struct sk_buff *ack = NULL;
1701         ktime_t now = ktime_get();
1702         struct cake_tin_data *b;
1703         struct cake_flow *flow;
1704         u32 idx;
1705
1706         /* choose flow to insert into */
1707         idx = cake_classify(sch, &b, skb, q->flow_mode, &ret);
1708         if (idx == 0) {
1709                 if (ret & __NET_XMIT_BYPASS)
1710                         qdisc_qstats_drop(sch);
1711                 __qdisc_drop(skb, to_free);
1712                 return ret;
1713         }
1714         idx--;
1715         flow = &b->flows[idx];
1716
1717         /* ensure shaper state isn't stale */
1718         if (!b->tin_backlog) {
1719                 if (ktime_before(b->time_next_packet, now))
1720                         b->time_next_packet = now;
1721
1722                 if (!sch->q.qlen) {
1723                         if (ktime_before(q->time_next_packet, now)) {
1724                                 q->failsafe_next_packet = now;
1725                                 q->time_next_packet = now;
1726                         } else if (ktime_after(q->time_next_packet, now) &&
1727                                    ktime_after(q->failsafe_next_packet, now)) {
1728                                 u64 next = \
1729                                         min(ktime_to_ns(q->time_next_packet),
1730                                             ktime_to_ns(
1731                                                    q->failsafe_next_packet));
1732                                 sch->qstats.overlimits++;
1733                                 qdisc_watchdog_schedule_ns(&q->watchdog, next);
1734                         }
1735                 }
1736         }
1737
1738         if (unlikely(len > b->max_skblen))
1739                 b->max_skblen = len;
1740
1741         if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
1742                 struct sk_buff *segs, *nskb;
1743                 netdev_features_t features = netif_skb_features(skb);
1744                 unsigned int slen = 0, numsegs = 0;
1745
1746                 segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
1747                 if (IS_ERR_OR_NULL(segs))
1748                         return qdisc_drop(skb, sch, to_free);
1749
1750                 skb_list_walk_safe(segs, segs, nskb) {
1751                         skb_mark_not_on_list(segs);
1752                         qdisc_skb_cb(segs)->pkt_len = segs->len;
1753                         cobalt_set_enqueue_time(segs, now);
1754                         get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
1755                                                                           segs);
1756                         flow_queue_add(flow, segs);
1757
1758                         sch->q.qlen++;
1759                         numsegs++;
1760                         slen += segs->len;
1761                         q->buffer_used += segs->truesize;
1762                         b->packets++;
1763                 }
1764
1765                 /* stats */
1766                 b->bytes            += slen;
1767                 b->backlogs[idx]    += slen;
1768                 b->tin_backlog      += slen;
1769                 sch->qstats.backlog += slen;
1770                 q->avg_window_bytes += slen;
1771
1772                 qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen);
1773                 consume_skb(skb);
1774         } else {
1775                 /* not splitting */
1776                 cobalt_set_enqueue_time(skb, now);
1777                 get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
1778                 flow_queue_add(flow, skb);
1779
1780                 if (q->ack_filter)
1781                         ack = cake_ack_filter(q, flow);
1782
1783                 if (ack) {
1784                         b->ack_drops++;
1785                         sch->qstats.drops++;
1786                         b->bytes += qdisc_pkt_len(ack);
1787                         len -= qdisc_pkt_len(ack);
1788                         q->buffer_used += skb->truesize - ack->truesize;
1789                         if (q->rate_flags & CAKE_FLAG_INGRESS)
1790                                 cake_advance_shaper(q, b, ack, now, true);
1791
1792                         qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack));
1793                         consume_skb(ack);
1794                 } else {
1795                         sch->q.qlen++;
1796                         q->buffer_used      += skb->truesize;
1797                 }
1798
1799                 /* stats */
1800                 b->packets++;
1801                 b->bytes            += len;
1802                 b->backlogs[idx]    += len;
1803                 b->tin_backlog      += len;
1804                 sch->qstats.backlog += len;
1805                 q->avg_window_bytes += len;
1806         }
1807
1808         if (q->overflow_timeout)
1809                 cake_heapify_up(q, b->overflow_idx[idx]);
1810
1811         /* incoming bandwidth capacity estimate */
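        /* With autorate-ingress the shaper tracks the observed arrival
         * rate: per-window throughput is folded into avg_peak_bandwidth
         * (biased towards peaks by the asymmetric EWMA shifts below), and
         * the shaper rate is then set to 15/16 of that estimate, provided
         * at least 250 ms have elapsed since q->last_reconfig_time.
         */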
1812         if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
1813                 u64 packet_interval = \
1814                         ktime_to_ns(ktime_sub(now, q->last_packet_time));
1815
1816                 if (packet_interval > NSEC_PER_SEC)
1817                         packet_interval = NSEC_PER_SEC;
1818
1819                 /* filter out short-term bursts, eg. wifi aggregation */
1820                 q->avg_packet_interval = \
1821                         cake_ewma(q->avg_packet_interval,
1822                                   packet_interval,
1823                                   (packet_interval > q->avg_packet_interval ?
1824                                           2 : 8));
1825
1826                 q->last_packet_time = now;
1827
1828                 if (packet_interval > q->avg_packet_interval) {
1829                         u64 window_interval = \
1830                                 ktime_to_ns(ktime_sub(now,
1831                                                       q->avg_window_begin));
1832                         u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;
1833
1834                         b = div64_u64(b, window_interval);
1835                         q->avg_peak_bandwidth =
1836                                 cake_ewma(q->avg_peak_bandwidth, b,
1837                                           b > q->avg_peak_bandwidth ? 2 : 8);
1838                         q->avg_window_bytes = 0;
1839                         q->avg_window_begin = now;
1840
1841                         if (ktime_after(now,
1842                                         ktime_add_ms(q->last_reconfig_time,
1843                                                      250))) {
1844                                 q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
1845                                 cake_reconfigure(sch);
1846                         }
1847                 }
1848         } else {
1849                 q->avg_window_bytes = 0;
1850                 q->last_packet_time = now;
1851         }
1852
1853         /* flowchain */
1854         if (!flow->set || flow->set == CAKE_SET_DECAYING) {
1855                 struct cake_host *srchost = &b->hosts[flow->srchost];
1856                 struct cake_host *dsthost = &b->hosts[flow->dsthost];
1857                 u16 host_load = 1;
1858
1859                 if (!flow->set) {
1860                         list_add_tail(&flow->flowchain, &b->new_flows);
1861                 } else {
1862                         b->decaying_flow_count--;
1863                         list_move_tail(&flow->flowchain, &b->new_flows);
1864                 }
1865                 flow->set = CAKE_SET_SPARSE;
1866                 b->sparse_flow_count++;
1867
1868                 if (cake_dsrc(q->flow_mode))
1869                         host_load = max(host_load, srchost->srchost_bulk_flow_count);
1870
1871                 if (cake_ddst(q->flow_mode))
1872                         host_load = max(host_load, dsthost->dsthost_bulk_flow_count);
1873
1874                 flow->deficit = (b->flow_quantum *
1875                                  quantum_div[host_load]) >> 16;
1876         } else if (flow->set == CAKE_SET_SPARSE_WAIT) {
1877                 struct cake_host *srchost = &b->hosts[flow->srchost];
1878                 struct cake_host *dsthost = &b->hosts[flow->dsthost];
1879
1880                 /* This flow was empty but still counted as sparse while actually
1881                  * sitting in the bulk rotation; move the accounting to bulk now.
1882                  */
1883                 flow->set = CAKE_SET_BULK;
1884                 b->sparse_flow_count--;
1885                 b->bulk_flow_count++;
1886
1887                 if (cake_dsrc(q->flow_mode))
1888                         srchost->srchost_bulk_flow_count++;
1889
1890                 if (cake_ddst(q->flow_mode))
1891                         dsthost->dsthost_bulk_flow_count++;
1892
1893         }
1894
1895         if (q->buffer_used > q->buffer_max_used)
1896                 q->buffer_max_used = q->buffer_used;
1897
1898         if (q->buffer_used > q->buffer_limit) {
1899                 u32 dropped = 0;
1900
1901                 while (q->buffer_used > q->buffer_limit) {
1902                         dropped++;
1903                         cake_drop(sch, to_free);
1904                 }
1905                 b->drop_overlimit += dropped;
1906         }
1907         return NET_XMIT_SUCCESS;
1908 }
1909
1910 static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
1911 {
1912         struct cake_sched_data *q = qdisc_priv(sch);
1913         struct cake_tin_data *b = &q->tins[q->cur_tin];
1914         struct cake_flow *flow = &b->flows[q->cur_flow];
1915         struct sk_buff *skb = NULL;
1916         u32 len;
1917
1918         if (flow->head) {
1919                 skb = dequeue_head(flow);
1920                 len = qdisc_pkt_len(skb);
1921                 b->backlogs[q->cur_flow] -= len;
1922                 b->tin_backlog           -= len;
1923                 sch->qstats.backlog      -= len;
1924                 q->buffer_used           -= skb->truesize;
1925                 sch->q.qlen--;
1926
1927                 if (q->overflow_timeout)
1928                         cake_heapify(q, b->overflow_idx[q->cur_flow]);
1929         }
1930         return skb;
1931 }
1932
1933 /* Discard leftover packets from a tin no longer in use. */
1934 static void cake_clear_tin(struct Qdisc *sch, u16 tin)
1935 {
1936         struct cake_sched_data *q = qdisc_priv(sch);
1937         struct sk_buff *skb;
1938
1939         q->cur_tin = tin;
1940         for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
1941                 while (!!(skb = cake_dequeue_one(sch)))
1942                         kfree_skb(skb);
1943 }
1944
1945 static struct sk_buff *cake_dequeue(struct Qdisc *sch)
1946 {
1947         struct cake_sched_data *q = qdisc_priv(sch);
1948         struct cake_tin_data *b = &q->tins[q->cur_tin];
1949         struct cake_host *srchost, *dsthost;
1950         ktime_t now = ktime_get();
1951         struct cake_flow *flow;
1952         struct list_head *head;
1953         bool first_flow = true;
1954         struct sk_buff *skb;
1955         u16 host_load;
1956         u64 delay;
1957         u32 len;
1958
1959 begin:
1960         if (!sch->q.qlen)
1961                 return NULL;
1962
1963         /* global hard shaper */
1964         if (ktime_after(q->time_next_packet, now) &&
1965             ktime_after(q->failsafe_next_packet, now)) {
1966                 u64 next = min(ktime_to_ns(q->time_next_packet),
1967                                ktime_to_ns(q->failsafe_next_packet));
1968
1969                 sch->qstats.overlimits++;
1970                 qdisc_watchdog_schedule_ns(&q->watchdog, next);
1971                 return NULL;
1972         }
1973
1974         /* Choose a class to work on. */
1975         if (!q->rate_ns) {
1976                 /* In unlimited mode we can't rely on shaper timings, so just
1977                  * balance with DRR
1978                  */
1979                 bool wrapped = false, empty = true;
1980
1981                 while (b->tin_deficit < 0 ||
1982                        !(b->sparse_flow_count + b->bulk_flow_count)) {
1983                         if (b->tin_deficit <= 0)
1984                                 b->tin_deficit += b->tin_quantum;
1985                         if (b->sparse_flow_count + b->bulk_flow_count)
1986                                 empty = false;
1987
1988                         q->cur_tin++;
1989                         b++;
1990                         if (q->cur_tin >= q->tin_cnt) {
1991                                 q->cur_tin = 0;
1992                                 b = q->tins;
1993
1994                                 if (wrapped) {
1995                                         /* It's possible for q->qlen to be
1996                                          * nonzero when we actually have no
1997                                          * packets anywhere.
1998                                          */
1999                                         if (empty)
2000                                                 return NULL;
2001                                 } else {
2002                                         wrapped = true;
2003                                 }
2004                         }
2005                 }
2006         } else {
2007                 /* In shaped mode, choose:
2008                  * - Highest-priority tin with queue and meeting schedule, or
2009                  * - The earliest-scheduled tin with queue.
2010                  */
2011                 ktime_t best_time = KTIME_MAX;
2012                 int tin, best_tin = 0;
2013
2014                 for (tin = 0; tin < q->tin_cnt; tin++) {
2015                         b = q->tins + tin;
2016                         if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
2017                                 ktime_t time_to_pkt = \
2018                                         ktime_sub(b->time_next_packet, now);
2019
2020                                 if (ktime_to_ns(time_to_pkt) <= 0 ||
2021                                     ktime_compare(time_to_pkt,
2022                                                   best_time) <= 0) {
2023                                         best_time = time_to_pkt;
2024                                         best_tin = tin;
2025                                 }
2026                         }
2027                 }
2028
2029                 q->cur_tin = best_tin;
2030                 b = q->tins + best_tin;
2031
2032                 /* No point in going further if no packets to deliver. */
2033                 if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
2034                         return NULL;
2035         }
2036
2037 retry:
2038         /* service this class */
2039         head = &b->decaying_flows;
2040         if (!first_flow || list_empty(head)) {
2041                 head = &b->new_flows;
2042                 if (list_empty(head)) {
2043                         head = &b->old_flows;
2044                         if (unlikely(list_empty(head))) {
2045                                 head = &b->decaying_flows;
2046                                 if (unlikely(list_empty(head)))
2047                                         goto begin;
2048                         }
2049                 }
2050         }
2051         flow = list_first_entry(head, struct cake_flow, flowchain);
2052         q->cur_flow = flow - b->flows;
2053         first_flow = false;
2054
2055         /* triple isolation (modified DRR++) */
2056         srchost = &b->hosts[flow->srchost];
2057         dsthost = &b->hosts[flow->dsthost];
2058         host_load = 1;
2059
2060         /* flow isolation (DRR++) */
2061         if (flow->deficit <= 0) {
2062                 /* Keep all flows whose deficit is used up out of the sparse and
2063                  * decaying rotations.  No non-empty flow can go into the decaying
2064                  * rotation, so such flows cannot accumulate a deficit there.
2065                  */
2066                 if (flow->set == CAKE_SET_SPARSE) {
2067                         if (flow->head) {
2068                                 b->sparse_flow_count--;
2069                                 b->bulk_flow_count++;
2070
2071                                 if (cake_dsrc(q->flow_mode))
2072                                         srchost->srchost_bulk_flow_count++;
2073
2074                                 if (cake_ddst(q->flow_mode))
2075                                         dsthost->dsthost_bulk_flow_count++;
2076
2077                                 flow->set = CAKE_SET_BULK;
2078                         } else {
2079                                 /* we've moved it to the bulk rotation for
2080                                  * correct deficit accounting but we still want
2081                                  * to count it as a sparse flow, not a bulk one.
2082                                  */
2083                                 flow->set = CAKE_SET_SPARSE_WAIT;
2084                         }
2085                 }
2086
2087                 if (cake_dsrc(q->flow_mode))
2088                         host_load = max(host_load, srchost->srchost_bulk_flow_count);
2089
2090                 if (cake_ddst(q->flow_mode))
2091                         host_load = max(host_load, dsthost->dsthost_bulk_flow_count);
2092
2093                 WARN_ON(host_load > CAKE_QUEUES);
2094
2095                 /* The get_random_u16() is a way to apply dithering to avoid
2096                  * accumulating roundoff errors
2097                  */
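                /* quantum_div[h] is 65535 / h, an approximate Q16
                 * reciprocal, so the increment below is roughly
                 * flow_quantum / host_load bytes.  Hypothetical example:
                 * flow_quantum 1514 and host_load 4 gives
                 * (1514 * 16383 + dither) >> 16, about 378 bytes per round
                 * for each of that host's four bulk flows.
                 */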
2098                 flow->deficit += (b->flow_quantum * quantum_div[host_load] +
2099                                   get_random_u16()) >> 16;
2100                 list_move_tail(&flow->flowchain, &b->old_flows);
2101
2102                 goto retry;
2103         }
2104
2105         /* Retrieve a packet via the AQM */
2106         while (1) {
2107                 skb = cake_dequeue_one(sch);
2108                 if (!skb) {
2109                         /* this queue was actually empty */
2110                         if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
2111                                 b->unresponsive_flow_count--;
2112
2113                         if (flow->cvars.p_drop || flow->cvars.count ||
2114                             ktime_before(now, flow->cvars.drop_next)) {
2115                                 /* keep in the flowchain until the state has
2116                                  * decayed to rest
2117                                  */
2118                                 list_move_tail(&flow->flowchain,
2119                                                &b->decaying_flows);
2120                                 if (flow->set == CAKE_SET_BULK) {
2121                                         b->bulk_flow_count--;
2122
2123                                         if (cake_dsrc(q->flow_mode))
2124                                                 srchost->srchost_bulk_flow_count--;
2125
2126                                         if (cake_ddst(q->flow_mode))
2127                                                 dsthost->dsthost_bulk_flow_count--;
2128
2129                                         b->decaying_flow_count++;
2130                                 } else if (flow->set == CAKE_SET_SPARSE ||
2131                                            flow->set == CAKE_SET_SPARSE_WAIT) {
2132                                         b->sparse_flow_count--;
2133                                         b->decaying_flow_count++;
2134                                 }
2135                                 flow->set = CAKE_SET_DECAYING;
2136                         } else {
2137                                 /* remove empty queue from the flowchain */
2138                                 list_del_init(&flow->flowchain);
2139                                 if (flow->set == CAKE_SET_SPARSE ||
2140                                     flow->set == CAKE_SET_SPARSE_WAIT)
2141                                         b->sparse_flow_count--;
2142                                 else if (flow->set == CAKE_SET_BULK) {
2143                                         b->bulk_flow_count--;
2144
2145                                         if (cake_dsrc(q->flow_mode))
2146                                                 srchost->srchost_bulk_flow_count--;
2147
2148                                         if (cake_ddst(q->flow_mode))
2149                                                 dsthost->dsthost_bulk_flow_count--;
2150
2151                                 } else
2152                                         b->decaying_flow_count--;
2153
2154                                 flow->set = CAKE_SET_NONE;
2155                         }
2156                         goto begin;
2157                 }
2158
2159                 /* Last packet in the queue may be ECN-marked; it shouldn't be dropped */
2160                 if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
2161                                         (b->bulk_flow_count *
2162                                          !!(q->rate_flags &
2163                                             CAKE_FLAG_INGRESS))) ||
2164                     !flow->head)
2165                         break;
2166
2167                 /* drop this packet, get another one */
2168                 if (q->rate_flags & CAKE_FLAG_INGRESS) {
2169                         len = cake_advance_shaper(q, b, skb,
2170                                                   now, true);
2171                         flow->deficit -= len;
2172                         b->tin_deficit -= len;
2173                 }
2174                 flow->dropped++;
2175                 b->tin_dropped++;
2176                 qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
2177                 qdisc_qstats_drop(sch);
2178                 kfree_skb(skb);
2179                 if (q->rate_flags & CAKE_FLAG_INGRESS)
2180                         goto retry;
2181         }
2182
2183         b->tin_ecn_mark += !!flow->cvars.ecn_marked;
2184         qdisc_bstats_update(sch, skb);
2185
2186         /* collect delay stats */
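        /* The three EWMAs below use asymmetric shifts: peak_delay reacts
         * quickly (shift 2) when delay rises and decays slowly (shift 8),
         * base_delay does the opposite, giving cheap estimates of recent
         * worst-case and best-case sojourn times for the tin statistics.
         */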
2187         delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
2188         b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
2189         b->peak_delay = cake_ewma(b->peak_delay, delay,
2190                                   delay > b->peak_delay ? 2 : 8);
2191         b->base_delay = cake_ewma(b->base_delay, delay,
2192                                   delay < b->base_delay ? 2 : 8);
2193
2194         len = cake_advance_shaper(q, b, skb, now, false);
2195         flow->deficit -= len;
2196         b->tin_deficit -= len;
2197
2198         if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
2199                 u64 next = min(ktime_to_ns(q->time_next_packet),
2200                                ktime_to_ns(q->failsafe_next_packet));
2201
2202                 qdisc_watchdog_schedule_ns(&q->watchdog, next);
2203         } else if (!sch->q.qlen) {
2204                 int i;
2205
2206                 for (i = 0; i < q->tin_cnt; i++) {
2207                         if (q->tins[i].decaying_flow_count) {
2208                                 ktime_t next = \
2209                                         ktime_add_ns(now,
2210                                                      q->tins[i].cparams.target);
2211
2212                                 qdisc_watchdog_schedule_ns(&q->watchdog,
2213                                                            ktime_to_ns(next));
2214                                 break;
2215                         }
2216                 }
2217         }
2218
2219         if (q->overflow_timeout)
2220                 q->overflow_timeout--;
2221
2222         return skb;
2223 }
2224
2225 static void cake_reset(struct Qdisc *sch)
2226 {
2227         struct cake_sched_data *q = qdisc_priv(sch);
2228         u32 c;
2229
2230         if (!q->tins)
2231                 return;
2232
2233         for (c = 0; c < CAKE_MAX_TINS; c++)
2234                 cake_clear_tin(sch, c);
2235 }
2236
2237 static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
2238         [TCA_CAKE_BASE_RATE64]   = { .type = NLA_U64 },
2239         [TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 },
2240         [TCA_CAKE_ATM]           = { .type = NLA_U32 },
2241         [TCA_CAKE_FLOW_MODE]     = { .type = NLA_U32 },
2242         [TCA_CAKE_OVERHEAD]      = { .type = NLA_S32 },
2243         [TCA_CAKE_RTT]           = { .type = NLA_U32 },
2244         [TCA_CAKE_TARGET]        = { .type = NLA_U32 },
2245         [TCA_CAKE_AUTORATE]      = { .type = NLA_U32 },
2246         [TCA_CAKE_MEMORY]        = { .type = NLA_U32 },
2247         [TCA_CAKE_NAT]           = { .type = NLA_U32 },
2248         [TCA_CAKE_RAW]           = { .type = NLA_U32 },
2249         [TCA_CAKE_WASH]          = { .type = NLA_U32 },
2250         [TCA_CAKE_MPU]           = { .type = NLA_U32 },
2251         [TCA_CAKE_INGRESS]       = { .type = NLA_U32 },
2252         [TCA_CAKE_ACK_FILTER]    = { .type = NLA_U32 },
2253         [TCA_CAKE_SPLIT_GSO]     = { .type = NLA_U32 },
2254         [TCA_CAKE_FWMARK]        = { .type = NLA_U32 },
2255 };
2256
2257 static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
2258                           u64 target_ns, u64 rtt_est_ns)
2259 {
2260         /* convert byte-rate into time-per-byte, clamping to a minimum
2261          * rate so the shaper will always unwedge in reasonable time.
2262          */
2263         static const u64 MIN_RATE = 64;
2264         u32 byte_target = mtu;
2265         u64 byte_target_ns;
2266         u8  rate_shft = 0;
2267         u64 rate_ns = 0;
2268
2269         b->flow_quantum = 1514;
2270         if (rate) {
2271                 b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL);
2272                 rate_shft = 34;
2273                 rate_ns = ((u64)NSEC_PER_SEC) << rate_shft;
2274                 rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate));
2275                 while (!!(rate_ns >> 34)) {
2276                         rate_ns >>= 1;
2277                         rate_shft--;
2278                 }
2279         } /* else unlimited, ie. zero delay */
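        /* At this point rate_ns / 2^rate_shft is the time per byte in ns,
         * held as a fixed-point reciprocal.  Illustrative example: rate =
         * 12500000 (12.5 Mbyte/s, i.e. 100 Mbit/s) gives 80 ns per byte,
         * so a 1514 byte MTU produces a byte_target_ns (mtu_time) of about
         * 121 us below, and cparams.target becomes max(1.5 * that, target_ns).
         */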
2280
2281         b->tin_rate_bps  = rate;
2282         b->tin_rate_ns   = rate_ns;
2283         b->tin_rate_shft = rate_shft;
2284
2285         byte_target_ns = (byte_target * rate_ns) >> rate_shft;
2286
2287         b->cparams.target = max((byte_target_ns * 3) / 2, target_ns);
2288         b->cparams.interval = max(rtt_est_ns +
2289                                      b->cparams.target - target_ns,
2290                                      b->cparams.target * 2);
2291         b->cparams.mtu_time = byte_target_ns;
2292         b->cparams.p_inc = 1 << 24; /* 1/256 */
2293         b->cparams.p_dec = 1 << 20; /* 1/4096 */
2294 }
2295
2296 static int cake_config_besteffort(struct Qdisc *sch)
2297 {
2298         struct cake_sched_data *q = qdisc_priv(sch);
2299         struct cake_tin_data *b = &q->tins[0];
2300         u32 mtu = psched_mtu(qdisc_dev(sch));
2301         u64 rate = q->rate_bps;
2302
2303         q->tin_cnt = 1;
2304
2305         q->tin_index = besteffort;
2306         q->tin_order = normal_order;
2307
2308         cake_set_rate(b, rate, mtu,
2309                       us_to_ns(q->target), us_to_ns(q->interval));
2310         b->tin_quantum = 65535;
2311
2312         return 0;
2313 }
2314
2315 static int cake_config_precedence(struct Qdisc *sch)
2316 {
2317         /* convert high-level (user visible) parameters into internal format */
2318         struct cake_sched_data *q = qdisc_priv(sch);
2319         u32 mtu = psched_mtu(qdisc_dev(sch));
2320         u64 rate = q->rate_bps;
2321         u32 quantum = 256;
2322         u32 i;
2323
2324         q->tin_cnt = 8;
2325         q->tin_index = precedence;
2326         q->tin_order = normal_order;
2327
2328         for (i = 0; i < q->tin_cnt; i++) {
2329                 struct cake_tin_data *b = &q->tins[i];
2330
2331                 cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2332                               us_to_ns(q->interval));
2333
2334                 b->tin_quantum = max_t(u16, 1U, quantum);
2335
2336                 /* calculate next class's parameters */
2337                 rate  *= 7;
2338                 rate >>= 3;
2339
2340                 quantum  *= 7;
2341                 quantum >>= 3;
2342         }
2343
2344         return 0;
2345 }
2346
2347 /*      List of known Diffserv codepoints:
2348  *
2349  *      Default Forwarding (DF/CS0) - Best Effort
2350  *      Max Throughput (TOS2)
2351  *      Min Delay (TOS4)
2352  *      LLT "La" (TOS5)
2353  *      Assured Forwarding 1 (AF1x) - x3
2354  *      Assured Forwarding 2 (AF2x) - x3
2355  *      Assured Forwarding 3 (AF3x) - x3
2356  *      Assured Forwarding 4 (AF4x) - x3
2357  *      Precedence Class 1 (CS1)
2358  *      Precedence Class 2 (CS2)
2359  *      Precedence Class 3 (CS3)
2360  *      Precedence Class 4 (CS4)
2361  *      Precedence Class 5 (CS5)
2362  *      Precedence Class 6 (CS6)
2363  *      Precedence Class 7 (CS7)
2364  *      Voice Admit (VA)
2365  *      Expedited Forwarding (EF)
2366  *      Lower Effort (LE)
2367  *
2368  *      Total 26 codepoints.
2369  */
2370
2371 /*      List of traffic classes in RFC 4594, updated by RFC 8622:
2372  *              (roughly descending order of contended priority)
2373  *              (roughly ascending order of uncontended throughput)
2374  *
2375  *      Network Control (CS6,CS7)      - routing traffic
2376  *      Telephony (EF,VA)         - aka. VoIP streams
2377  *      Signalling (CS5)               - VoIP setup
2378  *      Multimedia Conferencing (AF4x) - aka. video calls
2379  *      Realtime Interactive (CS4)     - eg. games
2380  *      Multimedia Streaming (AF3x)    - eg. YouTube, NetFlix, Twitch
2381  *      Broadcast Video (CS3)
2382  *      Low-Latency Data (AF2x,TOS4)      - eg. database
2383  *      Ops, Admin, Management (CS2)      - eg. ssh
2384  *      Standard Service (DF & unrecognised codepoints)
2385  *      High-Throughput Data (AF1x,TOS2)  - eg. web traffic
2386  *      Low-Priority Data (LE,CS1)        - eg. BitTorrent
2387  *
2388  *      Total 12 traffic classes.
2389  */
2390
2391 static int cake_config_diffserv8(struct Qdisc *sch)
2392 {
2393 /*      Pruned list of traffic classes for typical applications:
2394  *
2395  *              Network Control          (CS6, CS7)
2396  *              Minimum Latency          (EF, VA, CS5, CS4)
2397  *              Interactive Shell        (CS2)
2398  *              Low Latency Transactions (AF2x, TOS4)
2399  *              Video Streaming          (AF4x, AF3x, CS3)
2400  *              Bog Standard             (DF etc.)
2401  *              High Throughput          (AF1x, TOS2, CS1)
2402  *              Background Traffic       (LE)
2403  *
2404  *              Total 8 traffic classes.
2405  */
2406
2407         struct cake_sched_data *q = qdisc_priv(sch);
2408         u32 mtu = psched_mtu(qdisc_dev(sch));
2409         u64 rate = q->rate_bps;
2410         u32 quantum = 256;
2411         u32 i;
2412
2413         q->tin_cnt = 8;
2414
2415         /* codepoint to class mapping */
2416         q->tin_index = diffserv8;
2417         q->tin_order = normal_order;
2418
2419         /* class characteristics */
2420         for (i = 0; i < q->tin_cnt; i++) {
2421                 struct cake_tin_data *b = &q->tins[i];
2422
2423                 cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2424                               us_to_ns(q->interval));
2425
2426                 b->tin_quantum = max_t(u16, 1U, quantum);
2427
2428                 /* calculate next class's parameters */
2429                 rate  *= 7;
2430                 rate >>= 3;
2431
2432                 quantum  *= 7;
2433                 quantum >>= 3;
2434         }
2435
2436         return 0;
2437 }
2438
2439 static int cake_config_diffserv4(struct Qdisc *sch)
2440 {
2441 /*  Further pruned list of traffic classes for four-class system:
2442  *
2443  *          Latency Sensitive  (CS7, CS6, EF, VA, CS5, CS4)
2444  *          Streaming Media    (AF4x, AF3x, CS3, AF2x, TOS4, CS2)
2445  *          Best Effort        (DF, AF1x, TOS2, and those not specified)
2446  *          Background Traffic (LE, CS1)
2447  *
2448  *              Total 4 traffic classes.
2449  */
2450
2451         struct cake_sched_data *q = qdisc_priv(sch);
2452         u32 mtu = psched_mtu(qdisc_dev(sch));
2453         u64 rate = q->rate_bps;
2454         u32 quantum = 1024;
2455
2456         q->tin_cnt = 4;
2457
2458         /* codepoint to class mapping */
2459         q->tin_index = diffserv4;
2460         q->tin_order = bulk_order;
2461
2462         /* class characteristics */
2463         cake_set_rate(&q->tins[0], rate, mtu,
2464                       us_to_ns(q->target), us_to_ns(q->interval));
2465         cake_set_rate(&q->tins[1], rate >> 4, mtu,
2466                       us_to_ns(q->target), us_to_ns(q->interval));
2467         cake_set_rate(&q->tins[2], rate >> 1, mtu,
2468                       us_to_ns(q->target), us_to_ns(q->interval));
2469         cake_set_rate(&q->tins[3], rate >> 2, mtu,
2470                       us_to_ns(q->target), us_to_ns(q->interval));
2471
2472         /* bandwidth-sharing weights */
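        /* These quantum ratios mirror the rate thresholds above: the four
         * tins get roughly 100%, 1/16, 1/2 and 1/4 both as bandwidth-share
         * thresholds and as DRR weights (commonly labelled Best Effort,
         * Bulk, Video and Voice in tc-cake(8)).
         */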
2473         q->tins[0].tin_quantum = quantum;
2474         q->tins[1].tin_quantum = quantum >> 4;
2475         q->tins[2].tin_quantum = quantum >> 1;
2476         q->tins[3].tin_quantum = quantum >> 2;
2477
2478         return 0;
2479 }
2480
2481 static int cake_config_diffserv3(struct Qdisc *sch)
2482 {
2483 /*  Simplified Diffserv structure with 3 tins.
2484  *              Latency Sensitive       (CS7, CS6, EF, VA, TOS4)
2485  *              Best Effort
2486  *              Low Priority            (LE, CS1)
2487  */
2488         struct cake_sched_data *q = qdisc_priv(sch);
2489         u32 mtu = psched_mtu(qdisc_dev(sch));
2490         u64 rate = q->rate_bps;
2491         u32 quantum = 1024;
2492
2493         q->tin_cnt = 3;
2494
2495         /* codepoint to class mapping */
2496         q->tin_index = diffserv3;
2497         q->tin_order = bulk_order;
2498
2499         /* class characteristics */
2500         cake_set_rate(&q->tins[0], rate, mtu,
2501                       us_to_ns(q->target), us_to_ns(q->interval));
2502         cake_set_rate(&q->tins[1], rate >> 4, mtu,
2503                       us_to_ns(q->target), us_to_ns(q->interval));
2504         cake_set_rate(&q->tins[2], rate >> 2, mtu,
2505                       us_to_ns(q->target), us_to_ns(q->interval));
2506
2507         /* bandwidth-sharing weights */
2508         q->tins[0].tin_quantum = quantum;
2509         q->tins[1].tin_quantum = quantum >> 4;
2510         q->tins[2].tin_quantum = quantum >> 2;
2511
2512         return 0;
2513 }
2514
2515 static void cake_reconfigure(struct Qdisc *sch)
2516 {
2517         struct cake_sched_data *q = qdisc_priv(sch);
2518         int c, ft;
2519
2520         switch (q->tin_mode) {
2521         case CAKE_DIFFSERV_BESTEFFORT:
2522                 ft = cake_config_besteffort(sch);
2523                 break;
2524
2525         case CAKE_DIFFSERV_PRECEDENCE:
2526                 ft = cake_config_precedence(sch);
2527                 break;
2528
2529         case CAKE_DIFFSERV_DIFFSERV8:
2530                 ft = cake_config_diffserv8(sch);
2531                 break;
2532
2533         case CAKE_DIFFSERV_DIFFSERV4:
2534                 ft = cake_config_diffserv4(sch);
2535                 break;
2536
2537         case CAKE_DIFFSERV_DIFFSERV3:
2538         default:
2539                 ft = cake_config_diffserv3(sch);
2540                 break;
2541         }
2542
2543         for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) {
2544                 cake_clear_tin(sch, c);
2545                 q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time;
2546         }
2547
2548         q->rate_ns   = q->tins[ft].tin_rate_ns;
2549         q->rate_shft = q->tins[ft].tin_rate_shft;
2550
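        /* Size the packet buffer when the user hasn't: with a configured
         * rate this works out to roughly four times the bytes sent in one
         * interval (4 x rate x interval), with a 4 MB floor; in unlimited
         * mode the byte limit is effectively off and only the cap of
         * sch->limit * MTU applied below remains.
         */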
2551         if (q->buffer_config_limit) {
2552                 q->buffer_limit = q->buffer_config_limit;
2553         } else if (q->rate_bps) {
2554                 u64 t = q->rate_bps * q->interval;
2555
2556                 do_div(t, USEC_PER_SEC / 4);
2557                 q->buffer_limit = max_t(u32, t, 4U << 20);
2558         } else {
2559                 q->buffer_limit = ~0;
2560         }
2561
2562         sch->flags &= ~TCQ_F_CAN_BYPASS;
2563
2564         q->buffer_limit = min(q->buffer_limit,
2565                               max(sch->limit * psched_mtu(qdisc_dev(sch)),
2566                                   q->buffer_config_limit));
2567 }
2568
2569 static int cake_change(struct Qdisc *sch, struct nlattr *opt,
2570                        struct netlink_ext_ack *extack)
2571 {
2572         struct cake_sched_data *q = qdisc_priv(sch);
2573         struct nlattr *tb[TCA_CAKE_MAX + 1];
2574         int err;
2575
2576         err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy,
2577                                           extack);
2578         if (err < 0)
2579                 return err;
2580
2581         if (tb[TCA_CAKE_NAT]) {
2582 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
2583                 q->flow_mode &= ~CAKE_FLOW_NAT_FLAG;
2584                 q->flow_mode |= CAKE_FLOW_NAT_FLAG *
2585                         !!nla_get_u32(tb[TCA_CAKE_NAT]);
2586 #else
2587                 NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT],
2588                                     "No conntrack support in kernel");
2589                 return -EOPNOTSUPP;
2590 #endif
2591         }
2592
2593         if (tb[TCA_CAKE_BASE_RATE64])
2594                 q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]);
2595
2596         if (tb[TCA_CAKE_DIFFSERV_MODE])
2597                 q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]);
2598
2599         if (tb[TCA_CAKE_WASH]) {
2600                 if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
2601                         q->rate_flags |= CAKE_FLAG_WASH;
2602                 else
2603                         q->rate_flags &= ~CAKE_FLAG_WASH;
2604         }
2605
2606         if (tb[TCA_CAKE_FLOW_MODE])
2607                 q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) |
2608                                 (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
2609                                         CAKE_FLOW_MASK));
2610
2611         if (tb[TCA_CAKE_ATM])
2612                 q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]);
2613
2614         if (tb[TCA_CAKE_OVERHEAD]) {
2615                 q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]);
2616                 q->rate_flags |= CAKE_FLAG_OVERHEAD;
2617
2618                 q->max_netlen = 0;
2619                 q->max_adjlen = 0;
2620                 q->min_netlen = ~0;
2621                 q->min_adjlen = ~0;
2622         }
2623
2624         if (tb[TCA_CAKE_RAW]) {
2625                 q->rate_flags &= ~CAKE_FLAG_OVERHEAD;
2626
2627                 q->max_netlen = 0;
2628                 q->max_adjlen = 0;
2629                 q->min_netlen = ~0;
2630                 q->min_adjlen = ~0;
2631         }
2632
2633         if (tb[TCA_CAKE_MPU])
2634                 q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]);
2635
2636         if (tb[TCA_CAKE_RTT]) {
2637                 q->interval = nla_get_u32(tb[TCA_CAKE_RTT]);
2638
2639                 if (!q->interval)
2640                         q->interval = 1;
2641         }
2642
2643         if (tb[TCA_CAKE_TARGET]) {
2644                 q->target = nla_get_u32(tb[TCA_CAKE_TARGET]);
2645
2646                 if (!q->target)
2647                         q->target = 1;
2648         }
2649
2650         if (tb[TCA_CAKE_AUTORATE]) {
2651                 if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE]))
2652                         q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
2653                 else
2654                         q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
2655         }
2656
2657         if (tb[TCA_CAKE_INGRESS]) {
2658                 if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
2659                         q->rate_flags |= CAKE_FLAG_INGRESS;
2660                 else
2661                         q->rate_flags &= ~CAKE_FLAG_INGRESS;
2662         }
2663
2664         if (tb[TCA_CAKE_ACK_FILTER])
2665                 q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]);
2666
2667         if (tb[TCA_CAKE_MEMORY])
2668                 q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]);
2669
2670         if (tb[TCA_CAKE_SPLIT_GSO]) {
2671                 if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO]))
2672                         q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2673                 else
2674                         q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
2675         }
2676
2677         if (tb[TCA_CAKE_FWMARK]) {
2678                 q->fwmark_mask = nla_get_u32(tb[TCA_CAKE_FWMARK]);
2679                 q->fwmark_shft = q->fwmark_mask ? __ffs(q->fwmark_mask) : 0;
2680         }
2681
2682         if (q->tins) {
2683                 sch_tree_lock(sch);
2684                 cake_reconfigure(sch);
2685                 sch_tree_unlock(sch);
2686         }
2687
2688         return 0;
2689 }
2690
2691 static void cake_destroy(struct Qdisc *sch)
2692 {
2693         struct cake_sched_data *q = qdisc_priv(sch);
2694
2695         qdisc_watchdog_cancel(&q->watchdog);
2696         tcf_block_put(q->block);
2697         kvfree(q->tins);
2698 }
2699
2700 static int cake_init(struct Qdisc *sch, struct nlattr *opt,
2701                      struct netlink_ext_ack *extack)
2702 {
2703         struct cake_sched_data *q = qdisc_priv(sch);
2704         int i, j, err;
2705
2706         sch->limit = 10240;
2707         q->tin_mode = CAKE_DIFFSERV_DIFFSERV3;
2708         q->flow_mode  = CAKE_FLOW_TRIPLE;
2709
2710         q->rate_bps = 0; /* unlimited by default */
2711
2712         q->interval = 100000; /* 100ms default */
2713         q->target   =   5000; /* 5ms: codel RFC argues
2714                                * for 5 to 10% of interval
2715                                */
2716         q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2717         q->cur_tin = 0;
2718         q->cur_flow  = 0;
2719
2720         qdisc_watchdog_init(&q->watchdog, sch);
2721
2722         if (opt) {
2723                 err = cake_change(sch, opt, extack);
2724
2725                 if (err)
2726                         return err;
2727         }
2728
2729         err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
2730         if (err)
2731                 return err;
2732
2733         quantum_div[0] = ~0;
2734         for (i = 1; i <= CAKE_QUEUES; i++)
2735                 quantum_div[i] = 65535 / i;
2736
2737         q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data),
2738                            GFP_KERNEL);
2739         if (!q->tins)
2740                 return -ENOMEM;
2741
2742         for (i = 0; i < CAKE_MAX_TINS; i++) {
2743                 struct cake_tin_data *b = q->tins + i;
2744
2745                 INIT_LIST_HEAD(&b->new_flows);
2746                 INIT_LIST_HEAD(&b->old_flows);
2747                 INIT_LIST_HEAD(&b->decaying_flows);
2748                 b->sparse_flow_count = 0;
2749                 b->bulk_flow_count = 0;
2750                 b->decaying_flow_count = 0;
2751
2752                 for (j = 0; j < CAKE_QUEUES; j++) {
2753                         struct cake_flow *flow = b->flows + j;
2754                         u32 k = j * CAKE_MAX_TINS + i;
2755
2756                         INIT_LIST_HEAD(&flow->flowchain);
2757                         cobalt_vars_init(&flow->cvars);
2758
2759                         q->overflow_heap[k].t = i;
2760                         q->overflow_heap[k].b = j;
2761                         b->overflow_idx[j] = k;
2762                 }
2763         }
2764
2765         cake_reconfigure(sch);
2766         q->avg_peak_bandwidth = q->rate_bps;
2767         q->min_netlen = ~0;
2768         q->min_adjlen = ~0;
2769         return 0;
2770 }
2771
static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct cake_sched_data *q = qdisc_priv(sch);
	struct nlattr *opts;

	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
	if (!opts)
		goto nla_put_failure;

	if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps,
			      TCA_CAKE_PAD))
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE,
			q->flow_mode & CAKE_FLOW_MASK))
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval))
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target))
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit))
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
			!!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_CAKE_INGRESS,
			!!(q->rate_flags & CAKE_FLAG_INGRESS)))
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter))
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_CAKE_NAT,
			!!(q->flow_mode & CAKE_FLOW_NAT_FLAG)))
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode))
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_CAKE_WASH,
			!!(q->rate_flags & CAKE_FLAG_WASH)))
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead))
		goto nla_put_failure;

	if (!(q->rate_flags & CAKE_FLAG_OVERHEAD))
		if (nla_put_u32(skb, TCA_CAKE_RAW, 0))
			goto nla_put_failure;

	if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode))
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu))
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO,
			!!(q->rate_flags & CAKE_FLAG_SPLIT_GSO)))
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_CAKE_FWMARK, q->fwmark_mask))
		goto nla_put_failure;

	return nla_nest_end(skb, opts);

nla_put_failure:
	return -1;
}

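/* Dump global statistics (capacity estimate, memory usage, packet-length
 * extremes), followed by one nested TCA_CAKE_TIN_STATS block per active tin.
 */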
static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
	struct cake_sched_data *q = qdisc_priv(sch);
	struct nlattr *tstats, *ts;
	int i;

	if (!stats)
		return -1;

#define PUT_STAT_U32(attr, data) do {				       \
		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
			goto nla_put_failure;			       \
	} while (0)
#define PUT_STAT_U64(attr, data) do {				       \
		if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \
					data, TCA_CAKE_STATS_PAD))     \
			goto nla_put_failure;			       \
	} while (0)

	PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth);
	PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit);
	PUT_STAT_U32(MEMORY_USED, q->buffer_max_used);
	PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16));
	PUT_STAT_U32(MAX_NETLEN, q->max_netlen);
	PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen);
	PUT_STAT_U32(MIN_NETLEN, q->min_netlen);
	PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen);

#undef PUT_STAT_U32
#undef PUT_STAT_U64

	tstats = nla_nest_start_noflag(d->skb, TCA_CAKE_STATS_TIN_STATS);
	if (!tstats)
		goto nla_put_failure;

#define PUT_TSTAT_U32(attr, data) do {					\
		if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \
			goto nla_put_failure;				\
	} while (0)
#define PUT_TSTAT_U64(attr, data) do {					\
		if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \
					data, TCA_CAKE_TIN_STATS_PAD))	\
			goto nla_put_failure;				\
	} while (0)

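	/* Tins are dumped via q->tin_order, which maps each position in the
	 * canonical presentation order of the configured Diffserv mode to
	 * the internal tin index.
	 */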
	for (i = 0; i < q->tin_cnt; i++) {
		struct cake_tin_data *b = &q->tins[q->tin_order[i]];

		ts = nla_nest_start_noflag(d->skb, i + 1);
		if (!ts)
			goto nla_put_failure;

		PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps);
		PUT_TSTAT_U64(SENT_BYTES64, b->bytes);
		PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog);

		PUT_TSTAT_U32(TARGET_US,
			      ktime_to_us(ns_to_ktime(b->cparams.target)));
		PUT_TSTAT_U32(INTERVAL_US,
			      ktime_to_us(ns_to_ktime(b->cparams.interval)));

		PUT_TSTAT_U32(SENT_PACKETS, b->packets);
		PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped);
		PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark);
		PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops);

		PUT_TSTAT_U32(PEAK_DELAY_US,
			      ktime_to_us(ns_to_ktime(b->peak_delay)));
		PUT_TSTAT_U32(AVG_DELAY_US,
			      ktime_to_us(ns_to_ktime(b->avge_delay)));
		PUT_TSTAT_U32(BASE_DELAY_US,
			      ktime_to_us(ns_to_ktime(b->base_delay)));

		PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits);
		PUT_TSTAT_U32(WAY_MISSES, b->way_misses);
		PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions);

		PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count +
					    b->decaying_flow_count);
		PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count);
		PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count);
		PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen);

		PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum);
		nla_nest_end(d->skb, ts);
	}

#undef PUT_TSTAT_U32
#undef PUT_TSTAT_U64

	nla_nest_end(d->skb, tstats);
	return nla_nest_end(d->skb, stats);

nla_put_failure:
	nla_nest_cancel(d->skb, stats);
	return -1;
}

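/* CAKE is a classless qdisc; the class operations below are minimal stubs,
 * provided so that tc filters can be attached (cake_tcf_block) and so that
 * individual flow queues can be inspected as pseudo-classes
 * (cake_dump_class_stats, cake_walk).
 */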
static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg)
{
	return NULL;
}

static unsigned long cake_find(struct Qdisc *sch, u32 classid)
{
	return 0;
}

static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent,
			       u32 classid)
{
	return 0;
}

static void cake_unbind(struct Qdisc *q, unsigned long cl)
{
}

static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl,
					struct netlink_ext_ack *extack)
{
	struct cake_sched_data *q = qdisc_priv(sch);

	if (cl)
		return NULL;
	return q->block;
}

static int cake_dump_class(struct Qdisc *sch, unsigned long cl,
			   struct sk_buff *skb, struct tcmsg *tcm)
{
	tcm->tcm_handle |= TC_H_MIN(cl);
	return 0;
}

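/* Per-flow statistics.  The class ID encodes the flow as
 * (tin_order index * CAKE_QUEUES) + flow index + 1; report that flow's
 * queue occupancy plus its COBALT/BLUE state.
 */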
static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
				 struct gnet_dump *d)
{
	struct cake_sched_data *q = qdisc_priv(sch);
	const struct cake_flow *flow = NULL;
	struct gnet_stats_queue qs = { 0 };
	struct nlattr *stats;
	u32 idx = cl - 1;

	if (idx < CAKE_QUEUES * q->tin_cnt) {
		const struct cake_tin_data *b =
			&q->tins[q->tin_order[idx / CAKE_QUEUES]];
		const struct sk_buff *skb;

		flow = &b->flows[idx % CAKE_QUEUES];

		if (flow->head) {
			sch_tree_lock(sch);
			skb = flow->head;
			while (skb) {
				qs.qlen++;
				skb = skb->next;
			}
			sch_tree_unlock(sch);
		}
		qs.backlog = b->backlogs[idx % CAKE_QUEUES];
		qs.drops = flow->dropped;
	}
	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
		return -1;
	if (flow) {
		ktime_t now = ktime_get();

		stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
		if (!stats)
			return -1;

#define PUT_STAT_U32(attr, data) do {				       \
		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
			goto nla_put_failure;			       \
	} while (0)
#define PUT_STAT_S32(attr, data) do {				       \
		if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
			goto nla_put_failure;			       \
	} while (0)

		PUT_STAT_S32(DEFICIT, flow->deficit);
		PUT_STAT_U32(DROPPING, flow->cvars.dropping);
		PUT_STAT_U32(COBALT_COUNT, flow->cvars.count);
		PUT_STAT_U32(P_DROP, flow->cvars.p_drop);
		if (flow->cvars.p_drop) {
			PUT_STAT_S32(BLUE_TIMER_US,
				     ktime_to_us(
					     ktime_sub(now,
						       flow->cvars.blue_timer)));
		}
		if (flow->cvars.dropping) {
			PUT_STAT_S32(DROP_NEXT_US,
				     ktime_to_us(
					     ktime_sub(now,
						       flow->cvars.drop_next)));
		}

		if (nla_nest_end(d->skb, stats) < 0)
			return -1;
	}

	return 0;

nla_put_failure:
	nla_nest_cancel(d->skb, stats);
	return -1;
}

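/* Walk the flow queues as pseudo-classes.  Queues that are not currently
 * linked into any flow list are counted but not dumped.
 */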
static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct cake_sched_data *q = qdisc_priv(sch);
	unsigned int i, j;

	if (arg->stop)
		return;

	for (i = 0; i < q->tin_cnt; i++) {
		struct cake_tin_data *b = &q->tins[q->tin_order[i]];

		for (j = 0; j < CAKE_QUEUES; j++) {
			if (list_empty(&b->flows[j].flowchain)) {
				arg->count++;
				continue;
			}
			if (!tc_qdisc_stats_dump(sch, i * CAKE_QUEUES + j + 1,
						 arg))
				break;
		}
	}
}

static const struct Qdisc_class_ops cake_class_ops = {
	.leaf		=	cake_leaf,
	.find		=	cake_find,
	.tcf_block	=	cake_tcf_block,
	.bind_tcf	=	cake_bind,
	.unbind_tcf	=	cake_unbind,
	.dump		=	cake_dump_class,
	.dump_stats	=	cake_dump_class_stats,
	.walk		=	cake_walk,
};

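/* Top-level qdisc operations: the enqueue/dequeue fast path plus the
 * configuration and statistics hooks defined above.
 */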
static struct Qdisc_ops cake_qdisc_ops __read_mostly = {
	.cl_ops		=	&cake_class_ops,
	.id		=	"cake",
	.priv_size	=	sizeof(struct cake_sched_data),
	.enqueue	=	cake_enqueue,
	.dequeue	=	cake_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.init		=	cake_init,
	.reset		=	cake_reset,
	.destroy	=	cake_destroy,
	.change		=	cake_change,
	.dump		=	cake_dump,
	.dump_stats	=	cake_dump_stats,
	.owner		=	THIS_MODULE,
};

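/* Once registered, the qdisc is configured from userspace via iproute2's tc,
 * e.g. (illustrative only; all parameters are optional and have defaults):
 *
 *   tc qdisc replace dev eth0 root cake bandwidth 20Mbit diffserv3 nat
 */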
static int __init cake_module_init(void)
{
	return register_qdisc(&cake_qdisc_ops);
}

static void __exit cake_module_exit(void)
{
	unregister_qdisc(&cake_qdisc_ops);
}

module_init(cake_module_init)
module_exit(cake_module_exit)
MODULE_AUTHOR("Jonathan Morton");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_DESCRIPTION("The CAKE shaper.");