net/sched/sch_cake.c
1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2
3 /* COMMON Applications Kept Enhanced (CAKE) discipline
4  *
5  * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
6  * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
7  * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
8  * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
9  * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
10  * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au>
11  *
12  * The CAKE Principles:
13  *                 (or, how to have your cake and eat it too)
14  *
15  * This is a combination of several shaping, AQM and FQ techniques into one
16  * easy-to-use package:
17  *
18  * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
19  *   equipment and bloated MACs.  This operates in deficit mode (as in sch_fq),
20  *   eliminating the need for any sort of burst parameter (eg. token bucket
21  *   depth).  Burst support is limited to that necessary to overcome scheduling
22  *   latency.
23  *
24  * - A Diffserv-aware priority queue, giving more priority to certain classes,
25  *   up to a specified fraction of bandwidth.  Above that bandwidth threshold,
26  *   the priority is reduced to avoid starving other tins.
27  *
28  * - Each priority tin has a separate Flow Queue system, to isolate traffic
29  *   flows from each other.  This prevents a burst on one flow from increasing
30  *   the delay to another.  Flows are distributed to queues using a
31  *   set-associative hash function.
32  *
33  * - Each queue is actively managed by Cobalt, which is a combination of the
34  *   Codel and Blue AQM algorithms.  This serves flows fairly, and signals
35  *   congestion early via ECN (if available) and/or packet drops, to keep
36  *   latency low.  The codel parameters are auto-tuned based on the bandwidth
37  *   setting, as is necessary at low bandwidths.
38  *
39  * The configuration parameters are kept deliberately simple for ease of use.
40  * Everything has sane defaults.  Complete generality of configuration is *not*
41  * a goal.
42  *
43  * The priority queue operates according to a weighted DRR scheme, combined with
44  * a bandwidth tracker which reuses the shaper logic to detect which side of the
45  * bandwidth sharing threshold the tin is operating on.  This determines whether a
46  * priority-based weight (high) or a bandwidth-based weight (low) is used for
47  * that tin in the current pass.
48  *
49  * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
50  * granted us permission to leverage.
51  */
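/* Typical configuration (with a suitably recent iproute2) looks, for example,
 * like:
 *
 *   tc qdisc replace dev eth0 root cake bandwidth 20Mbit diffserv3 nat
 *
 * All of the keywords after "cake" are optional; anything left unspecified
 * keeps its default.
 */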
52
53 #include <linux/module.h>
54 #include <linux/types.h>
55 #include <linux/kernel.h>
56 #include <linux/jiffies.h>
57 #include <linux/string.h>
58 #include <linux/in.h>
59 #include <linux/errno.h>
60 #include <linux/init.h>
61 #include <linux/skbuff.h>
62 #include <linux/jhash.h>
63 #include <linux/slab.h>
64 #include <linux/vmalloc.h>
65 #include <linux/reciprocal_div.h>
66 #include <net/netlink.h>
67 #include <linux/version.h>
68 #include <linux/if_vlan.h>
69 #include <net/pkt_sched.h>
70 #include <net/pkt_cls.h>
71 #include <net/tcp.h>
72 #include <net/flow_dissector.h>
73
74 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
75 #include <net/netfilter/nf_conntrack_core.h>
76 #endif
77
78 #define CAKE_SET_WAYS (8)
79 #define CAKE_MAX_TINS (8)
80 #define CAKE_QUEUES (1024)
81 #define CAKE_FLOW_MASK 63
82 #define CAKE_FLOW_NAT_FLAG 64
83
84 /* struct cobalt_params - contains codel and blue parameters
85  * @interval:   codel initial drop rate
86  * @target:     maximum persistent sojourn time & blue update rate
87  * @mtu_time:   serialisation delay of maximum-size packet
88  * @p_inc:      increment of blue drop probability (0.32 fxp)
89  * @p_dec:      decrement of blue drop probability (0.32 fxp)
90  */
91 struct cobalt_params {
92         u64     interval;
93         u64     target;
94         u64     mtu_time;
95         u32     p_inc;
96         u32     p_dec;
97 };
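/* Note: p_inc and p_dec above are 0.32 fixed point, i.e. the full u32 range
 * maps onto the probability interval [0, 1); 0x80000000 corresponds to 0.5.
 */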
98
99 /* struct cobalt_vars - contains codel and blue variables
100  * @count:              codel dropping frequency
101  * @rec_inv_sqrt:       reciprocal value of sqrt(count) >> 1
102  * @drop_next:          time to drop next packet, or when we dropped last
103  * @blue_timer:         Blue time to next drop
104  * @p_drop:             BLUE drop probability (0.32 fxp)
105  * @dropping:           set if in dropping state
106  * @ecn_marked:         set if marked
107  */
108 struct cobalt_vars {
109         u32     count;
110         u32     rec_inv_sqrt;
111         ktime_t drop_next;
112         ktime_t blue_timer;
113         u32     p_drop;
114         bool    dropping;
115         bool    ecn_marked;
116 };
117
118 enum {
119         CAKE_SET_NONE = 0,
120         CAKE_SET_SPARSE,
121         CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */
122         CAKE_SET_BULK,
123         CAKE_SET_DECAYING
124 };
125
126 struct cake_flow {
127         /* this stuff is all needed per-flow at dequeue time */
128         struct sk_buff    *head;
129         struct sk_buff    *tail;
130         struct list_head  flowchain;
131         s32               deficit;
132         u32               dropped;
133         struct cobalt_vars cvars;
134         u16               srchost; /* index into cake_host table */
135         u16               dsthost;
136         u8                set;
137 }; /* please try to keep this structure <= 64 bytes */
138
139 struct cake_host {
140         u32 srchost_tag;
141         u32 dsthost_tag;
142         u16 srchost_refcnt;
143         u16 dsthost_refcnt;
144 };
145
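/* One heap entry names a single (tin, flow) queue: 3 bits are enough for
 * CAKE_MAX_TINS (8) and 10 bits for CAKE_QUEUES (1024).
 */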
146 struct cake_heap_entry {
147         u16 t:3, b:10;
148 };
149
150 struct cake_tin_data {
151         struct cake_flow flows[CAKE_QUEUES];
152         u32     backlogs[CAKE_QUEUES];
153         u32     tags[CAKE_QUEUES]; /* for set association */
154         u16     overflow_idx[CAKE_QUEUES];
155         struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
156         u16     flow_quantum;
157
158         struct cobalt_params cparams;
159         u32     drop_overlimit;
160         u16     bulk_flow_count;
161         u16     sparse_flow_count;
162         u16     decaying_flow_count;
163         u16     unresponsive_flow_count;
164
165         u32     max_skblen;
166
167         struct list_head new_flows;
168         struct list_head old_flows;
169         struct list_head decaying_flows;
170
171         /* time_next = time_this + ((len * rate_ns) >> rate_shft) */
172         ktime_t time_next_packet;
173         u64     tin_rate_ns;
174         u64     tin_rate_bps;
175         u16     tin_rate_shft;
176
177         u16     tin_quantum_prio;
178         u16     tin_quantum_band;
179         s32     tin_deficit;
180         u32     tin_backlog;
181         u32     tin_dropped;
182         u32     tin_ecn_mark;
183
184         u32     packets;
185         u64     bytes;
186
187         u32     ack_drops;
188
189         /* moving averages */
190         u64 avge_delay;
191         u64 peak_delay;
192         u64 base_delay;
193
194         /* hash function stats */
195         u32     way_directs;
196         u32     way_hits;
197         u32     way_misses;
198         u32     way_collisions;
199 }; /* number of tins is small, so size of this struct doesn't matter much */
200
201 struct cake_sched_data {
202         struct tcf_proto __rcu *filter_list; /* optional external classifier */
203         struct tcf_block *block;
204         struct cake_tin_data *tins;
205
206         struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];
207         u16             overflow_timeout;
208
209         u16             tin_cnt;
210         u8              tin_mode;
211         u8              flow_mode;
212         u8              ack_filter;
213         u8              atm_mode;
214
215         /* time_next = time_this + ((len * rate_ns) >> rate_shft) */
216         u16             rate_shft;
217         ktime_t         time_next_packet;
218         ktime_t         failsafe_next_packet;
219         u64             rate_ns;
220         u64             rate_bps;
221         u16             rate_flags;
222         s16             rate_overhead;
223         u16             rate_mpu;
224         u64             interval;
225         u64             target;
226
227         /* resource tracking */
228         u32             buffer_used;
229         u32             buffer_max_used;
230         u32             buffer_limit;
231         u32             buffer_config_limit;
232
233         /* indices for dequeue */
234         u16             cur_tin;
235         u16             cur_flow;
236
237         struct qdisc_watchdog watchdog;
238         const u8        *tin_index;
239         const u8        *tin_order;
240
241         /* bandwidth capacity estimate */
242         ktime_t         last_packet_time;
243         ktime_t         avg_window_begin;
244         u64             avg_packet_interval;
245         u64             avg_window_bytes;
246         u64             avg_peak_bandwidth;
247         ktime_t         last_reconfig_time;
248
249         /* packet length stats */
250         u32             avg_netoff;
251         u16             max_netlen;
252         u16             max_adjlen;
253         u16             min_netlen;
254         u16             min_adjlen;
255 };
256
257 enum {
258         CAKE_FLAG_OVERHEAD         = BIT(0),
259         CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
260         CAKE_FLAG_INGRESS          = BIT(2),
261         CAKE_FLAG_WASH             = BIT(3),
262         CAKE_FLAG_SPLIT_GSO        = BIT(4)
263 };
264
265 /* COBALT operates the Codel and BLUE algorithms in parallel, in order to
266  * obtain the best features of each.  Codel is excellent on flows which
267  * respond to congestion signals in a TCP-like way.  BLUE is more effective on
268  * unresponsive flows.
269  */
270
271 struct cobalt_skb_cb {
272         ktime_t enqueue_time;
273         u32     adjusted_len;
274 };
275
276 static u64 us_to_ns(u64 us)
277 {
278         return us * NSEC_PER_USEC;
279 }
280
281 static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
282 {
283         qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
284         return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data;
285 }
286
287 static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb)
288 {
289         return get_cobalt_cb(skb)->enqueue_time;
290 }
291
292 static void cobalt_set_enqueue_time(struct sk_buff *skb,
293                                     ktime_t now)
294 {
295         get_cobalt_cb(skb)->enqueue_time = now;
296 }
297
298 static u16 quantum_div[CAKE_QUEUES + 1] = {0};
299
300 /* Diffserv lookup tables */
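/* Each table below is indexed by the 6-bit DSCP value and yields the tin
 * number that codepoint is assigned to in the corresponding diffserv mode.
 */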
301
302 static const u8 precedence[] = {
303         0, 0, 0, 0, 0, 0, 0, 0,
304         1, 1, 1, 1, 1, 1, 1, 1,
305         2, 2, 2, 2, 2, 2, 2, 2,
306         3, 3, 3, 3, 3, 3, 3, 3,
307         4, 4, 4, 4, 4, 4, 4, 4,
308         5, 5, 5, 5, 5, 5, 5, 5,
309         6, 6, 6, 6, 6, 6, 6, 6,
310         7, 7, 7, 7, 7, 7, 7, 7,
311 };
312
313 static const u8 diffserv8[] = {
314         2, 5, 1, 2, 4, 2, 2, 2,
315         0, 2, 1, 2, 1, 2, 1, 2,
316         5, 2, 4, 2, 4, 2, 4, 2,
317         3, 2, 3, 2, 3, 2, 3, 2,
318         6, 2, 3, 2, 3, 2, 3, 2,
319         6, 2, 2, 2, 6, 2, 6, 2,
320         7, 2, 2, 2, 2, 2, 2, 2,
321         7, 2, 2, 2, 2, 2, 2, 2,
322 };
323
324 static const u8 diffserv4[] = {
325         0, 2, 0, 0, 2, 0, 0, 0,
326         1, 0, 0, 0, 0, 0, 0, 0,
327         2, 0, 2, 0, 2, 0, 2, 0,
328         2, 0, 2, 0, 2, 0, 2, 0,
329         3, 0, 2, 0, 2, 0, 2, 0,
330         3, 0, 0, 0, 3, 0, 3, 0,
331         3, 0, 0, 0, 0, 0, 0, 0,
332         3, 0, 0, 0, 0, 0, 0, 0,
333 };
334
335 static const u8 diffserv3[] = {
336         0, 0, 0, 0, 2, 0, 0, 0,
337         1, 0, 0, 0, 0, 0, 0, 0,
338         0, 0, 0, 0, 0, 0, 0, 0,
339         0, 0, 0, 0, 0, 0, 0, 0,
340         0, 0, 0, 0, 0, 0, 0, 0,
341         0, 0, 0, 0, 2, 0, 2, 0,
342         2, 0, 0, 0, 0, 0, 0, 0,
343         2, 0, 0, 0, 0, 0, 0, 0,
344 };
345
346 static const u8 besteffort[] = {
347         0, 0, 0, 0, 0, 0, 0, 0,
348         0, 0, 0, 0, 0, 0, 0, 0,
349         0, 0, 0, 0, 0, 0, 0, 0,
350         0, 0, 0, 0, 0, 0, 0, 0,
351         0, 0, 0, 0, 0, 0, 0, 0,
352         0, 0, 0, 0, 0, 0, 0, 0,
353         0, 0, 0, 0, 0, 0, 0, 0,
354         0, 0, 0, 0, 0, 0, 0, 0,
355 };
356
357 /* tin priority order for stats dumping */
358
359 static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
360 static const u8 bulk_order[] = {1, 0, 2, 3};
361
362 #define REC_INV_SQRT_CACHE (16)
363 static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0};
364
365 /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
366  * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
367  *
368  * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
369  */
370
371 static void cobalt_newton_step(struct cobalt_vars *vars)
372 {
373         u32 invsqrt, invsqrt2;
374         u64 val;
375
376         invsqrt = vars->rec_inv_sqrt;
377         invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
378         val = (3LL << 32) - ((u64)vars->count * invsqrt2);
379
380         val >>= 2; /* avoid overflow in following multiply */
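        /* The product below carries 30 + 32 = 62 fractional bits; shifting
         * right by 31 returns to Q0.32 and also applies the divide-by-two
         * from the formula, hence (32 - 2 + 1).
         */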
381         val = (val * invsqrt) >> (32 - 2 + 1);
382
383         vars->rec_inv_sqrt = val;
384 }
385
386 static void cobalt_invsqrt(struct cobalt_vars *vars)
387 {
388         if (vars->count < REC_INV_SQRT_CACHE)
389                 vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count];
390         else
391                 cobalt_newton_step(vars);
392 }
393
394 /* There is a big difference in timing between the accurate values placed in
395  * the cache and the approximations given by a single Newton step for small
396  * count values, particularly when stepping from count 1 to 2 or vice versa.
397  * Above 16, a single Newton step gives sufficient accuracy in either
398  * direction, given the precision stored.
399  *
400  * The magnitude of the error when stepping up to count 2 is such as to give
401  * the value that *should* have been produced at count 4.
402  */
403
404 static void cobalt_cache_init(void)
405 {
406         struct cobalt_vars v;
407
408         memset(&v, 0, sizeof(v));
409         v.rec_inv_sqrt = ~0U;
410         cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt;
411
412         for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) {
413                 cobalt_newton_step(&v);
414                 cobalt_newton_step(&v);
415                 cobalt_newton_step(&v);
416                 cobalt_newton_step(&v);
417
418                 cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt;
419         }
420 }
421
422 static void cobalt_vars_init(struct cobalt_vars *vars)
423 {
424         memset(vars, 0, sizeof(*vars));
425
426         if (!cobalt_rec_inv_sqrt_cache[0]) {
427                 cobalt_cache_init();
428                 cobalt_rec_inv_sqrt_cache[0] = ~0;
429         }
430 }
431
432 /* CoDel control_law is t + interval/sqrt(count)
433  * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
434  * both sqrt() and divide operation.
435  */
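/* reciprocal_scale(interval, rec_inv_sqrt) computes
 * ((u64)interval * rec_inv_sqrt) >> 32; since rec_inv_sqrt ~= 2^32/sqrt(count)
 * this approximates interval/sqrt(count) without a sqrt or divide.
 */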
436 static ktime_t cobalt_control(ktime_t t,
437                               u64 interval,
438                               u32 rec_inv_sqrt)
439 {
440         return ktime_add_ns(t, reciprocal_scale(interval,
441                                                 rec_inv_sqrt));
442 }
443
444 /* Call this when a packet had to be dropped due to queue overflow.  Returns
445  * true if the BLUE state was quiescent before but active after this call.
446  */
447 static bool cobalt_queue_full(struct cobalt_vars *vars,
448                               struct cobalt_params *p,
449                               ktime_t now)
450 {
451         bool up = false;
452
453         if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
454                 up = !vars->p_drop;
455                 vars->p_drop += p->p_inc;
456                 if (vars->p_drop < p->p_inc)
457                         vars->p_drop = ~0;
458                 vars->blue_timer = now;
459         }
460         vars->dropping = true;
461         vars->drop_next = now;
462         if (!vars->count)
463                 vars->count = 1;
464
465         return up;
466 }
467
468 /* Call this when the queue was serviced but turned out to be empty.  Returns
469  * true if the BLUE state was active before but quiescent after this call.
470  */
471 static bool cobalt_queue_empty(struct cobalt_vars *vars,
472                                struct cobalt_params *p,
473                                ktime_t now)
474 {
475         bool down = false;
476
477         if (vars->p_drop &&
478             ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
479                 if (vars->p_drop < p->p_dec)
480                         vars->p_drop = 0;
481                 else
482                         vars->p_drop -= p->p_dec;
483                 vars->blue_timer = now;
484                 down = !vars->p_drop;
485         }
486         vars->dropping = false;
487
488         if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
489                 vars->count--;
490                 cobalt_invsqrt(vars);
491                 vars->drop_next = cobalt_control(vars->drop_next,
492                                                  p->interval,
493                                                  vars->rec_inv_sqrt);
494         }
495
496         return down;
497 }
498
499 /* Call this with a freshly dequeued packet for possible congestion marking.
500  * Returns true as an instruction to drop the packet, false for delivery.
501  */
502 static bool cobalt_should_drop(struct cobalt_vars *vars,
503                                struct cobalt_params *p,
504                                ktime_t now,
505                                struct sk_buff *skb,
506                                u32 bulk_flows)
507 {
508         bool next_due, over_target, drop = false;
509         ktime_t schedule;
510         u64 sojourn;
511
512 /* The 'schedule' variable records, in its sign, whether 'now' is before or
513  * after 'drop_next'.  This allows 'drop_next' to be updated before the next
514  * scheduling decision is actually branched, without destroying that
515  * information.  Similarly, the first 'schedule' value calculated is preserved
516  * in the boolean 'next_due'.
517  *
518  * As for 'drop_next', we take advantage of the fact that 'interval' is both
519  * the delay between first exceeding 'target' and the first signalling event,
520  * *and* the scaling factor for the signalling frequency.  It's therefore very
521  * natural to use a single mechanism for both purposes, and eliminates a
522  * significant amount of reference Codel's spaghetti code.  To help with this,
523  * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close
524  * as possible to 1.0 in fixed-point.
525  */
526
527         sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
528         schedule = ktime_sub(now, vars->drop_next);
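        /* Only treat the sojourn time as a standing queue if it also exceeds
         * a few MTU serialisation times (scaled by the number of bulk flows);
         * the intent appears to be keeping Codel quiet where transmission
         * time alone explains the delay, e.g. at very low shaped rates.
         */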
529         over_target = sojourn > p->target &&
530                       sojourn > p->mtu_time * bulk_flows * 2 &&
531                       sojourn > p->mtu_time * 4;
532         next_due = vars->count && ktime_to_ns(schedule) >= 0;
533
534         vars->ecn_marked = false;
535
536         if (over_target) {
537                 if (!vars->dropping) {
538                         vars->dropping = true;
539                         vars->drop_next = cobalt_control(now,
540                                                          p->interval,
541                                                          vars->rec_inv_sqrt);
542                 }
543                 if (!vars->count)
544                         vars->count = 1;
545         } else if (vars->dropping) {
546                 vars->dropping = false;
547         }
548
549         if (next_due && vars->dropping) {
550                 /* Use ECN mark if possible, otherwise drop */
551                 drop = !(vars->ecn_marked = INET_ECN_set_ce(skb));
552
553                 vars->count++;
554                 if (!vars->count)
555                         vars->count--;
556                 cobalt_invsqrt(vars);
557                 vars->drop_next = cobalt_control(vars->drop_next,
558                                                  p->interval,
559                                                  vars->rec_inv_sqrt);
560                 schedule = ktime_sub(now, vars->drop_next);
561         } else {
562                 while (next_due) {
563                         vars->count--;
564                         cobalt_invsqrt(vars);
565                         vars->drop_next = cobalt_control(vars->drop_next,
566                                                          p->interval,
567                                                          vars->rec_inv_sqrt);
568                         schedule = ktime_sub(now, vars->drop_next);
569                         next_due = vars->count && ktime_to_ns(schedule) >= 0;
570                 }
571         }
572
573         /* Simple BLUE implementation.  Lack of ECN is deliberate. */
574         if (vars->p_drop)
575                 drop |= (prandom_u32() < vars->p_drop);
576
577         /* Overload the drop_next field as an activity timeout */
578         if (!vars->count)
579                 vars->drop_next = ktime_add_ns(now, p->interval);
580         else if (ktime_to_ns(schedule) > 0 && !drop)
581                 vars->drop_next = now;
582
583         return drop;
584 }
585
586 static void cake_update_flowkeys(struct flow_keys *keys,
587                                  const struct sk_buff *skb)
588 {
589 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
590         struct nf_conntrack_tuple tuple = {};
591         bool rev = !skb->_nfct;
592
593         if (tc_skb_protocol(skb) != htons(ETH_P_IP))
594                 return;
595
596         if (!nf_ct_get_tuple_skb(&tuple, skb))
597                 return;
598
599         keys->addrs.v4addrs.src = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
600         keys->addrs.v4addrs.dst = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;
601
602         if (keys->ports.ports) {
603                 keys->ports.src = rev ? tuple.dst.u.all : tuple.src.u.all;
604                 keys->ports.dst = rev ? tuple.src.u.all : tuple.dst.u.all;
605         }
606 #endif
607 }
608
609 /* Cake's flow modes use overlapping bit patterns, so the dual-src and
610  * dual-dst tests below also match triple-isolate mode.
611  */
612
613 static bool cake_dsrc(int flow_mode)
614 {
615         return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC;
616 }
617
618 static bool cake_ddst(int flow_mode)
619 {
620         return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST;
621 }
622
623 static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
624                      int flow_mode)
625 {
626         u32 flow_hash = 0, srchost_hash, dsthost_hash;
627         u16 reduced_hash, srchost_idx, dsthost_idx;
628         struct flow_keys keys, host_keys;
629
630         if (unlikely(flow_mode == CAKE_FLOW_NONE))
631                 return 0;
632
633         skb_flow_dissect_flow_keys(skb, &keys,
634                                    FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
635
636         if (flow_mode & CAKE_FLOW_NAT_FLAG)
637                 cake_update_flowkeys(&keys, skb);
638
639         /* flow_hash_from_keys() sorts the addresses by value, so we have
640          * to preserve their order in a separate data structure to treat
641          * src and dst host addresses as independently selectable.
642          */
643         host_keys = keys;
644         host_keys.ports.ports     = 0;
645         host_keys.basic.ip_proto  = 0;
646         host_keys.keyid.keyid     = 0;
647         host_keys.tags.flow_label = 0;
648
649         switch (host_keys.control.addr_type) {
650         case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
651                 host_keys.addrs.v4addrs.src = 0;
652                 dsthost_hash = flow_hash_from_keys(&host_keys);
653                 host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
654                 host_keys.addrs.v4addrs.dst = 0;
655                 srchost_hash = flow_hash_from_keys(&host_keys);
656                 break;
657
658         case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
659                 memset(&host_keys.addrs.v6addrs.src, 0,
660                        sizeof(host_keys.addrs.v6addrs.src));
661                 dsthost_hash = flow_hash_from_keys(&host_keys);
662                 host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
663                 memset(&host_keys.addrs.v6addrs.dst, 0,
664                        sizeof(host_keys.addrs.v6addrs.dst));
665                 srchost_hash = flow_hash_from_keys(&host_keys);
666                 break;
667
668         default:
669                 dsthost_hash = 0;
670                 srchost_hash = 0;
671         }
672
673         /* This *must* be after the above switch, since as a
674          * side-effect it sorts the src and dst addresses.
675          */
676         if (flow_mode & CAKE_FLOW_FLOWS)
677                 flow_hash = flow_hash_from_keys(&keys);
678
679         if (!(flow_mode & CAKE_FLOW_FLOWS)) {
680                 if (flow_mode & CAKE_FLOW_SRC_IP)
681                         flow_hash ^= srchost_hash;
682
683                 if (flow_mode & CAKE_FLOW_DST_IP)
684                         flow_hash ^= dsthost_hash;
685         }
686
687         reduced_hash = flow_hash % CAKE_QUEUES;
688
689         /* set-associative hashing */
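        /* The CAKE_QUEUES (1024) queues form 128 sets of CAKE_SET_WAYS (8)
         * ways; below, outer_hash is the base index of the set and k walks
         * its ways.
         */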
690         /* fast path if no hash collision (direct lookup succeeds) */
691         if (likely(q->tags[reduced_hash] == flow_hash &&
692                    q->flows[reduced_hash].set)) {
693                 q->way_directs++;
694         } else {
695                 u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
696                 u32 outer_hash = reduced_hash - inner_hash;
697                 bool allocate_src = false;
698                 bool allocate_dst = false;
699                 u32 i, k;
700
701                 /* check if any active queue in the set is reserved for
702                  * this flow.
703                  */
704                 for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
705                      i++, k = (k + 1) % CAKE_SET_WAYS) {
706                         if (q->tags[outer_hash + k] == flow_hash) {
707                                 if (i)
708                                         q->way_hits++;
709
710                                 if (!q->flows[outer_hash + k].set) {
711                                         /* need to increment host refcnts */
712                                         allocate_src = cake_dsrc(flow_mode);
713                                         allocate_dst = cake_ddst(flow_mode);
714                                 }
715
716                                 goto found;
717                         }
718                 }
719
720                 /* no queue is reserved for this flow, look for an
721                  * empty one.
722                  */
723                 for (i = 0; i < CAKE_SET_WAYS;
724                          i++, k = (k + 1) % CAKE_SET_WAYS) {
725                         if (!q->flows[outer_hash + k].set) {
726                                 q->way_misses++;
727                                 allocate_src = cake_dsrc(flow_mode);
728                                 allocate_dst = cake_ddst(flow_mode);
729                                 goto found;
730                         }
731                 }
732
733                 /* With no empty queues, default to the original
734                  * queue, accept the collision, update the host tags.
735                  */
736                 q->way_collisions++;
737                 q->hosts[q->flows[reduced_hash].srchost].srchost_refcnt--;
738                 q->hosts[q->flows[reduced_hash].dsthost].dsthost_refcnt--;
739                 allocate_src = cake_dsrc(flow_mode);
740                 allocate_dst = cake_ddst(flow_mode);
741 found:
742                 /* reserve queue for future packets in same flow */
743                 reduced_hash = outer_hash + k;
744                 q->tags[reduced_hash] = flow_hash;
745
746                 if (allocate_src) {
747                         srchost_idx = srchost_hash % CAKE_QUEUES;
748                         inner_hash = srchost_idx % CAKE_SET_WAYS;
749                         outer_hash = srchost_idx - inner_hash;
750                         for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
751                                 i++, k = (k + 1) % CAKE_SET_WAYS) {
752                                 if (q->hosts[outer_hash + k].srchost_tag ==
753                                     srchost_hash)
754                                         goto found_src;
755                         }
756                         for (i = 0; i < CAKE_SET_WAYS;
757                                 i++, k = (k + 1) % CAKE_SET_WAYS) {
758                                 if (!q->hosts[outer_hash + k].srchost_refcnt)
759                                         break;
760                         }
761                         q->hosts[outer_hash + k].srchost_tag = srchost_hash;
762 found_src:
763                         srchost_idx = outer_hash + k;
764                         q->hosts[srchost_idx].srchost_refcnt++;
765                         q->flows[reduced_hash].srchost = srchost_idx;
766                 }
767
768                 if (allocate_dst) {
769                         dsthost_idx = dsthost_hash % CAKE_QUEUES;
770                         inner_hash = dsthost_idx % CAKE_SET_WAYS;
771                         outer_hash = dsthost_idx - inner_hash;
772                         for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
773                              i++, k = (k + 1) % CAKE_SET_WAYS) {
774                                 if (q->hosts[outer_hash + k].dsthost_tag ==
775                                     dsthost_hash)
776                                         goto found_dst;
777                         }
778                         for (i = 0; i < CAKE_SET_WAYS;
779                              i++, k = (k + 1) % CAKE_SET_WAYS) {
780                                 if (!q->hosts[outer_hash + k].dsthost_refcnt)
781                                         break;
782                         }
783                         q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
784 found_dst:
785                         dsthost_idx = outer_hash + k;
786                         q->hosts[dsthost_idx].dsthost_refcnt++;
787                         q->flows[reduced_hash].dsthost = dsthost_idx;
788                 }
789         }
790
791         return reduced_hash;
792 }
793
794 /* helper functions: might be changed when/if skb uses a standard list_head */
795 /* remove one skb from head of slot queue */
796
797 static struct sk_buff *dequeue_head(struct cake_flow *flow)
798 {
799         struct sk_buff *skb = flow->head;
800
801         if (skb) {
802                 flow->head = skb->next;
803                 skb->next = NULL;
804         }
805
806         return skb;
807 }
808
809 /* add skb to flow queue (tail add) */
810
811 static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
812 {
813         if (!flow->head)
814                 flow->head = skb;
815         else
816                 flow->tail->next = skb;
817         flow->tail = skb;
818         skb->next = NULL;
819 }
820
821 static struct iphdr *cake_get_iphdr(const struct sk_buff *skb,
822                                     struct ipv6hdr *buf)
823 {
824         unsigned int offset = skb_network_offset(skb);
825         struct iphdr *iph;
826
827         iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf);
828
829         if (!iph)
830                 return NULL;
831
832         if (iph->version == 4 && iph->protocol == IPPROTO_IPV6)
833                 return skb_header_pointer(skb, offset + iph->ihl * 4,
834                                           sizeof(struct ipv6hdr), buf);
835
836         else if (iph->version == 4)
837                 return iph;
838
839         else if (iph->version == 6)
840                 return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr),
841                                           buf);
842
843         return NULL;
844 }
845
846 static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb,
847                                       void *buf, unsigned int bufsize)
848 {
849         unsigned int offset = skb_network_offset(skb);
850         const struct ipv6hdr *ipv6h;
851         const struct tcphdr *tcph;
852         const struct iphdr *iph;
853         struct ipv6hdr _ipv6h;
854         struct tcphdr _tcph;
855
856         ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
857
858         if (!ipv6h)
859                 return NULL;
860
861         if (ipv6h->version == 4) {
862                 iph = (struct iphdr *)ipv6h;
863                 offset += iph->ihl * 4;
864
865                 /* special-case 6in4 tunnelling, as that is a common way to get
866                  * v6 connectivity in the home
867                  */
868                 if (iph->protocol == IPPROTO_IPV6) {
869                         ipv6h = skb_header_pointer(skb, offset,
870                                                    sizeof(_ipv6h), &_ipv6h);
871
872                         if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
873                                 return NULL;
874
875                         offset += sizeof(struct ipv6hdr);
876
877                 } else if (iph->protocol != IPPROTO_TCP) {
878                         return NULL;
879                 }
880
881         } else if (ipv6h->version == 6) {
882                 if (ipv6h->nexthdr != IPPROTO_TCP)
883                         return NULL;
884
885                 offset += sizeof(struct ipv6hdr);
886         } else {
887                 return NULL;
888         }
889
890         tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
891         if (!tcph)
892                 return NULL;
893
894         return skb_header_pointer(skb, offset,
895                                   min(__tcp_hdrlen(tcph), bufsize), buf);
896 }
897
898 static const void *cake_get_tcpopt(const struct tcphdr *tcph,
899                                    int code, int *oplen)
900 {
901         /* inspired by tcp_parse_options in tcp_input.c */
902         int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
903         const u8 *ptr = (const u8 *)(tcph + 1);
904
905         while (length > 0) {
906                 int opcode = *ptr++;
907                 int opsize;
908
909                 if (opcode == TCPOPT_EOL)
910                         break;
911                 if (opcode == TCPOPT_NOP) {
912                         length--;
913                         continue;
914                 }
915                 opsize = *ptr++;
916                 if (opsize < 2 || opsize > length)
917                         break;
918
919                 if (opcode == code) {
920                         *oplen = opsize;
921                         return ptr;
922                 }
923
924                 ptr += opsize - 2;
925                 length -= opsize;
926         }
927
928         return NULL;
929 }
930
931 /* Compare two SACK sequences. A sequence is considered greater if it SACKs more
932  * bytes than the other. In the case where both sequences ACK bytes that
933  * the other doesn't, A is considered greater. DSACKs in A also make A
934  * considered greater.
935  *
936  * @return -1, 0 or 1 as normal compare functions
937  */
938 static int cake_tcph_sack_compare(const struct tcphdr *tcph_a,
939                                   const struct tcphdr *tcph_b)
940 {
941         const struct tcp_sack_block_wire *sack_a, *sack_b;
942         u32 ack_seq_a = ntohl(tcph_a->ack_seq);
943         u32 bytes_a = 0, bytes_b = 0;
944         int oplen_a, oplen_b;
945         bool first = true;
946
947         sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a);
948         sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b);
949
950         /* pointers point to option contents */
951         oplen_a -= TCPOLEN_SACK_BASE;
952         oplen_b -= TCPOLEN_SACK_BASE;
953
954         if (sack_a && oplen_a >= sizeof(*sack_a) &&
955             (!sack_b || oplen_b < sizeof(*sack_b)))
956                 return -1;
957         else if (sack_b && oplen_b >= sizeof(*sack_b) &&
958                  (!sack_a || oplen_a < sizeof(*sack_a)))
959                 return 1;
960         else if ((!sack_a || oplen_a < sizeof(*sack_a)) &&
961                  (!sack_b || oplen_b < sizeof(*sack_b)))
962                 return 0;
963
964         while (oplen_a >= sizeof(*sack_a)) {
965                 const struct tcp_sack_block_wire *sack_tmp = sack_b;
966                 u32 start_a = get_unaligned_be32(&sack_a->start_seq);
967                 u32 end_a = get_unaligned_be32(&sack_a->end_seq);
968                 int oplen_tmp = oplen_b;
969                 bool found = false;
970
971                 /* DSACK; always considered greater to prevent dropping */
972                 if (before(start_a, ack_seq_a))
973                         return -1;
974
975                 bytes_a += end_a - start_a;
976
977                 while (oplen_tmp >= sizeof(*sack_tmp)) {
978                         u32 start_b = get_unaligned_be32(&sack_tmp->start_seq);
979                         u32 end_b = get_unaligned_be32(&sack_tmp->end_seq);
980
981                         /* first time through we count the total size */
982                         if (first)
983                                 bytes_b += end_b - start_b;
984
985                         if (!after(start_b, start_a) && !before(end_b, end_a)) {
986                                 found = true;
987                                 if (!first)
988                                         break;
989                         }
990                         oplen_tmp -= sizeof(*sack_tmp);
991                         sack_tmp++;
992                 }
993
994                 if (!found)
995                         return -1;
996
997                 oplen_a -= sizeof(*sack_a);
998                 sack_a++;
999                 first = false;
1000         }
1001
1002         /* If we made it this far, all ranges SACKed by A are covered by B, so
1003          * either the SACKs are equal, or B SACKs more bytes.
1004          */
1005         return bytes_b > bytes_a ? 1 : 0;
1006 }
1007
1008 static void cake_tcph_get_tstamp(const struct tcphdr *tcph,
1009                                  u32 *tsval, u32 *tsecr)
1010 {
1011         const u8 *ptr;
1012         int opsize;
1013
1014         ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize);
1015
1016         if (ptr && opsize == TCPOLEN_TIMESTAMP) {
1017                 *tsval = get_unaligned_be32(ptr);
1018                 *tsecr = get_unaligned_be32(ptr + 4);
1019         }
1020 }
1021
1022 static bool cake_tcph_may_drop(const struct tcphdr *tcph,
1023                                u32 tstamp_new, u32 tsecr_new)
1024 {
1025         /* inspired by tcp_parse_options in tcp_input.c */
1026         int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1027         const u8 *ptr = (const u8 *)(tcph + 1);
1028         u32 tstamp, tsecr;
1029
1030         /* 3 reserved flags must be unset to avoid future breakage
1031          * ACK must be set
1032          * ECE/CWR are handled separately
1033          * All other flags URG/PSH/RST/SYN/FIN must be unset
1034          * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero)
1035          * 0x00C00000 = CWR/ECE (handled separately)
1036          * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000
1037          */
1038         if (((tcp_flag_word(tcph) &
1039               cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK))
1040                 return false;
1041
1042         while (length > 0) {
1043                 int opcode = *ptr++;
1044                 int opsize;
1045
1046                 if (opcode == TCPOPT_EOL)
1047                         break;
1048                 if (opcode == TCPOPT_NOP) {
1049                         length--;
1050                         continue;
1051                 }
1052                 opsize = *ptr++;
1053                 if (opsize < 2 || opsize > length)
1054                         break;
1055
1056                 switch (opcode) {
1057                 case TCPOPT_MD5SIG: /* doesn't influence state */
1058                         break;
1059
1060                 case TCPOPT_SACK: /* stricter checking performed later */
1061                         if (opsize % 8 != 2)
1062                                 return false;
1063                         break;
1064
1065                 case TCPOPT_TIMESTAMP:
1066                         /* only drop timestamps lower than new */
1067                         if (opsize != TCPOLEN_TIMESTAMP)
1068                                 return false;
1069                         tstamp = get_unaligned_be32(ptr);
1070                         tsecr = get_unaligned_be32(ptr + 4);
1071                         if (after(tstamp, tstamp_new) ||
1072                             after(tsecr, tsecr_new))
1073                                 return false;
1074                         break;
1075
1076                 case TCPOPT_MSS:  /* these should only be set on SYN */
1077                 case TCPOPT_WINDOW:
1078                 case TCPOPT_SACK_PERM:
1079                 case TCPOPT_FASTOPEN:
1080                 case TCPOPT_EXP:
1081                 default: /* don't drop if any unknown options are present */
1082                         return false;
1083                 }
1084
1085                 ptr += opsize - 2;
1086                 length -= opsize;
1087         }
1088
1089         return true;
1090 }
1091
1092 static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
1093                                        struct cake_flow *flow)
1094 {
1095         bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE;
1096         struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL;
1097         struct sk_buff *skb_check, *skb_prev = NULL;
1098         const struct ipv6hdr *ipv6h, *ipv6h_check;
1099         unsigned char _tcph[64], _tcph_check[64];
1100         const struct tcphdr *tcph, *tcph_check;
1101         const struct iphdr *iph, *iph_check;
1102         struct ipv6hdr _iph, _iph_check;
1103         const struct sk_buff *skb;
1104         int seglen, num_found = 0;
1105         u32 tstamp = 0, tsecr = 0;
1106         __be32 elig_flags = 0;
1107         int sack_comp;
1108
1109         /* no other possible ACKs to filter */
1110         if (flow->head == flow->tail)
1111                 return NULL;
1112
1113         skb = flow->tail;
1114         tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph));
1115         iph = cake_get_iphdr(skb, &_iph);
1116         if (!tcph)
1117                 return NULL;
1118
1119         cake_tcph_get_tstamp(tcph, &tstamp, &tsecr);
1120
1121         /* the 'triggering' packet need only have the ACK flag set.
1122          * also check that SYN is not set, as there won't be any previous ACKs.
1123          */
1124         if ((tcp_flag_word(tcph) &
1125              (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK)
1126                 return NULL;
1127
1128         /* the 'triggering' ACK is at the tail of the queue, we have already
1129          * returned if it is the only packet in the flow. loop through the rest
1130          * of the queue looking for pure ACKs with the same 5-tuple as the
1131          * triggering one.
1132          */
1133         for (skb_check = flow->head;
1134              skb_check && skb_check != skb;
1135              skb_prev = skb_check, skb_check = skb_check->next) {
1136                 iph_check = cake_get_iphdr(skb_check, &_iph_check);
1137                 tcph_check = cake_get_tcphdr(skb_check, &_tcph_check,
1138                                              sizeof(_tcph_check));
1139
1140                 /* only TCP packets with matching 5-tuple are eligible, and only
1141                  * drop safe headers
1142                  */
1143                 if (!tcph_check || iph->version != iph_check->version ||
1144                     tcph_check->source != tcph->source ||
1145                     tcph_check->dest != tcph->dest)
1146                         continue;
1147
1148                 if (iph_check->version == 4) {
1149                         if (iph_check->saddr != iph->saddr ||
1150                             iph_check->daddr != iph->daddr)
1151                                 continue;
1152
1153                         seglen = ntohs(iph_check->tot_len) -
1154                                        (4 * iph_check->ihl);
1155                 } else if (iph_check->version == 6) {
1156                         ipv6h = (struct ipv6hdr *)iph;
1157                         ipv6h_check = (struct ipv6hdr *)iph_check;
1158
1159                         if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
1160                             ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
1161                                 continue;
1162
1163                         seglen = ntohs(ipv6h_check->payload_len);
1164                 } else {
1165                         WARN_ON(1);  /* shouldn't happen */
1166                         continue;
1167                 }
1168
1169                 /* If the ECE/CWR flags changed from the previous eligible
1170                  * packet in the same flow, we should no longer be dropping that
1171                  * previous packet as this would lose information.
1172                  */
1173                 if (elig_ack && (tcp_flag_word(tcph_check) &
1174                                  (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) {
1175                         elig_ack = NULL;
1176                         elig_ack_prev = NULL;
1177                         num_found--;
1178                 }
1179
1180                 /* Check TCP options and flags, don't drop ACKs with segment
1181                  * data, and don't drop ACKs with a higher cumulative ACK
1182                  * counter than the triggering packet. Check ACK seqno here to
1183                  * avoid parsing SACK options of packets we are going to exclude
1184                  * anyway.
1185                  */
1186                 if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) ||
1187                     (seglen - __tcp_hdrlen(tcph_check)) != 0 ||
1188                     after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq)))
1189                         continue;
1190
1191                 /* Check SACK options. The triggering packet must SACK more data
1192                  * than the ACK under consideration, or SACK the same range but
1193                  * have a larger cumulative ACK counter. The latter is a
1194                  * pathological case, but is contained in the following check
1195                  * anyway, just to be safe.
1196                  */
1197                 sack_comp = cake_tcph_sack_compare(tcph_check, tcph);
1198
1199                 if (sack_comp < 0 ||
1200                     (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) &&
1201                      sack_comp == 0))
1202                         continue;
1203
1204                 /* At this point we have found an eligible pure ACK to drop; if
1205                  * we are in aggressive mode, we are done. Otherwise, keep
1206                  * searching unless this is the second eligible ACK we
1207                  * found.
1208                  *
1209          * Since we want to drop the ACK closest to the head of the queue,
1210                  * save the first eligible ACK we find, even if we need to loop
1211                  * again.
1212                  */
1213                 if (!elig_ack) {
1214                         elig_ack = skb_check;
1215                         elig_ack_prev = skb_prev;
1216                         elig_flags = (tcp_flag_word(tcph_check)
1217                                       & (TCP_FLAG_ECE | TCP_FLAG_CWR));
1218                 }
1219
1220                 if (num_found++ > 0)
1221                         goto found;
1222         }
1223
1224         /* We made it through the queue without finding two eligible ACKs. If
1225          * we found a single eligible ACK we can drop it in aggressive mode if
1226          * we can guarantee that this does not interfere with ECN flag
1227          * information. We ensure this by dropping it only if the enqueued
1228          * packet is consecutive with the eligible ACK, and their flags match.
1229          */
1230         if (elig_ack && aggressive && elig_ack->next == skb &&
1231             (elig_flags == (tcp_flag_word(tcph) &
1232                             (TCP_FLAG_ECE | TCP_FLAG_CWR))))
1233                 goto found;
1234
1235         return NULL;
1236
1237 found:
1238         if (elig_ack_prev)
1239                 elig_ack_prev->next = elig_ack->next;
1240         else
1241                 flow->head = elig_ack->next;
1242
1243         elig_ack->next = NULL;
1244
1245         return elig_ack;
1246 }
1247
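/* Exponentially weighted moving average with weight 2^-shift:
 * avg' = avg - avg/2^shift + sample/2^shift.
 */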
1248 static u64 cake_ewma(u64 avg, u64 sample, u32 shift)
1249 {
1250         avg -= avg >> shift;
1251         avg += sample >> shift;
1252         return avg;
1253 }
1254
1255 static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off)
1256 {
1257         if (q->rate_flags & CAKE_FLAG_OVERHEAD)
1258                 len -= off;
1259
1260         if (q->max_netlen < len)
1261                 q->max_netlen = len;
1262         if (q->min_netlen > len)
1263                 q->min_netlen = len;
1264
1265         len += q->rate_overhead;
1266
1267         if (len < q->rate_mpu)
1268                 len = q->rate_mpu;
1269
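        /* ATM/AAL5 carries 48 payload bytes in each 53-byte cell, so round
         * the length up to a whole number of cells: e.g. 100 bytes -> 3 cells
         * -> 159 bytes on the wire.
         */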
1270         if (q->atm_mode == CAKE_ATM_ATM) {
1271                 len += 47;
1272                 len /= 48;
1273                 len *= 53;
1274         } else if (q->atm_mode == CAKE_ATM_PTM) {
1275                 /* Add one byte per 64 bytes or part thereof.
1276                  * This is conservative and easier to calculate than the
1277                  * precise value.
1278                  */
1279                 len += (len + 63) / 64;
1280         }
1281
1282         if (q->max_adjlen < len)
1283                 q->max_adjlen = len;
1284         if (q->min_adjlen > len)
1285                 q->min_adjlen = len;
1286
1287         return len;
1288 }
1289
1290 static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
1291 {
1292         const struct skb_shared_info *shinfo = skb_shinfo(skb);
1293         unsigned int hdr_len, last_len = 0;
1294         u32 off = skb_network_offset(skb);
1295         u32 len = qdisc_pkt_len(skb);
1296         u16 segs = 1;
1297
1298         q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);
1299
1300         if (!shinfo->gso_size)
1301                 return cake_calc_overhead(q, len, off);
1302
1303         /* borrowed from qdisc_pkt_len_init() */
1304         hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
1305
1306         /* + transport layer */
1307         if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
1308                                                 SKB_GSO_TCPV6))) {
1309                 const struct tcphdr *th;
1310                 struct tcphdr _tcphdr;
1311
1312                 th = skb_header_pointer(skb, skb_transport_offset(skb),
1313                                         sizeof(_tcphdr), &_tcphdr);
1314                 if (likely(th))
1315                         hdr_len += __tcp_hdrlen(th);
1316         } else {
1317                 struct udphdr _udphdr;
1318
1319                 if (skb_header_pointer(skb, skb_transport_offset(skb),
1320                                        sizeof(_udphdr), &_udphdr))
1321                         hdr_len += sizeof(struct udphdr);
1322         }
1323
1324         if (unlikely(shinfo->gso_type & SKB_GSO_DODGY))
1325                 segs = DIV_ROUND_UP(skb->len - hdr_len,
1326                                     shinfo->gso_size);
1327         else
1328                 segs = shinfo->gso_segs;
1329
1330         len = shinfo->gso_size + hdr_len;
1331         last_len = skb->len - shinfo->gso_size * (segs - 1);
1332
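        /* charge overhead per segment: (segs - 1) full-size segments plus
         * the final, possibly shorter, one.
         */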
1333         return (cake_calc_overhead(q, len, off) * (segs - 1) +
1334                 cake_calc_overhead(q, last_len, off));
1335 }
1336
1337 static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
1338 {
1339         struct cake_heap_entry ii = q->overflow_heap[i];
1340         struct cake_heap_entry jj = q->overflow_heap[j];
1341
1342         q->overflow_heap[i] = jj;
1343         q->overflow_heap[j] = ii;
1344
1345         q->tins[ii.t].overflow_idx[ii.b] = j;
1346         q->tins[jj.t].overflow_idx[jj.b] = i;
1347 }
1348
1349 static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
1350 {
1351         struct cake_heap_entry ii = q->overflow_heap[i];
1352
1353         return q->tins[ii.t].backlogs[ii.b];
1354 }
1355
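/* Sift the entry at index i down the binary max-heap, which is keyed on
 * per-queue backlog, so that the root always names the longest queue for
 * cake_drop() to prune first.
 */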
1356 static void cake_heapify(struct cake_sched_data *q, u16 i)
1357 {
1358         static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
1359         u32 mb = cake_heap_get_backlog(q, i);
1360         u32 m = i;
1361
1362         while (m < a) {
1363                 u32 l = m + m + 1;
1364                 u32 r = l + 1;
1365
1366                 if (l < a) {
1367                         u32 lb = cake_heap_get_backlog(q, l);
1368
1369                         if (lb > mb) {
1370                                 m  = l;
1371                                 mb = lb;
1372                         }
1373                 }
1374
1375                 if (r < a) {
1376                         u32 rb = cake_heap_get_backlog(q, r);
1377
1378                         if (rb > mb) {
1379                                 m  = r;
1380                                 mb = rb;
1381                         }
1382                 }
1383
1384                 if (m != i) {
1385                         cake_heap_swap(q, i, m);
1386                         i = m;
1387                 } else {
1388                         break;
1389                 }
1390         }
1391 }
1392
1393 static void cake_heapify_up(struct cake_sched_data *q, u16 i)
1394 {
1395         while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
1396                 u16 p = (i - 1) >> 1;
1397                 u32 ib = cake_heap_get_backlog(q, i);
1398                 u32 pb = cake_heap_get_backlog(q, p);
1399
1400                 if (ib > pb) {
1401                         cake_heap_swap(q, i, p);
1402                         i = p;
1403                 } else {
1404                         break;
1405                 }
1406         }
1407 }
1408
1409 static int cake_advance_shaper(struct cake_sched_data *q,
1410                                struct cake_tin_data *b,
1411                                struct sk_buff *skb,
1412                                ktime_t now, bool drop)
1413 {
1414         u32 len = get_cobalt_cb(skb)->adjusted_len;
1415
1416         /* charge packet bandwidth to this tin
1417          * and to the global shaper.
1418          */
1419         if (q->rate_ns) {
1420                 u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft;
1421                 u64 global_dur = (len * q->rate_ns) >> q->rate_shft;
1422                 u64 failsafe_dur = global_dur + (global_dur >> 1);
1423
1424                 if (ktime_before(b->time_next_packet, now))
1425                         b->time_next_packet = ktime_add_ns(b->time_next_packet,
1426                                                            tin_dur);
1427
1428                 else if (ktime_before(b->time_next_packet,
1429                                       ktime_add_ns(now, tin_dur)))
1430                         b->time_next_packet = ktime_add_ns(now, tin_dur);
1431
1432                 q->time_next_packet = ktime_add_ns(q->time_next_packet,
1433                                                    global_dur);
1434                 if (!drop)
1435                         q->failsafe_next_packet = \
1436                                 ktime_add_ns(q->failsafe_next_packet,
1437                                              failsafe_dur);
1438         }
1439         return len;
1440 }
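/* Shaper arithmetic, with illustrative numbers: len is the overhead-adjusted
 * length from cake_overhead(), and rate_ns/rate_shft encode a fixed-point
 * "nanoseconds per byte" (see cake_set_rate() below).  At 10 Mbit/s the base
 * rate is 1,250,000 bytes/s, i.e. 800 ns per byte, so a 1514-byte packet
 * charges roughly 1.2 ms to both the tin clock and the global clock.  The
 * failsafe clock is only advanced for delivered packets and gets 1.5x the
 * global charge, so it can never wedge the qdisc shut even if
 * time_next_packet is mis-tracked.
 */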
1441
1442 static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
1443 {
1444         struct cake_sched_data *q = qdisc_priv(sch);
1445         ktime_t now = ktime_get();
1446         u32 idx = 0, tin = 0, len;
1447         struct cake_heap_entry qq;
1448         struct cake_tin_data *b;
1449         struct cake_flow *flow;
1450         struct sk_buff *skb;
1451
1452         if (!q->overflow_timeout) {
1453                 int i;
1454                 /* Build fresh max-heap */
1455                 for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--)
1456                         cake_heapify(q, i);
1457         }
1458         q->overflow_timeout = 65535;
1459
1460         /* select longest queue for pruning */
1461         qq  = q->overflow_heap[0];
1462         tin = qq.t;
1463         idx = qq.b;
1464
1465         b = &q->tins[tin];
1466         flow = &b->flows[idx];
1467         skb = dequeue_head(flow);
1468         if (unlikely(!skb)) {
1469                 /* heap has gone wrong, rebuild it next time */
1470                 q->overflow_timeout = 0;
1471                 return idx + (tin << 16);
1472         }
1473
1474         if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
1475                 b->unresponsive_flow_count++;
1476
1477         len = qdisc_pkt_len(skb);
1478         q->buffer_used      -= skb->truesize;
1479         b->backlogs[idx]    -= len;
1480         b->tin_backlog      -= len;
1481         sch->qstats.backlog -= len;
1482         qdisc_tree_reduce_backlog(sch, 1, len);
1483
1484         flow->dropped++;
1485         b->tin_dropped++;
1486         sch->qstats.drops++;
1487
1488         if (q->rate_flags & CAKE_FLAG_INGRESS)
1489                 cake_advance_shaper(q, b, skb, now, true);
1490
1491         __qdisc_drop(skb, to_free);
1492         sch->q.qlen--;
1493
1494         cake_heapify(q, 0);
1495
1496         return idx + (tin << 16);
1497 }
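/* Note on the overflow handling above: q->overflow_timeout doubles as a
 * staleness marker.  When it is zero the heap has not been maintained, so
 * cake_drop() rebuilds it bottom-up (Floyd's method, sifting down from index
 * CAKE_MAX_TINS * CAKE_QUEUES / 2 towards 0) before trusting
 * overflow_heap[0].  It is then set to 65535 and decremented once per
 * dequeue, so the heap is only kept up to date while drops remain plausible.
 * The return value packs the victim's flow index in the low 16 bits and its
 * tin in the bits above.
 */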
1498
1499 static void cake_wash_diffserv(struct sk_buff *skb)
1500 {
1501         switch (skb->protocol) {
1502         case htons(ETH_P_IP):
1503                 ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
1504                 break;
1505         case htons(ETH_P_IPV6):
1506                 ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
1507                 break;
1508         default:
1509                 break;
1510         }
1511 }
1512
1513 static u8 cake_handle_diffserv(struct sk_buff *skb, u16 wash)
1514 {
1515         u8 dscp;
1516
1517         switch (skb->protocol) {
1518         case htons(ETH_P_IP):
1519                 dscp = ipv4_get_dsfield(ip_hdr(skb)) >> 2;
1520                 if (wash && dscp)
1521                         ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
1522                 return dscp;
1523
1524         case htons(ETH_P_IPV6):
1525                 dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2;
1526                 if (wash && dscp)
1527                         ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
1528                 return dscp;
1529
1530         case htons(ETH_P_ARP):
1531                 return 0x38;  /* CS7 - Net Control */
1532
1533         default:
1534                 /* If there is no Diffserv field, treat as best-effort */
1535                 return 0;
1536         }
1537 }
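/* DSCP extraction example (illustrative): the DSCP is the top six bits of
 * the IPv4 TOS / IPv6 Traffic Class octet, hence the ">> 2" above.  An EF
 * packet carries dsfield 0xb8, so 0xb8 >> 2 = 0x2e (46, EF).  When washing,
 * ipv4/ipv6_change_dsfield() is called with INET_ECN_MASK as the preserve
 * mask, so the two ECN bits survive while the six DSCP bits are zeroed.
 * ARP has no Diffserv field at all and is simply treated as CS7 (0x38).
 */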
1538
1539 static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
1540                                              struct sk_buff *skb)
1541 {
1542         struct cake_sched_data *q = qdisc_priv(sch);
1543         u32 tin;
1544
1545         if (TC_H_MAJ(skb->priority) == sch->handle &&
1546             TC_H_MIN(skb->priority) > 0 &&
1547             TC_H_MIN(skb->priority) <= q->tin_cnt) {
1548                 tin = q->tin_order[TC_H_MIN(skb->priority) - 1];
1549
1550                 if (q->rate_flags & CAKE_FLAG_WASH)
1551                         cake_wash_diffserv(skb);
1552         } else if (q->tin_mode != CAKE_DIFFSERV_BESTEFFORT) {
1553                 /* extract the Diffserv Precedence field, if it exists */
1554                 /* and clear DSCP bits if washing */
1555                 tin = q->tin_index[cake_handle_diffserv(skb,
1556                                 q->rate_flags & CAKE_FLAG_WASH)];
1557                 if (unlikely(tin >= q->tin_cnt))
1558                         tin = 0;
1559         } else {
1560                 tin = 0;
1561                 if (q->rate_flags & CAKE_FLAG_WASH)
1562                         cake_wash_diffserv(skb);
1563         }
1564
1565         return &q->tins[tin];
1566 }
1567
1568 static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
1569                          struct sk_buff *skb, int flow_mode, int *qerr)
1570 {
1571         struct cake_sched_data *q = qdisc_priv(sch);
1572         struct tcf_proto *filter;
1573         struct tcf_result res;
1574         u32 flow = 0;
1575         int result;
1576
1577         filter = rcu_dereference_bh(q->filter_list);
1578         if (!filter)
1579                 goto hash;
1580
1581         *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
1582         result = tcf_classify(skb, filter, &res, false);
1583
1584         if (result >= 0) {
1585 #ifdef CONFIG_NET_CLS_ACT
1586                 switch (result) {
1587                 case TC_ACT_STOLEN:
1588                 case TC_ACT_QUEUED:
1589                 case TC_ACT_TRAP:
1590                         *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
1591                         /* fall through */
1592                 case TC_ACT_SHOT:
1593                         return 0;
1594                 }
1595 #endif
1596                 if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
1597                         flow = TC_H_MIN(res.classid);
1598         }
1599 hash:
1600         *t = cake_select_tin(sch, skb);
1601         return flow ?: cake_hash(*t, skb, flow_mode) + 1;
1602 }
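/* Return-value convention for cake_classify(): 0 means the packet was
 * stolen/trapped/shot by a tc filter and must not be enqueued; otherwise the
 * value is a 1-based queue index - either the filter-selected minor classid
 * (1..CAKE_QUEUES) or cake_hash() + 1 - which cake_enqueue() decrements back
 * to a 0-based flow index.
 */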
1603
1604 static void cake_reconfigure(struct Qdisc *sch);
1605
1606 static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
1607                         struct sk_buff **to_free)
1608 {
1609         struct cake_sched_data *q = qdisc_priv(sch);
1610         int len = qdisc_pkt_len(skb);
1611         int uninitialized_var(ret);
1612         struct sk_buff *ack = NULL;
1613         ktime_t now = ktime_get();
1614         struct cake_tin_data *b;
1615         struct cake_flow *flow;
1616         u32 idx;
1617
1618         /* choose flow to insert into */
1619         idx = cake_classify(sch, &b, skb, q->flow_mode, &ret);
1620         if (idx == 0) {
1621                 if (ret & __NET_XMIT_BYPASS)
1622                         qdisc_qstats_drop(sch);
1623                 __qdisc_drop(skb, to_free);
1624                 return ret;
1625         }
1626         idx--;
1627         flow = &b->flows[idx];
1628
1629         /* ensure shaper state isn't stale */
1630         if (!b->tin_backlog) {
1631                 if (ktime_before(b->time_next_packet, now))
1632                         b->time_next_packet = now;
1633
1634                 if (!sch->q.qlen) {
1635                         if (ktime_before(q->time_next_packet, now)) {
1636                                 q->failsafe_next_packet = now;
1637                                 q->time_next_packet = now;
1638                         } else if (ktime_after(q->time_next_packet, now) &&
1639                                    ktime_after(q->failsafe_next_packet, now)) {
1640                                 u64 next = \
1641                                         min(ktime_to_ns(q->time_next_packet),
1642                                             ktime_to_ns(
1643                                                    q->failsafe_next_packet));
1644                                 sch->qstats.overlimits++;
1645                                 qdisc_watchdog_schedule_ns(&q->watchdog, next);
1646                         }
1647                 }
1648         }
1649
1650         if (unlikely(len > b->max_skblen))
1651                 b->max_skblen = len;
1652
1653         if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
1654                 struct sk_buff *segs, *nskb;
1655                 netdev_features_t features = netif_skb_features(skb);
1656                 unsigned int slen = 0;
1657
1658                 segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
1659                 if (IS_ERR_OR_NULL(segs))
1660                         return qdisc_drop(skb, sch, to_free);
1661
1662                 while (segs) {
1663                         nskb = segs->next;
1664                         segs->next = NULL;
1665                         qdisc_skb_cb(segs)->pkt_len = segs->len;
1666                         cobalt_set_enqueue_time(segs, now);
1667                         get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
1668                                                                           segs);
1669                         flow_queue_add(flow, segs);
1670
1671                         sch->q.qlen++;
1672                         slen += segs->len;
1673                         q->buffer_used += segs->truesize;
1674                         b->packets++;
1675                         segs = nskb;
1676                 }
1677
1678                 /* stats */
1679                 b->bytes            += slen;
1680                 b->backlogs[idx]    += slen;
1681                 b->tin_backlog      += slen;
1682                 sch->qstats.backlog += slen;
1683                 q->avg_window_bytes += slen;
1684
1685                 qdisc_tree_reduce_backlog(sch, 1, len);
1686                 consume_skb(skb);
1687         } else {
1688                 /* not splitting */
1689                 cobalt_set_enqueue_time(skb, now);
1690                 get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
1691                 flow_queue_add(flow, skb);
1692
1693                 if (q->ack_filter)
1694                         ack = cake_ack_filter(q, flow);
1695
1696                 if (ack) {
1697                         b->ack_drops++;
1698                         sch->qstats.drops++;
1699                         b->bytes += qdisc_pkt_len(ack);
1700                         len -= qdisc_pkt_len(ack);
1701                         q->buffer_used += skb->truesize - ack->truesize;
1702                         if (q->rate_flags & CAKE_FLAG_INGRESS)
1703                                 cake_advance_shaper(q, b, ack, now, true);
1704
1705                         qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack));
1706                         consume_skb(ack);
1707                 } else {
1708                         sch->q.qlen++;
1709                         q->buffer_used      += skb->truesize;
1710                 }
1711
1712                 /* stats */
1713                 b->packets++;
1714                 b->bytes            += len;
1715                 b->backlogs[idx]    += len;
1716                 b->tin_backlog      += len;
1717                 sch->qstats.backlog += len;
1718                 q->avg_window_bytes += len;
1719         }
1720
1721         if (q->overflow_timeout)
1722                 cake_heapify_up(q, b->overflow_idx[idx]);
1723
1724         /* incoming bandwidth capacity estimate */
1725         if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
1726                 u64 packet_interval = \
1727                         ktime_to_ns(ktime_sub(now, q->last_packet_time));
1728
1729                 if (packet_interval > NSEC_PER_SEC)
1730                         packet_interval = NSEC_PER_SEC;
1731
1732                 /* filter out short-term bursts, eg. wifi aggregation */
1733                 q->avg_packet_interval = \
1734                         cake_ewma(q->avg_packet_interval,
1735                                   packet_interval,
1736                                   (packet_interval > q->avg_packet_interval ?
1737                                           2 : 8));
1738
1739                 q->last_packet_time = now;
1740
1741                 if (packet_interval > q->avg_packet_interval) {
1742                         u64 window_interval = \
1743                                 ktime_to_ns(ktime_sub(now,
1744                                                       q->avg_window_begin));
1745                         u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;
1746
1747                         do_div(b, window_interval);
1748                         q->avg_peak_bandwidth =
1749                                 cake_ewma(q->avg_peak_bandwidth, b,
1750                                           b > q->avg_peak_bandwidth ? 2 : 8);
1751                         q->avg_window_bytes = 0;
1752                         q->avg_window_begin = now;
1753
1754                         if (ktime_after(now,
1755                                         ktime_add_ms(q->last_reconfig_time,
1756                                                      250))) {
1757                                 q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
1758                                 cake_reconfigure(sch);
1759                         }
1760                 }
1761         } else {
1762                 q->avg_window_bytes = 0;
1763                 q->last_packet_time = now;
1764         }
1765
1766         /* flowchain */
1767         if (!flow->set || flow->set == CAKE_SET_DECAYING) {
1768                 struct cake_host *srchost = &b->hosts[flow->srchost];
1769                 struct cake_host *dsthost = &b->hosts[flow->dsthost];
1770                 u16 host_load = 1;
1771
1772                 if (!flow->set) {
1773                         list_add_tail(&flow->flowchain, &b->new_flows);
1774                 } else {
1775                         b->decaying_flow_count--;
1776                         list_move_tail(&flow->flowchain, &b->new_flows);
1777                 }
1778                 flow->set = CAKE_SET_SPARSE;
1779                 b->sparse_flow_count++;
1780
1781                 if (cake_dsrc(q->flow_mode))
1782                         host_load = max(host_load, srchost->srchost_refcnt);
1783
1784                 if (cake_ddst(q->flow_mode))
1785                         host_load = max(host_load, dsthost->dsthost_refcnt);
1786
1787                 flow->deficit = (b->flow_quantum *
1788                                  quantum_div[host_load]) >> 16;
1789         } else if (flow->set == CAKE_SET_SPARSE_WAIT) {
1790                 /* this flow was empty, accounted as a sparse flow, but actually
1791                  * in the bulk rotation.
1792                  */
1793                 flow->set = CAKE_SET_BULK;
1794                 b->sparse_flow_count--;
1795                 b->bulk_flow_count++;
1796         }
1797
1798         if (q->buffer_used > q->buffer_max_used)
1799                 q->buffer_max_used = q->buffer_used;
1800
1801         if (q->buffer_used > q->buffer_limit) {
1802                 u32 dropped = 0;
1803
1804                 while (q->buffer_used > q->buffer_limit) {
1805                         dropped++;
1806                         cake_drop(sch, to_free);
1807                 }
1808                 b->drop_overlimit += dropped;
1809         }
1810         return NET_XMIT_SUCCESS;
1811 }
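/* Autorate-ingress maths, with illustrative numbers: cake_ewma() (defined
 * earlier in this file) is a shift-based EWMA, so shift 2 reacts within a
 * few samples while shift 8 decays slowly.  A packet gap longer than the
 * smoothed gap marks the end of a burst; the bytes accumulated since
 * avg_window_begin divided by the elapsed window give a rate sample that is
 * folded into avg_peak_bandwidth.  At most every 250 ms the shaper is then
 * set to 15/16 of that estimate - e.g. a measured peak of 2,500,000 bytes/s
 * (20 Mbit/s) programs roughly 2,343,750 bytes/s (~18.75 Mbit/s), leaving
 * headroom so the queue builds here rather than upstream.
 */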
1812
1813 static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
1814 {
1815         struct cake_sched_data *q = qdisc_priv(sch);
1816         struct cake_tin_data *b = &q->tins[q->cur_tin];
1817         struct cake_flow *flow = &b->flows[q->cur_flow];
1818         struct sk_buff *skb = NULL;
1819         u32 len;
1820
1821         if (flow->head) {
1822                 skb = dequeue_head(flow);
1823                 len = qdisc_pkt_len(skb);
1824                 b->backlogs[q->cur_flow] -= len;
1825                 b->tin_backlog           -= len;
1826                 sch->qstats.backlog      -= len;
1827                 q->buffer_used           -= skb->truesize;
1828                 sch->q.qlen--;
1829
1830                 if (q->overflow_timeout)
1831                         cake_heapify(q, b->overflow_idx[q->cur_flow]);
1832         }
1833         return skb;
1834 }
1835
1836 /* Discard leftover packets from a tin no longer in use. */
1837 static void cake_clear_tin(struct Qdisc *sch, u16 tin)
1838 {
1839         struct cake_sched_data *q = qdisc_priv(sch);
1840         struct sk_buff *skb;
1841
1842         q->cur_tin = tin;
1843         for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
1844                 while (!!(skb = cake_dequeue_one(sch)))
1845                         kfree_skb(skb);
1846 }
1847
1848 static struct sk_buff *cake_dequeue(struct Qdisc *sch)
1849 {
1850         struct cake_sched_data *q = qdisc_priv(sch);
1851         struct cake_tin_data *b = &q->tins[q->cur_tin];
1852         struct cake_host *srchost, *dsthost;
1853         ktime_t now = ktime_get();
1854         struct cake_flow *flow;
1855         struct list_head *head;
1856         bool first_flow = true;
1857         struct sk_buff *skb;
1858         u16 host_load;
1859         u64 delay;
1860         u32 len;
1861
1862 begin:
1863         if (!sch->q.qlen)
1864                 return NULL;
1865
1866         /* global hard shaper */
1867         if (ktime_after(q->time_next_packet, now) &&
1868             ktime_after(q->failsafe_next_packet, now)) {
1869                 u64 next = min(ktime_to_ns(q->time_next_packet),
1870                                ktime_to_ns(q->failsafe_next_packet));
1871
1872                 sch->qstats.overlimits++;
1873                 qdisc_watchdog_schedule_ns(&q->watchdog, next);
1874                 return NULL;
1875         }
1876
1877         /* Choose a class to work on. */
1878         if (!q->rate_ns) {
1879                 /* In unlimited mode, can't rely on shaper timings, just balance
1880                  * with DRR
1881                  */
1882                 bool wrapped = false, empty = true;
1883
1884                 while (b->tin_deficit < 0 ||
1885                        !(b->sparse_flow_count + b->bulk_flow_count)) {
1886                         if (b->tin_deficit <= 0)
1887                                 b->tin_deficit += b->tin_quantum_band;
1888                         if (b->sparse_flow_count + b->bulk_flow_count)
1889                                 empty = false;
1890
1891                         q->cur_tin++;
1892                         b++;
1893                         if (q->cur_tin >= q->tin_cnt) {
1894                                 q->cur_tin = 0;
1895                                 b = q->tins;
1896
1897                                 if (wrapped) {
1898                                         /* It's possible for q->qlen to be
1899                                          * nonzero when we actually have no
1900                                          * packets anywhere.
1901                                          */
1902                                         if (empty)
1903                                                 return NULL;
1904                                 } else {
1905                                         wrapped = true;
1906                                 }
1907                         }
1908                 }
1909         } else {
1910                 /* In shaped mode, choose:
1911                  * - Highest-priority tin with queue and meeting schedule, or
1912                  * - The earliest-scheduled tin with queue.
1913                  */
1914                 ktime_t best_time = KTIME_MAX;
1915                 int tin, best_tin = 0;
1916
1917                 for (tin = 0; tin < q->tin_cnt; tin++) {
1918                         b = q->tins + tin;
1919                         if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
1920                                 ktime_t time_to_pkt = \
1921                                         ktime_sub(b->time_next_packet, now);
1922
1923                                 if (ktime_to_ns(time_to_pkt) <= 0 ||
1924                                     ktime_compare(time_to_pkt,
1925                                                   best_time) <= 0) {
1926                                         best_time = time_to_pkt;
1927                                         best_tin = tin;
1928                                 }
1929                         }
1930                 }
1931
1932                 q->cur_tin = best_tin;
1933                 b = q->tins + best_tin;
1934
1935                 /* No point in going further if no packets to deliver. */
1936                 if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
1937                         return NULL;
1938         }
1939
1940 retry:
1941         /* service this class */
1942         head = &b->decaying_flows;
1943         if (!first_flow || list_empty(head)) {
1944                 head = &b->new_flows;
1945                 if (list_empty(head)) {
1946                         head = &b->old_flows;
1947                         if (unlikely(list_empty(head))) {
1948                                 head = &b->decaying_flows;
1949                                 if (unlikely(list_empty(head)))
1950                                         goto begin;
1951                         }
1952                 }
1953         }
1954         flow = list_first_entry(head, struct cake_flow, flowchain);
1955         q->cur_flow = flow - b->flows;
1956         first_flow = false;
1957
1958         /* triple isolation (modified DRR++) */
1959         srchost = &b->hosts[flow->srchost];
1960         dsthost = &b->hosts[flow->dsthost];
1961         host_load = 1;
1962
1963         if (cake_dsrc(q->flow_mode))
1964                 host_load = max(host_load, srchost->srchost_refcnt);
1965
1966         if (cake_ddst(q->flow_mode))
1967                 host_load = max(host_load, dsthost->dsthost_refcnt);
1968
1969         WARN_ON(host_load > CAKE_QUEUES);
1970
1971         /* flow isolation (DRR++) */
1972         if (flow->deficit <= 0) {
1973                 /* The shifted prandom_u32() is a way to apply dithering to
1974                  * avoid accumulating roundoff errors
1975                  */
1976                 flow->deficit += (b->flow_quantum * quantum_div[host_load] +
1977                                   (prandom_u32() >> 16)) >> 16;
1978                 list_move_tail(&flow->flowchain, &b->old_flows);
1979
1980                 /* Keep all flows with deficits out of the sparse and decaying
1981                  * rotations.  No non-empty flow can go into the decaying
1982                  * rotation, so they can't get deficits
1983                  */
1984                 if (flow->set == CAKE_SET_SPARSE) {
1985                         if (flow->head) {
1986                                 b->sparse_flow_count--;
1987                                 b->bulk_flow_count++;
1988                                 flow->set = CAKE_SET_BULK;
1989                         } else {
1990                                 /* we've moved it to the bulk rotation for
1991                                  * correct deficit accounting but we still want
1992                                  * to count it as a sparse flow, not a bulk one.
1993                                  */
1994                                 flow->set = CAKE_SET_SPARSE_WAIT;
1995                         }
1996                 }
1997                 goto retry;
1998         }
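        /* Deficit dithering, with illustrative numbers: quantum_div[n] is
         * 65535 / n (see cake_init()), so the top-up above is roughly
         * flow_quantum / host_load in Q16.16 fixed point.  With a 1514-byte
         * quantum shared by host_load = 3 flows, 1514 * 21845 >> 16 is about
         * 504 bytes; adding prandom_u32() >> 16 (a random Q16 fraction below
         * one) before the final shift dithers the result between 504 and 505
         * so the truncation error does not accumulate in one direction.
         */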
1999
2000         /* Retrieve a packet via the AQM */
2001         while (1) {
2002                 skb = cake_dequeue_one(sch);
2003                 if (!skb) {
2004                         /* this queue was actually empty */
2005                         if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
2006                                 b->unresponsive_flow_count--;
2007
2008                         if (flow->cvars.p_drop || flow->cvars.count ||
2009                             ktime_before(now, flow->cvars.drop_next)) {
2010                                 /* keep in the flowchain until the state has
2011                                  * decayed to rest
2012                                  */
2013                                 list_move_tail(&flow->flowchain,
2014                                                &b->decaying_flows);
2015                                 if (flow->set == CAKE_SET_BULK) {
2016                                         b->bulk_flow_count--;
2017                                         b->decaying_flow_count++;
2018                                 } else if (flow->set == CAKE_SET_SPARSE ||
2019                                            flow->set == CAKE_SET_SPARSE_WAIT) {
2020                                         b->sparse_flow_count--;
2021                                         b->decaying_flow_count++;
2022                                 }
2023                                 flow->set = CAKE_SET_DECAYING;
2024                         } else {
2025                                 /* remove empty queue from the flowchain */
2026                                 list_del_init(&flow->flowchain);
2027                                 if (flow->set == CAKE_SET_SPARSE ||
2028                                     flow->set == CAKE_SET_SPARSE_WAIT)
2029                                         b->sparse_flow_count--;
2030                                 else if (flow->set == CAKE_SET_BULK)
2031                                         b->bulk_flow_count--;
2032                                 else
2033                                         b->decaying_flow_count--;
2034
2035                                 flow->set = CAKE_SET_NONE;
2036                                 srchost->srchost_refcnt--;
2037                                 dsthost->dsthost_refcnt--;
2038                         }
2039                         goto begin;
2040                 }
2041
2042                 /* Last packet in queue may be marked, shouldn't be dropped */
2043                 if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
2044                                         (b->bulk_flow_count *
2045                                          !!(q->rate_flags &
2046                                             CAKE_FLAG_INGRESS))) ||
2047                     !flow->head)
2048                         break;
2049
2050                 /* drop this packet, get another one */
2051                 if (q->rate_flags & CAKE_FLAG_INGRESS) {
2052                         len = cake_advance_shaper(q, b, skb,
2053                                                   now, true);
2054                         flow->deficit -= len;
2055                         b->tin_deficit -= len;
2056                 }
2057                 flow->dropped++;
2058                 b->tin_dropped++;
2059                 qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
2060                 qdisc_qstats_drop(sch);
2061                 kfree_skb(skb);
2062                 if (q->rate_flags & CAKE_FLAG_INGRESS)
2063                         goto retry;
2064         }
2065
2066         b->tin_ecn_mark += !!flow->cvars.ecn_marked;
2067         qdisc_bstats_update(sch, skb);
2068
2069         /* collect delay stats */
2070         delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
2071         b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
2072         b->peak_delay = cake_ewma(b->peak_delay, delay,
2073                                   delay > b->peak_delay ? 2 : 8);
2074         b->base_delay = cake_ewma(b->base_delay, delay,
2075                                   delay < b->base_delay ? 2 : 8);
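        /* peak_delay uses a fast-attack/slow-decay EWMA (shift 2 when the
         * sample exceeds the running value, shift 8 otherwise) and base_delay
         * the mirror image, so the PEAK/AVG/BASE_DELAY_US figures reported by
         * cake_dump_stats() track, respectively, the recent worst-case
         * sojourn time, the plain average, and the standing floor of the
         * queue delay.
         */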
2076
2077         len = cake_advance_shaper(q, b, skb, now, false);
2078         flow->deficit -= len;
2079         b->tin_deficit -= len;
2080
2081         if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
2082                 u64 next = min(ktime_to_ns(q->time_next_packet),
2083                                ktime_to_ns(q->failsafe_next_packet));
2084
2085                 qdisc_watchdog_schedule_ns(&q->watchdog, next);
2086         } else if (!sch->q.qlen) {
2087                 int i;
2088
2089                 for (i = 0; i < q->tin_cnt; i++) {
2090                         if (q->tins[i].decaying_flow_count) {
2091                                 ktime_t next = \
2092                                         ktime_add_ns(now,
2093                                                      q->tins[i].cparams.target);
2094
2095                                 qdisc_watchdog_schedule_ns(&q->watchdog,
2096                                                            ktime_to_ns(next));
2097                                 break;
2098                         }
2099                 }
2100         }
2101
2102         if (q->overflow_timeout)
2103                 q->overflow_timeout--;
2104
2105         return skb;
2106 }
2107
2108 static void cake_reset(struct Qdisc *sch)
2109 {
2110         u32 c;
2111
2112         for (c = 0; c < CAKE_MAX_TINS; c++)
2113                 cake_clear_tin(sch, c);
2114 }
2115
2116 static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
2117         [TCA_CAKE_BASE_RATE64]   = { .type = NLA_U64 },
2118         [TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 },
2119         [TCA_CAKE_ATM]           = { .type = NLA_U32 },
2120         [TCA_CAKE_FLOW_MODE]     = { .type = NLA_U32 },
2121         [TCA_CAKE_OVERHEAD]      = { .type = NLA_S32 },
2122         [TCA_CAKE_RTT]           = { .type = NLA_U32 },
2123         [TCA_CAKE_TARGET]        = { .type = NLA_U32 },
2124         [TCA_CAKE_AUTORATE]      = { .type = NLA_U32 },
2125         [TCA_CAKE_MEMORY]        = { .type = NLA_U32 },
2126         [TCA_CAKE_NAT]           = { .type = NLA_U32 },
2127         [TCA_CAKE_RAW]           = { .type = NLA_U32 },
2128         [TCA_CAKE_WASH]          = { .type = NLA_U32 },
2129         [TCA_CAKE_MPU]           = { .type = NLA_U32 },
2130         [TCA_CAKE_INGRESS]       = { .type = NLA_U32 },
2131         [TCA_CAKE_ACK_FILTER]    = { .type = NLA_U32 },
             [TCA_CAKE_SPLIT_GSO]     = { .type = NLA_U32 },
2132 };
2133
2134 static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
2135                           u64 target_ns, u64 rtt_est_ns)
2136 {
2137         /* convert byte-rate into time-per-byte
2138          * so it will always unwedge in reasonable time.
2139          */
2140         static const u64 MIN_RATE = 64;
2141         u32 byte_target = mtu;
2142         u64 byte_target_ns;
2143         u8  rate_shft = 0;
2144         u64 rate_ns = 0;
2145
2146         b->flow_quantum = 1514;
2147         if (rate) {
2148                 b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL);
2149                 rate_shft = 34;
2150                 rate_ns = ((u64)NSEC_PER_SEC) << rate_shft;
2151                 rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate));
2152                 while (!!(rate_ns >> 34)) {
2153                         rate_ns >>= 1;
2154                         rate_shft--;
2155                 }
2156         } /* else unlimited, ie. zero delay */
2157
2158         b->tin_rate_bps  = rate;
2159         b->tin_rate_ns   = rate_ns;
2160         b->tin_rate_shft = rate_shft;
2161
2162         byte_target_ns = (byte_target * rate_ns) >> rate_shft;
2163
2164         b->cparams.target = max((byte_target_ns * 3) / 2, target_ns);
2165         b->cparams.interval = max(rtt_est_ns +
2166                                      b->cparams.target - target_ns,
2167                                      b->cparams.target * 2);
2168         b->cparams.mtu_time = byte_target_ns;
2169         b->cparams.p_inc = 1 << 24; /* 1/256 */
2170         b->cparams.p_dec = 1 << 20; /* 1/4096 */
2171 }
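/* Worked example for cake_set_rate() (illustrative numbers): with
 * rate = 1,250,000 bytes/s (10 Mbit/s), rate_ns starts as
 * (NSEC_PER_SEC << 34) / rate = 800 << 34 and is renormalised until it fits
 * in 34 bits, ending at 800 << 24 with rate_shft = 24 - i.e. 800 ns per
 * byte, so a 1514-byte MTU costs ~1.2 ms (mtu_time).  The Codel target then
 * becomes max(1.5 * mtu_time, target_ns) and the interval at least twice
 * that, which is how the AQM auto-loosens itself at low bandwidths.
 * flow_quantum also scales with the rate (rate >> 12, clamped to 300..1514
 * bytes), here 305 bytes.
 */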
2172
2173 static int cake_config_besteffort(struct Qdisc *sch)
2174 {
2175         struct cake_sched_data *q = qdisc_priv(sch);
2176         struct cake_tin_data *b = &q->tins[0];
2177         u32 mtu = psched_mtu(qdisc_dev(sch));
2178         u64 rate = q->rate_bps;
2179
2180         q->tin_cnt = 1;
2181
2182         q->tin_index = besteffort;
2183         q->tin_order = normal_order;
2184
2185         cake_set_rate(b, rate, mtu,
2186                       us_to_ns(q->target), us_to_ns(q->interval));
2187         b->tin_quantum_band = 65535;
2188         b->tin_quantum_prio = 65535;
2189
2190         return 0;
2191 }
2192
2193 static int cake_config_precedence(struct Qdisc *sch)
2194 {
2195         /* convert high-level (user visible) parameters into internal format */
2196         struct cake_sched_data *q = qdisc_priv(sch);
2197         u32 mtu = psched_mtu(qdisc_dev(sch));
2198         u64 rate = q->rate_bps;
2199         u32 quantum1 = 256;
2200         u32 quantum2 = 256;
2201         u32 i;
2202
2203         q->tin_cnt = 8;
2204         q->tin_index = precedence;
2205         q->tin_order = normal_order;
2206
2207         for (i = 0; i < q->tin_cnt; i++) {
2208                 struct cake_tin_data *b = &q->tins[i];
2209
2210                 cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2211                               us_to_ns(q->interval));
2212
2213                 b->tin_quantum_prio = max_t(u16, 1U, quantum1);
2214                 b->tin_quantum_band = max_t(u16, 1U, quantum2);
2215
2216                 /* calculate next class's parameters */
2217                 rate  *= 7;
2218                 rate >>= 3;
2219
2220                 quantum1  *= 3;
2221                 quantum1 >>= 1;
2222
2223                 quantum2  *= 7;
2224                 quantum2 >>= 3;
2225         }
2226
2227         return 0;
2228 }
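/* Tin progression used above (and reused by cake_config_diffserv8() below):
 * each successive class keeps 7/8 of the previous bandwidth threshold
 * (rate *= 7; rate >>= 3), its priority weight quantum1 grows by 3/2 and its
 * bandwidth-sharing weight quantum2 shrinks by 7/8.  For illustration, the
 * eighth tin therefore ends up with a threshold of (7/8)^7, roughly 39%, of
 * the base rate, while being strongly preferred as long as it stays under
 * that threshold.
 */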
2229
2230 /*      List of known Diffserv codepoints:
2231  *
2232  *      Least Effort (CS1)
2233  *      Best Effort (CS0)
2234  *      Max Reliability & LLT "Lo" (TOS1)
2235  *      Max Throughput (TOS2)
2236  *      Min Delay (TOS4)
2237  *      LLT "La" (TOS5)
2238  *      Assured Forwarding 1 (AF1x) - x3
2239  *      Assured Forwarding 2 (AF2x) - x3
2240  *      Assured Forwarding 3 (AF3x) - x3
2241  *      Assured Forwarding 4 (AF4x) - x3
2242  *      Precedence Class 2 (CS2)
2243  *      Precedence Class 3 (CS3)
2244  *      Precedence Class 4 (CS4)
2245  *      Precedence Class 5 (CS5)
2246  *      Precedence Class 6 (CS6)
2247  *      Precedence Class 7 (CS7)
2248  *      Voice Admit (VA)
2249  *      Expedited Forwarding (EF)
2250  *
2251  *      Total 25 codepoints.
2252  */
2253
2254 /*      List of traffic classes in RFC 4594:
2255  *              (roughly descending order of contended priority)
2256  *              (roughly ascending order of uncontended throughput)
2257  *
2258  *      Network Control (CS6,CS7)      - routing traffic
2259  *      Telephony (EF,VA)         - aka. VoIP streams
2260  *      Signalling (CS5)               - VoIP setup
2261  *      Multimedia Conferencing (AF4x) - aka. video calls
2262  *      Realtime Interactive (CS4)     - eg. games
2263  *      Multimedia Streaming (AF3x)    - eg. YouTube, NetFlix, Twitch
2264  *      Broadcast Video (CS3)
2265  *      Low Latency Data (AF2x,TOS4)      - eg. database
2266  *      Ops, Admin, Management (CS2,TOS1) - eg. ssh
2267  *      Standard Service (CS0 & unrecognised codepoints)
2268  *      High Throughput Data (AF1x,TOS2)  - eg. web traffic
2269  *      Low Priority Data (CS1)           - eg. BitTorrent
2270  *
2271  *      Total 12 traffic classes.
2272  */
2273
2274 static int cake_config_diffserv8(struct Qdisc *sch)
2275 {
2276 /*      Pruned list of traffic classes for typical applications:
2277  *
2278  *              Network Control          (CS6, CS7)
2279  *              Minimum Latency          (EF, VA, CS5, CS4)
2280  *              Interactive Shell        (CS2, TOS1)
2281  *              Low Latency Transactions (AF2x, TOS4)
2282  *              Video Streaming          (AF4x, AF3x, CS3)
2283  *              Bog Standard             (CS0 etc.)
2284  *              High Throughput          (AF1x, TOS2)
2285  *              Background Traffic       (CS1)
2286  *
2287  *              Total 8 traffic classes.
2288  */
2289
2290         struct cake_sched_data *q = qdisc_priv(sch);
2291         u32 mtu = psched_mtu(qdisc_dev(sch));
2292         u64 rate = q->rate_bps;
2293         u32 quantum1 = 256;
2294         u32 quantum2 = 256;
2295         u32 i;
2296
2297         q->tin_cnt = 8;
2298
2299         /* codepoint to class mapping */
2300         q->tin_index = diffserv8;
2301         q->tin_order = normal_order;
2302
2303         /* class characteristics */
2304         for (i = 0; i < q->tin_cnt; i++) {
2305                 struct cake_tin_data *b = &q->tins[i];
2306
2307                 cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2308                               us_to_ns(q->interval));
2309
2310                 b->tin_quantum_prio = max_t(u16, 1U, quantum1);
2311                 b->tin_quantum_band = max_t(u16, 1U, quantum2);
2312
2313                 /* calculate next class's parameters */
2314                 rate  *= 7;
2315                 rate >>= 3;
2316
2317                 quantum1  *= 3;
2318                 quantum1 >>= 1;
2319
2320                 quantum2  *= 7;
2321                 quantum2 >>= 3;
2322         }
2323
2324         return 0;
2325 }
2326
2327 static int cake_config_diffserv4(struct Qdisc *sch)
2328 {
2329 /*  Further pruned list of traffic classes for four-class system:
2330  *
2331  *          Latency Sensitive  (CS7, CS6, EF, VA, CS5, CS4)
2332  *          Streaming Media    (AF4x, AF3x, CS3, AF2x, TOS4, CS2, TOS1)
2333  *          Best Effort        (CS0, AF1x, TOS2, and those not specified)
2334  *          Background Traffic (CS1)
2335  *
2336  *              Total 4 traffic classes.
2337  */
2338
2339         struct cake_sched_data *q = qdisc_priv(sch);
2340         u32 mtu = psched_mtu(qdisc_dev(sch));
2341         u64 rate = q->rate_bps;
2342         u32 quantum = 1024;
2343
2344         q->tin_cnt = 4;
2345
2346         /* codepoint to class mapping */
2347         q->tin_index = diffserv4;
2348         q->tin_order = bulk_order;
2349
2350         /* class characteristics */
2351         cake_set_rate(&q->tins[0], rate, mtu,
2352                       us_to_ns(q->target), us_to_ns(q->interval));
2353         cake_set_rate(&q->tins[1], rate >> 4, mtu,
2354                       us_to_ns(q->target), us_to_ns(q->interval));
2355         cake_set_rate(&q->tins[2], rate >> 1, mtu,
2356                       us_to_ns(q->target), us_to_ns(q->interval));
2357         cake_set_rate(&q->tins[3], rate >> 2, mtu,
2358                       us_to_ns(q->target), us_to_ns(q->interval));
2359
2360         /* priority weights */
2361         q->tins[0].tin_quantum_prio = quantum;
2362         q->tins[1].tin_quantum_prio = quantum >> 4;
2363         q->tins[2].tin_quantum_prio = quantum << 2;
2364         q->tins[3].tin_quantum_prio = quantum << 4;
2365
2366         /* bandwidth-sharing weights */
2367         q->tins[0].tin_quantum_band = quantum;
2368         q->tins[1].tin_quantum_band = quantum >> 4;
2369         q->tins[2].tin_quantum_band = quantum >> 1;
2370         q->tins[3].tin_quantum_band = quantum >> 2;
2371
2372         return 0;
2373 }
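/* Resulting diffserv4 shape, for illustration at a 20 Mbit/s base rate and
 * per the tin mapping tables defined earlier in this file (tin 0 best
 * effort, 1 bulk, 2 video, 3 voice):
 *
 *      tin  threshold            prio weight  band weight
 *      0    rate      (20 Mbit)     1024         1024
 *      1    rate/16 (1.25 Mbit)       64           64
 *      2    rate/2    (10 Mbit)     4096          512
 *      3    rate/4     (5 Mbit)    16384          256
 *
 * i.e. voice and video are served first while below their thresholds, and
 * fall back to modest bandwidth-based weights once they exceed them.
 */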
2374
2375 static int cake_config_diffserv3(struct Qdisc *sch)
2376 {
2377 /*  Simplified Diffserv structure with 3 tins.
2378  *              Low Priority            (CS1)
2379  *              Best Effort
2380  *              Latency Sensitive       (TOS4, VA, EF, CS6, CS7)
2381  */
2382         struct cake_sched_data *q = qdisc_priv(sch);
2383         u32 mtu = psched_mtu(qdisc_dev(sch));
2384         u64 rate = q->rate_bps;
2385         u32 quantum = 1024;
2386
2387         q->tin_cnt = 3;
2388
2389         /* codepoint to class mapping */
2390         q->tin_index = diffserv3;
2391         q->tin_order = bulk_order;
2392
2393         /* class characteristics */
2394         cake_set_rate(&q->tins[0], rate, mtu,
2395                       us_to_ns(q->target), us_to_ns(q->interval));
2396         cake_set_rate(&q->tins[1], rate >> 4, mtu,
2397                       us_to_ns(q->target), us_to_ns(q->interval));
2398         cake_set_rate(&q->tins[2], rate >> 2, mtu,
2399                       us_to_ns(q->target), us_to_ns(q->interval));
2400
2401         /* priority weights */
2402         q->tins[0].tin_quantum_prio = quantum;
2403         q->tins[1].tin_quantum_prio = quantum >> 4;
2404         q->tins[2].tin_quantum_prio = quantum << 4;
2405
2406         /* bandwidth-sharing weights */
2407         q->tins[0].tin_quantum_band = quantum;
2408         q->tins[1].tin_quantum_band = quantum >> 4;
2409         q->tins[2].tin_quantum_band = quantum >> 2;
2410
2411         return 0;
2412 }
2413
2414 static void cake_reconfigure(struct Qdisc *sch)
2415 {
2416         struct cake_sched_data *q = qdisc_priv(sch);
2417         int c, ft;
2418
2419         switch (q->tin_mode) {
2420         case CAKE_DIFFSERV_BESTEFFORT:
2421                 ft = cake_config_besteffort(sch);
2422                 break;
2423
2424         case CAKE_DIFFSERV_PRECEDENCE:
2425                 ft = cake_config_precedence(sch);
2426                 break;
2427
2428         case CAKE_DIFFSERV_DIFFSERV8:
2429                 ft = cake_config_diffserv8(sch);
2430                 break;
2431
2432         case CAKE_DIFFSERV_DIFFSERV4:
2433                 ft = cake_config_diffserv4(sch);
2434                 break;
2435
2436         case CAKE_DIFFSERV_DIFFSERV3:
2437         default:
2438                 ft = cake_config_diffserv3(sch);
2439                 break;
2440         }
2441
2442         for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) {
2443                 cake_clear_tin(sch, c);
2444                 q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time;
2445         }
2446
2447         q->rate_ns   = q->tins[ft].tin_rate_ns;
2448         q->rate_shft = q->tins[ft].tin_rate_shft;
2449
2450         if (q->buffer_config_limit) {
2451                 q->buffer_limit = q->buffer_config_limit;
2452         } else if (q->rate_bps) {
2453                 u64 t = q->rate_bps * q->interval;
2454
2455                 do_div(t, USEC_PER_SEC / 4);
2456                 q->buffer_limit = max_t(u32, t, 4U << 20);
2457         } else {
2458                 q->buffer_limit = ~0;
2459         }
2460
2461         sch->flags &= ~TCQ_F_CAN_BYPASS;
2462
2463         q->buffer_limit = min(q->buffer_limit,
2464                               max(sch->limit * psched_mtu(qdisc_dev(sch)),
2465                                   q->buffer_config_limit));
2466 }
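/* Buffer sizing example (illustrative numbers): with no explicit memlimit,
 * the limit is four "interval"s worth of bytes at the configured rate
 * (rate_bps * interval / (USEC_PER_SEC / 4)), floored at 4 MiB.  At
 * 1,250,000 bytes/s (10 Mbit/s) and the default 100 ms interval that is
 * ~500 KB, so the 4 MiB floor applies; it is then clamped to at most
 * sch->limit * psched_mtu() (about 15 MB with the 10240-packet default),
 * leaving an effective limit of 4 MiB.
 */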
2467
2468 static int cake_change(struct Qdisc *sch, struct nlattr *opt,
2469                        struct netlink_ext_ack *extack)
2470 {
2471         struct cake_sched_data *q = qdisc_priv(sch);
2472         struct nlattr *tb[TCA_CAKE_MAX + 1];
2473         int err;
2474
2475         if (!opt)
2476                 return -EINVAL;
2477
2478         err = nla_parse_nested(tb, TCA_CAKE_MAX, opt, cake_policy, extack);
2479         if (err < 0)
2480                 return err;
2481
2482         if (tb[TCA_CAKE_NAT]) {
2483 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
2484                 q->flow_mode &= ~CAKE_FLOW_NAT_FLAG;
2485                 q->flow_mode |= CAKE_FLOW_NAT_FLAG *
2486                         !!nla_get_u32(tb[TCA_CAKE_NAT]);
2487 #else
2488                 NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT],
2489                                     "No conntrack support in kernel");
2490                 return -EOPNOTSUPP;
2491 #endif
2492         }
2493
2494         if (tb[TCA_CAKE_BASE_RATE64])
2495                 q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]);
2496
2497         if (tb[TCA_CAKE_DIFFSERV_MODE])
2498                 q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]);
2499
2500         if (tb[TCA_CAKE_WASH]) {
2501                 if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
2502                         q->rate_flags |= CAKE_FLAG_WASH;
2503                 else
2504                         q->rate_flags &= ~CAKE_FLAG_WASH;
2505         }
2506
2507         if (tb[TCA_CAKE_FLOW_MODE])
2508                 q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) |
2509                                 (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
2510                                         CAKE_FLOW_MASK));
2511
2512         if (tb[TCA_CAKE_ATM])
2513                 q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]);
2514
2515         if (tb[TCA_CAKE_OVERHEAD]) {
2516                 q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]);
2517                 q->rate_flags |= CAKE_FLAG_OVERHEAD;
2518
2519                 q->max_netlen = 0;
2520                 q->max_adjlen = 0;
2521                 q->min_netlen = ~0;
2522                 q->min_adjlen = ~0;
2523         }
2524
2525         if (tb[TCA_CAKE_RAW]) {
2526                 q->rate_flags &= ~CAKE_FLAG_OVERHEAD;
2527
2528                 q->max_netlen = 0;
2529                 q->max_adjlen = 0;
2530                 q->min_netlen = ~0;
2531                 q->min_adjlen = ~0;
2532         }
2533
2534         if (tb[TCA_CAKE_MPU])
2535                 q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]);
2536
2537         if (tb[TCA_CAKE_RTT]) {
2538                 q->interval = nla_get_u32(tb[TCA_CAKE_RTT]);
2539
2540                 if (!q->interval)
2541                         q->interval = 1;
2542         }
2543
2544         if (tb[TCA_CAKE_TARGET]) {
2545                 q->target = nla_get_u32(tb[TCA_CAKE_TARGET]);
2546
2547                 if (!q->target)
2548                         q->target = 1;
2549         }
2550
2551         if (tb[TCA_CAKE_AUTORATE]) {
2552                 if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE]))
2553                         q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
2554                 else
2555                         q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
2556         }
2557
2558         if (tb[TCA_CAKE_INGRESS]) {
2559                 if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
2560                         q->rate_flags |= CAKE_FLAG_INGRESS;
2561                 else
2562                         q->rate_flags &= ~CAKE_FLAG_INGRESS;
2563         }
2564
2565         if (tb[TCA_CAKE_ACK_FILTER])
2566                 q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]);
2567
2568         if (tb[TCA_CAKE_MEMORY])
2569                 q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]);
2570
2571         if (tb[TCA_CAKE_SPLIT_GSO]) {
2572                 if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO]))
2573                         q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2574                 else
2575                         q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
2576         }
2577
2578         if (q->tins) {
2579                 sch_tree_lock(sch);
2580                 cake_reconfigure(sch);
2581                 sch_tree_unlock(sch);
2582         }
2583
2584         return 0;
2585 }
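/* Example configuration via the iproute2 tc frontend (illustrative; exact
 * option spellings depend on the matching tc build):
 *
 *   tc qdisc replace dev eth0 root cake bandwidth 20Mbit diffserv4 nat
 *   tc qdisc change  dev eth0 root cake rtt 50ms memlimit 4Mb ack-filter
 *
 * which arrive here as TCA_CAKE_BASE_RATE64, TCA_CAKE_DIFFSERV_MODE,
 * TCA_CAKE_NAT, TCA_CAKE_RTT, TCA_CAKE_MEMORY and TCA_CAKE_ACK_FILTER
 * respectively.
 */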
2586
2587 static void cake_destroy(struct Qdisc *sch)
2588 {
2589         struct cake_sched_data *q = qdisc_priv(sch);
2590
2591         qdisc_watchdog_cancel(&q->watchdog);
2592         tcf_block_put(q->block);
2593         kvfree(q->tins);
2594 }
2595
2596 static int cake_init(struct Qdisc *sch, struct nlattr *opt,
2597                      struct netlink_ext_ack *extack)
2598 {
2599         struct cake_sched_data *q = qdisc_priv(sch);
2600         int i, j, err;
2601
2602         sch->limit = 10240;
2603         q->tin_mode = CAKE_DIFFSERV_DIFFSERV3;
2604         q->flow_mode  = CAKE_FLOW_TRIPLE;
2605
2606         q->rate_bps = 0; /* unlimited by default */
2607
2608         q->interval = 100000; /* 100ms default */
2609         q->target   =   5000; /* 5ms: codel RFC argues
2610                                * for 5 to 10% of interval
2611                                */
2612         q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2613         q->cur_tin = 0;
2614         q->cur_flow  = 0;
2615
2616         qdisc_watchdog_init(&q->watchdog, sch);
2617
2618         if (opt) {
2619                 int err = cake_change(sch, opt, extack);
2620
2621                 if (err)
2622                         return err;
2623         }
2624
2625         err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
2626         if (err)
2627                 return err;
2628
2629         quantum_div[0] = ~0;
2630         for (i = 1; i <= CAKE_QUEUES; i++)
2631                 quantum_div[i] = 65535 / i;
2632
2633         q->tins = kvzalloc(CAKE_MAX_TINS * sizeof(struct cake_tin_data),
2634                            GFP_KERNEL);
2635         if (!q->tins)
2636                 goto nomem;
2637
2638         for (i = 0; i < CAKE_MAX_TINS; i++) {
2639                 struct cake_tin_data *b = q->tins + i;
2640
2641                 INIT_LIST_HEAD(&b->new_flows);
2642                 INIT_LIST_HEAD(&b->old_flows);
2643                 INIT_LIST_HEAD(&b->decaying_flows);
2644                 b->sparse_flow_count = 0;
2645                 b->bulk_flow_count = 0;
2646                 b->decaying_flow_count = 0;
2647
2648                 for (j = 0; j < CAKE_QUEUES; j++) {
2649                         struct cake_flow *flow = b->flows + j;
2650                         u32 k = j * CAKE_MAX_TINS + i;
2651
2652                         INIT_LIST_HEAD(&flow->flowchain);
2653                         cobalt_vars_init(&flow->cvars);
2654
2655                         q->overflow_heap[k].t = i;
2656                         q->overflow_heap[k].b = j;
2657                         b->overflow_idx[j] = k;
2658                 }
2659         }
2660
2661         cake_reconfigure(sch);
2662         q->avg_peak_bandwidth = q->rate_bps;
2663         q->min_netlen = ~0;
2664         q->min_adjlen = ~0;
2665         return 0;
2666
2667 nomem:
2668         cake_destroy(sch);
2669         return -ENOMEM;
2670 }
2671
2672 static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
2673 {
2674         struct cake_sched_data *q = qdisc_priv(sch);
2675         struct nlattr *opts;
2676
2677         opts = nla_nest_start(skb, TCA_OPTIONS);
2678         if (!opts)
2679                 goto nla_put_failure;
2680
2681         if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps,
2682                               TCA_CAKE_PAD))
2683                 goto nla_put_failure;
2684
2685         if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE,
2686                         q->flow_mode & CAKE_FLOW_MASK))
2687                 goto nla_put_failure;
2688
2689         if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval))
2690                 goto nla_put_failure;
2691
2692         if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target))
2693                 goto nla_put_failure;
2694
2695         if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit))
2696                 goto nla_put_failure;
2697
2698         if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
2699                         !!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
2700                 goto nla_put_failure;
2701
2702         if (nla_put_u32(skb, TCA_CAKE_INGRESS,
2703                         !!(q->rate_flags & CAKE_FLAG_INGRESS)))
2704                 goto nla_put_failure;
2705
2706         if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter))
2707                 goto nla_put_failure;
2708
2709         if (nla_put_u32(skb, TCA_CAKE_NAT,
2710                         !!(q->flow_mode & CAKE_FLOW_NAT_FLAG)))
2711                 goto nla_put_failure;
2712
2713         if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode))
2714                 goto nla_put_failure;
2715
2716         if (nla_put_u32(skb, TCA_CAKE_WASH,
2717                         !!(q->rate_flags & CAKE_FLAG_WASH)))
2718                 goto nla_put_failure;
2719
2720         if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead))
2721                 goto nla_put_failure;
2722
2723         if (!(q->rate_flags & CAKE_FLAG_OVERHEAD))
2724                 if (nla_put_u32(skb, TCA_CAKE_RAW, 0))
2725                         goto nla_put_failure;
2726
2727         if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode))
2728                 goto nla_put_failure;
2729
2730         if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu))
2731                 goto nla_put_failure;
2732
2733         if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO,
2734                         !!(q->rate_flags & CAKE_FLAG_SPLIT_GSO)))
2735                 goto nla_put_failure;
2736
2737         return nla_nest_end(skb, opts);
2738
2739 nla_put_failure:
2740         return -1;
2741 }
2742
2743 static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
2744 {
2745         struct nlattr *stats = nla_nest_start(d->skb, TCA_STATS_APP);
2746         struct cake_sched_data *q = qdisc_priv(sch);
2747         struct nlattr *tstats, *ts;
2748         int i;
2749
2750         if (!stats)
2751                 return -1;
2752
2753 #define PUT_STAT_U32(attr, data) do {                                  \
2754                 if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2755                         goto nla_put_failure;                          \
2756         } while (0)
2757 #define PUT_STAT_U64(attr, data) do {                                  \
2758                 if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \
2759                                         data, TCA_CAKE_STATS_PAD)) \
2760                         goto nla_put_failure;                          \
2761         } while (0)
2762
2763         PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth);
2764         PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit);
2765         PUT_STAT_U32(MEMORY_USED, q->buffer_max_used);
2766         PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16));
2767         PUT_STAT_U32(MAX_NETLEN, q->max_netlen);
2768         PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen);
2769         PUT_STAT_U32(MIN_NETLEN, q->min_netlen);
2770         PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen);
2771
2772 #undef PUT_STAT_U32
2773 #undef PUT_STAT_U64
2774
2775         tstats = nla_nest_start(d->skb, TCA_CAKE_STATS_TIN_STATS);
2776         if (!tstats)
2777                 goto nla_put_failure;
2778
2779 #define PUT_TSTAT_U32(attr, data) do {                                  \
2780                 if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \
2781                         goto nla_put_failure;                           \
2782         } while (0)
2783 #define PUT_TSTAT_U64(attr, data) do {                                  \
2784                 if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \
2785                                         data, TCA_CAKE_TIN_STATS_PAD))  \
2786                         goto nla_put_failure;                           \
2787         } while (0)
2788
2789         for (i = 0; i < q->tin_cnt; i++) {
2790                 struct cake_tin_data *b = &q->tins[q->tin_order[i]];
2791
2792                 ts = nla_nest_start(d->skb, i + 1);
2793                 if (!ts)
2794                         goto nla_put_failure;
2795
2796                 PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps);
2797                 PUT_TSTAT_U64(SENT_BYTES64, b->bytes);
2798                 PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog);
2799
2800                 PUT_TSTAT_U32(TARGET_US,
2801                               ktime_to_us(ns_to_ktime(b->cparams.target)));
2802                 PUT_TSTAT_U32(INTERVAL_US,
2803                               ktime_to_us(ns_to_ktime(b->cparams.interval)));
2804
2805                 PUT_TSTAT_U32(SENT_PACKETS, b->packets);
2806                 PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped);
2807                 PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark);
2808                 PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops);
2809
2810                 PUT_TSTAT_U32(PEAK_DELAY_US,
2811                               ktime_to_us(ns_to_ktime(b->peak_delay)));
2812                 PUT_TSTAT_U32(AVG_DELAY_US,
2813                               ktime_to_us(ns_to_ktime(b->avge_delay)));
2814                 PUT_TSTAT_U32(BASE_DELAY_US,
2815                               ktime_to_us(ns_to_ktime(b->base_delay)));
2816
2817                 PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits);
2818                 PUT_TSTAT_U32(WAY_MISSES, b->way_misses);
2819                 PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions);
2820
2821                 PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count +
2822                                             b->decaying_flow_count);
2823                 PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count);
2824                 PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count);
2825                 PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen);
2826
2827                 PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum);
2828                 nla_nest_end(d->skb, ts);
2829         }
2830
2831 #undef PUT_TSTAT_U32
2832 #undef PUT_TSTAT_U64
2833
2834         nla_nest_end(d->skb, tstats);
2835         return nla_nest_end(d->skb, stats);
2836
2837 nla_put_failure:
2838         nla_nest_cancel(d->skb, stats);
2839         return -1;
2840 }
2841
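     /* CAKE has no configurable classes of its own; each flow queue is only
      * exposed as a read-only class for statistics and filter binding, so
      * the class manipulation hooks below are no-ops.
      */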
2842 static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg)
2843 {
2844         return NULL;
2845 }
2846
2847 static unsigned long cake_find(struct Qdisc *sch, u32 classid)
2848 {
2849         return 0;
2850 }
2851
2852 static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent,
2853                                u32 classid)
2854 {
2855         return 0;
2856 }
2857
2858 static void cake_unbind(struct Qdisc *q, unsigned long cl)
2859 {
2860 }
2861
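     /* Filters attach to the qdisc as a whole (cl == 0); there are no
      * per-class filter blocks.
      */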
2862 static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl,
2863                                         struct netlink_ext_ack *extack)
2864 {
2865         struct cake_sched_data *q = qdisc_priv(sch);
2866
2867         if (cl)
2868                 return NULL;
2869         return q->block;
2870 }
2871
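     /* Report a class by placing its id in the minor part of the handle;
      * ids are those enumerated by cake_walk() below.
      */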
2872 static int cake_dump_class(struct Qdisc *sch, unsigned long cl,
2873                            struct sk_buff *skb, struct tcmsg *tcm)
2874 {
2875         tcm->tcm_handle |= TC_H_MIN(cl);
2876         return 0;
2877 }
2878
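     /* Decode the class id back into a (tin, flow) pair: idx / CAKE_QUEUES
      * selects the tin via q->tin_order[], idx % CAKE_QUEUES the flow.  The
      * queue length is counted by walking the flow's skb list under the
      * qdisc tree lock.
      */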
2879 static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
2880                                  struct gnet_dump *d)
2881 {
2882         struct cake_sched_data *q = qdisc_priv(sch);
2883         const struct cake_flow *flow = NULL;
2884         struct gnet_stats_queue qs = { 0 };
2885         struct nlattr *stats;
2886         u32 idx = cl - 1;
2887
2888         if (idx < CAKE_QUEUES * q->tin_cnt) {
2889                 const struct cake_tin_data *b =
2890                         &q->tins[q->tin_order[idx / CAKE_QUEUES]];
2891                 const struct sk_buff *skb;
2892
2893                 flow = &b->flows[idx % CAKE_QUEUES];
2894
2895                 if (flow->head) {
2896                         sch_tree_lock(sch);
2897                         skb = flow->head;
2898                         while (skb) {
2899                                 qs.qlen++;
2900                                 skb = skb->next;
2901                         }
2902                         sch_tree_unlock(sch);
2903                 }
2904                 qs.backlog = b->backlogs[idx % CAKE_QUEUES];
2905                 qs.drops = flow->dropped;
2906         }
2907         if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
2908                 return -1;
2909         if (flow) {
2910                 ktime_t now = ktime_get();
2911
2912                 stats = nla_nest_start(d->skb, TCA_STATS_APP);
2913                 if (!stats)
2914                         return -1;
2915
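     /* Per-flow COBALT state, reported as TCA_CAKE_STATS_* attributes nested
      * under TCA_STATS_APP.
      */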
2916 #define PUT_STAT_U32(attr, data) do {                                  \
2917                 if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2918                         goto nla_put_failure;                          \
2919         } while (0)
2920 #define PUT_STAT_S32(attr, data) do {                                  \
2921                 if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2922                         goto nla_put_failure;                          \
2923         } while (0)
2924
2925                 PUT_STAT_S32(DEFICIT, flow->deficit);
2926                 PUT_STAT_U32(DROPPING, flow->cvars.dropping);
2927                 PUT_STAT_U32(COBALT_COUNT, flow->cvars.count);
2928                 PUT_STAT_U32(P_DROP, flow->cvars.p_drop);
2929                 if (flow->cvars.p_drop) {
2930                         PUT_STAT_S32(BLUE_TIMER_US,
2931                                      ktime_to_us(
2932                                              ktime_sub(now,
2933                                                      flow->cvars.blue_timer)));
2934                 }
2935                 if (flow->cvars.dropping) {
2936                         PUT_STAT_S32(DROP_NEXT_US,
2937                                      ktime_to_us(
2938                                              ktime_sub(now,
2939                                                        flow->cvars.drop_next)));
2940                 }
2941
2942                 if (nla_nest_end(d->skb, stats) < 0)
2943                         return -1;
2944         }
2945
2946         return 0;
2947
2948 nla_put_failure:
2949         nla_nest_cancel(d->skb, stats);
2950         return -1;
2951 }
2952
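     /* Enumerate the flows of every tin as classes, numbering them
      * (tin index * CAKE_QUEUES) + flow index + 1.  Flows whose flowchain is
      * currently unlinked (idle queues) are skipped but still counted for the
      * walker's skip/count bookkeeping.
      */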
2953 static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2954 {
2955         struct cake_sched_data *q = qdisc_priv(sch);
2956         unsigned int i, j;
2957
2958         if (arg->stop)
2959                 return;
2960
2961         for (i = 0; i < q->tin_cnt; i++) {
2962                 struct cake_tin_data *b = &q->tins[q->tin_order[i]];
2963
2964                 for (j = 0; j < CAKE_QUEUES; j++) {
2965                         if (list_empty(&b->flows[j].flowchain) ||
2966                             arg->count < arg->skip) {
2967                                 arg->count++;
2968                                 continue;
2969                         }
2970                         if (arg->fn(sch, i * CAKE_QUEUES + j + 1, arg) < 0) {
2971                                 arg->stop = 1;
2972                                 break;
2973                         }
2974                         arg->count++;
2975                 }
2976         }
2977 }
2978
2979 static const struct Qdisc_class_ops cake_class_ops = {
2980         .leaf           =       cake_leaf,
2981         .find           =       cake_find,
2982         .tcf_block      =       cake_tcf_block,
2983         .bind_tcf       =       cake_bind,
2984         .unbind_tcf     =       cake_unbind,
2985         .dump           =       cake_dump_class,
2986         .dump_stats     =       cake_dump_class_stats,
2987         .walk           =       cake_walk,
2988 };
2989
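     /* Glue binding the CAKE implementation into the qdisc framework. */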
2990 static struct Qdisc_ops cake_qdisc_ops __read_mostly = {
2991         .cl_ops         =       &cake_class_ops,
2992         .id             =       "cake",
2993         .priv_size      =       sizeof(struct cake_sched_data),
2994         .enqueue        =       cake_enqueue,
2995         .dequeue        =       cake_dequeue,
2996         .peek           =       qdisc_peek_dequeued,
2997         .init           =       cake_init,
2998         .reset          =       cake_reset,
2999         .destroy        =       cake_destroy,
3000         .change         =       cake_change,
3001         .dump           =       cake_dump,
3002         .dump_stats     =       cake_dump_stats,
3003         .owner          =       THIS_MODULE,
3004 };
3005
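     /* Register the "cake" qdisc with the traffic-control core on module
      * load and remove it again on unload.  From userspace it can then be
      * attached with iproute2's tc, e.g. (assuming a tc build with CAKE
      * support):
      *
      *   tc qdisc replace dev eth0 root cake bandwidth 20Mbit
      */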
3006 static int __init cake_module_init(void)
3007 {
3008         return register_qdisc(&cake_qdisc_ops);
3009 }
3010
3011 static void __exit cake_module_exit(void)
3012 {
3013         unregister_qdisc(&cake_qdisc_ops);
3014 }
3015
3016 module_init(cake_module_init)
3017 module_exit(cake_module_exit)
3018 MODULE_AUTHOR("Jonathan Morton");
3019 MODULE_LICENSE("Dual BSD/GPL");
3020 MODULE_DESCRIPTION("The CAKE shaper.");