block/kyber-iosched.c

   1 /*
   2  * The Kyber I/O scheduler. Controls latency by throttling queue depths using
   3  * scalable techniques.
   4  *
   5  * Copyright (C) 2017 Facebook
   6  *
   7  * This program is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public
   9  * License v2 as published by the Free Software Foundation.
  10  *
  11  * This program is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * along with this program.  If not, see <https://www.gnu.org/licenses/>.
  18  */
  19
  20 #include <linux/kernel.h>
  21 #include <linux/blkdev.h>
  22 #include <linux/blk-mq.h>
  23 #include <linux/elevator.h>
  24 #include <linux/module.h>
  25 #include <linux/sbitmap.h>
  26
  27 #include "blk.h"
  28 #include "blk-mq.h"
  29 #include "blk-mq-debugfs.h"
  30 #include "blk-mq-sched.h"
  31 #include "blk-mq-tag.h"
  32 #include "blk-stat.h"
  33
  34 /* Scheduling domains. */
  35 enum {
  36         KYBER_READ,
  37         KYBER_SYNC_WRITE,
  38         KYBER_OTHER, /* Async writes, discard, etc. */
  39         KYBER_NUM_DOMAINS,
  40 };
  41
  42 enum {
  43         KYBER_MIN_DEPTH = 256,
  44
  45         /*
  46          * In order to prevent starvation of synchronous requests by a flood of
  47          * asynchronous requests, we reserve 25% of requests for synchronous
  48          * operations.
  49          */
  50         KYBER_ASYNC_PERCENT = 75,
  51 };
  52
  53 /*
  54  * Initial device-wide depths for each scheduling domain.
  55  *
  56  * Even for fast devices with lots of tags like NVMe, you can saturate
  57  * the device with only a fraction of the maximum possible queue depth.
  58  * So, we cap these to a reasonable value.
  59  */
  60 static const unsigned int kyber_depth[] = {
  61         [KYBER_READ] = 256,
  62         [KYBER_SYNC_WRITE] = 128,
  63         [KYBER_OTHER] = 64,
  64 };
  65
  66 /*
  67  * Scheduling domain batch sizes. We favor reads.
  68  */
  69 static const unsigned int kyber_batch_size[] = {
  70         [KYBER_READ] = 16,
  71         [KYBER_SYNC_WRITE] = 8,
  72         [KYBER_OTHER] = 8,
  73 };
  74
  75 /*
  76  * There is a same mapping between ctx & hctx and kcq & khd,
  77  * we use request->mq_ctx->index_hw to index the kcq in khd.
  78  */
  79 struct kyber_ctx_queue {
  80         /*
  81          * Used to ensure operations on rq_list and kcq_map to be an atmoic one.
  82          * Also protect the rqs on rq_list when merge.
  83          */
  84         spinlock_t lock;
  85         struct list_head rq_list[KYBER_NUM_DOMAINS];
  86 } ____cacheline_aligned_in_smp;
  87
  88 struct kyber_queue_data {
  89         struct request_queue *q;
  90
  91         struct blk_stat_callback *cb;
  92
  93         /*
  94          * The device is divided into multiple scheduling domains based on the
  95          * request type. Each domain has a fixed number of in-flight requests of
  96          * that type device-wide, limited by these tokens.
  97          */
  98         struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];
  99
 100         /*
 101          * Async request percentage, converted to per-word depth for
 102          * sbitmap_get_shallow().
 103          */
 104         unsigned int async_depth;
 105
 106         /* Target latencies in nanoseconds. */
 107         u64 read_lat_nsec, write_lat_nsec;
 108 };
 109
 110 struct kyber_hctx_data {
 111         spinlock_t lock;
 112         struct list_head rqs[KYBER_NUM_DOMAINS];
 113         unsigned int cur_domain;
 114         unsigned int batching;
 115         struct kyber_ctx_queue *kcqs;
 116         struct sbitmap kcq_map[KYBER_NUM_DOMAINS];
 117         wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS];
 118         struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS];
 119         atomic_t wait_index[KYBER_NUM_DOMAINS];
 120 };
 121
 122 static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
 123                              void *key);
 124
 125 static unsigned int kyber_sched_domain(unsigned int op)
 126 {
 127         if ((op & REQ_OP_MASK) == REQ_OP_READ)
 128                 return KYBER_READ;
 129         else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op))
 130                 return KYBER_SYNC_WRITE;
 131         else
 132                 return KYBER_OTHER;
 133 }
 134
 135 enum {
 136         NONE = 0,
 137         GOOD = 1,
 138         GREAT = 2,
 139         BAD = -1,
 140         AWFUL = -2,
 141 };
 142
 143 #define IS_GOOD(status) ((status) > 0)
 144 #define IS_BAD(status) ((status) < 0)
 145
 146 static int kyber_lat_status(struct blk_stat_callback *cb,
 147                             unsigned int sched_domain, u64 target)
 148 {
 149         u64 latency;
 150
 151         if (!cb->stat[sched_domain].nr_samples)
 152                 return NONE;
 153
 154         latency = cb->stat[sched_domain].mean;
 155         if (latency >= 2 * target)
 156                 return AWFUL;
 157         else if (latency > target)
 158                 return BAD;
 159         else if (latency <= target / 2)
 160                 return GREAT;
 161         else /* (latency <= target) */
 162                 return GOOD;
 163 }
 164
 165 /*
 166  * Adjust the read or synchronous write depth given the status of reads and
 167  * writes. The goal is that the latencies of the two domains are fair (i.e., if
 168  * one is good, then the other is good).
 169  */
 170 static void kyber_adjust_rw_depth(struct kyber_queue_data *kqd,
 171                                   unsigned int sched_domain, int this_status,
 172                                   int other_status)
 173 {
 174         unsigned int orig_depth, depth;
 175
 176         /*
 177          * If this domain had no samples, or reads and writes are both good or
 178          * both bad, don't adjust the depth.
 179          */
 180         if (this_status == NONE ||
 181             (IS_GOOD(this_status) && IS_GOOD(other_status)) ||
 182             (IS_BAD(this_status) && IS_BAD(other_status)))
 183                 return;
 184
 185         orig_depth = depth = kqd->domain_tokens[sched_domain].sb.depth;
 186
 187         if (other_status == NONE) {
 188                 depth++;
 189         } else {
 190                 switch (this_status) {
 191                 case GOOD:
 192                         if (other_status == AWFUL)
 193                                 depth -= max(depth / 4, 1U);
 194                         else
 195                                 depth -= max(depth / 8, 1U);
 196                         break;
 197                 case GREAT:
 198                         if (other_status == AWFUL)
 199                                 depth /= 2;
 200                         else
 201                                 depth -= max(depth / 4, 1U);
 202                         break;
 203                 case BAD:
 204                         depth++;
 205                         break;
 206                 case AWFUL:
 207                         if (other_status == GREAT)
 208                                 depth += 2;
 209                         else
 210                                 depth++;
 211                         break;
 212                 }
 213         }
 214
 215         depth = clamp(depth, 1U, kyber_depth[sched_domain]);
 216         if (depth != orig_depth)
 217                 sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth);
 218 }
 219
 220 /*
 221  * Adjust the depth of other requests given the status of reads and synchronous
 222  * writes. As long as either domain is doing fine, we don't throttle, but if
 223  * both domains are doing badly, we throttle heavily.
 224  */
 225 static void kyber_adjust_other_depth(struct kyber_queue_data *kqd,
 226                                      int read_status, int write_status,
 227                                      bool have_samples)
 228 {
 229         unsigned int orig_depth, depth;
 230         int status;
 231
 232         orig_depth = depth = kqd->domain_tokens[KYBER_OTHER].sb.depth;
 233
 234         if (read_status == NONE && write_status == NONE) {
 235                 depth += 2;
 236         } else if (have_samples) {
 237                 if (read_status == NONE)
 238                         status = write_status;
 239                 else if (write_status == NONE)
 240                         status = read_status;
 241                 else
 242                         status = max(read_status, write_status);
 243                 switch (status) {
 244                 case GREAT:
 245                         depth += 2;
 246                         break;
 247                 case GOOD:
 248                         depth++;
 249                         break;
 250                 case BAD:
 251                         depth -= max(depth / 4, 1U);
 252                         break;
 253                 case AWFUL:
 254                         depth /= 2;
 255                         break;
 256                 }
 257         }
 258
 259         depth = clamp(depth, 1U, kyber_depth[KYBER_OTHER]);
 260         if (depth != orig_depth)
 261                 sbitmap_queue_resize(&kqd->domain_tokens[KYBER_OTHER], depth);
 262 }
 263
 264 /*
 265  * Apply heuristics for limiting queue depths based on gathered latency
 266  * statistics.
 267  */
 268 static void kyber_stat_timer_fn(struct blk_stat_callback *cb)
 269 {
 270         struct kyber_queue_data *kqd = cb->data;
 271         int read_status, write_status;
 272
 273         read_status = kyber_lat_status(cb, KYBER_READ, kqd->read_lat_nsec);
 274         write_status = kyber_lat_status(cb, KYBER_SYNC_WRITE, kqd->write_lat_nsec);
 275
 276         kyber_adjust_rw_depth(kqd, KYBER_READ, read_status, write_status);
 277         kyber_adjust_rw_depth(kqd, KYBER_SYNC_WRITE, write_status, read_status);
 278         kyber_adjust_other_depth(kqd, read_status, write_status,
 279                                  cb->stat[KYBER_OTHER].nr_samples != 0);
 280
 281         /*
 282          * Continue monitoring latencies if we aren't hitting the targets or
 283          * we're still throttling other requests.
 284          */
 285         if (!blk_stat_is_active(kqd->cb) &&
 286             ((IS_BAD(read_status) || IS_BAD(write_status) ||
 287               kqd->domain_tokens[KYBER_OTHER].sb.depth < kyber_depth[KYBER_OTHER])))
 288                 blk_stat_activate_msecs(kqd->cb, 100);
 289 }
 290
 291 static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd)
 292 {
 293         /*
 294          * All of the hardware queues have the same depth, so we can just grab
 295          * the shift of the first one.
 296          */
 297         return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
 298 }
 299
 300 static int kyber_bucket_fn(const struct request *rq)
 301 {
 302         return kyber_sched_domain(rq->cmd_flags);
 303 }
 304
 305 static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
 306 {
 307         struct kyber_queue_data *kqd;
 308         unsigned int max_tokens;
 309         unsigned int shift;
 310         int ret = -ENOMEM;
 311         int i;
 312
 313         kqd = kmalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
 314         if (!kqd)
 315                 goto err;
 316         kqd->q = q;
 317
 318         kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, kyber_bucket_fn,
 319                                           KYBER_NUM_DOMAINS, kqd);
 320         if (!kqd->cb)
 321                 goto err_kqd;
 322
 323         /*
 324          * The maximum number of tokens for any scheduling domain is at least
 325          * the queue depth of a single hardware queue. If the hardware doesn't
 326          * have many tags, still provide a reasonable number.
 327          */
 328         max_tokens = max_t(unsigned int, q->tag_set->queue_depth,
 329                            KYBER_MIN_DEPTH);
 330         for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
 331                 WARN_ON(!kyber_depth[i]);
 332                 WARN_ON(!kyber_batch_size[i]);
 333                 ret = sbitmap_queue_init_node(&kqd->domain_tokens[i],
 334                                               max_tokens, -1, false, GFP_KERNEL,
 335                                               q->node);
 336                 if (ret) {
 337                         while (--i >= 0)
 338                                 sbitmap_queue_free(&kqd->domain_tokens[i]);
 339                         goto err_cb;
 340                 }
 341                 sbitmap_queue_resize(&kqd->domain_tokens[i], kyber_depth[i]);
 342         }
 343
 344         shift = kyber_sched_tags_shift(kqd);
 345         kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
 346
 347         kqd->read_lat_nsec = 2000000ULL;
 348         kqd->write_lat_nsec = 10000000ULL;
 349
 350         return kqd;
 351
 352 err_cb:
 353         blk_stat_free_callback(kqd->cb);
 354 err_kqd:
 355         kfree(kqd);
 356 err:
 357         return ERR_PTR(ret);
 358 }
 359
 360 static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
 361 {
 362         struct kyber_queue_data *kqd;
 363         struct elevator_queue *eq;
 364
 365         eq = elevator_alloc(q, e);
 366         if (!eq)
 367                 return -ENOMEM;
 368
 369         kqd = kyber_queue_data_alloc(q);
 370         if (IS_ERR(kqd)) {
 371                 kobject_put(&eq->kobj);
 372                 return PTR_ERR(kqd);
 373         }
 374
 375         eq->elevator_data = kqd;
 376         q->elevator = eq;
 377
 378         blk_stat_add_callback(q, kqd->cb);
 379
 380         return 0;
 381 }
 382
 383 static void kyber_exit_sched(struct elevator_queue *e)
 384 {
 385         struct kyber_queue_data *kqd = e->elevator_data;
 386         struct request_queue *q = kqd->q;
 387         int i;
 388
 389         blk_stat_remove_callback(q, kqd->cb);
 390
 391         for (i = 0; i < KYBER_NUM_DOMAINS; i++)
 392                 sbitmap_queue_free(&kqd->domain_tokens[i]);
 393         blk_stat_free_callback(kqd->cb);
 394         kfree(kqd);
 395 }
 396
 397 static void kyber_ctx_queue_init(struct kyber_ctx_queue *kcq)
 398 {
 399         unsigned int i;
 400
 401         spin_lock_init(&kcq->lock);
 402         for (i = 0; i < KYBER_NUM_DOMAINS; i++)
 403                 INIT_LIST_HEAD(&kcq->rq_list[i]);
 404 }
 405
 406 static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
 407 {
 408         struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
 409         struct kyber_hctx_data *khd;
 410         int i;
 411
 412         khd = kmalloc_node(sizeof(*khd), GFP_KERNEL, hctx->numa_node);
 413         if (!khd)
 414                 return -ENOMEM;
 415
 416         khd->kcqs = kmalloc_array_node(hctx->nr_ctx,
 417                                        sizeof(struct kyber_ctx_queue),
 418                                        GFP_KERNEL, hctx->numa_node);
 419         if (!khd->kcqs)
 420                 goto err_khd;
 421
 422         for (i = 0; i < hctx->nr_ctx; i++)
 423                 kyber_ctx_queue_init(&khd->kcqs[i]);
 424
 425         for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
 426                 if (sbitmap_init_node(&khd->kcq_map[i], hctx->nr_ctx,
 427                                       ilog2(8), GFP_KERNEL, hctx->numa_node)) {
 428                         while (--i >= 0)
 429                                 sbitmap_free(&khd->kcq_map[i]);
 430                         goto err_kcqs;
 431                 }
 432         }
 433
 434         spin_lock_init(&khd->lock);
 435
 436         for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
 437                 INIT_LIST_HEAD(&khd->rqs[i]);
 438                 init_waitqueue_func_entry(&khd->domain_wait[i],
 439                                           kyber_domain_wake);
 440                 khd->domain_wait[i].private = hctx;
 441                 INIT_LIST_HEAD(&khd->domain_wait[i].entry);
 442                 atomic_set(&khd->wait_index[i], 0);
 443         }
 444
 445         khd->cur_domain = 0;
 446         khd->batching = 0;
 447
 448         hctx->sched_data = khd;
 449         sbitmap_queue_min_shallow_depth(&hctx->sched_tags->bitmap_tags,
 450                                         kqd->async_depth);
 451
 452         return 0;
 453
 454 err_kcqs:
 455         kfree(khd->kcqs);
 456 err_khd:
 457         kfree(khd);
 458         return -ENOMEM;
 459 }
 460
 461 static void kyber_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
 462 {
 463         struct kyber_hctx_data *khd = hctx->sched_data;
 464         int i;
 465
 466         for (i = 0; i < KYBER_NUM_DOMAINS; i++)
 467                 sbitmap_free(&khd->kcq_map[i]);
 468         kfree(khd->kcqs);
 469         kfree(hctx->sched_data);
 470 }
 471
 472 static int rq_get_domain_token(struct request *rq)
 473 {
 474         return (long)rq->elv.priv[0];
 475 }
 476
 477 static void rq_set_domain_token(struct request *rq, int token)
 478 {
 479         rq->elv.priv[0] = (void *)(long)token;
 480 }
 481
 482 static void rq_clear_domain_token(struct kyber_queue_data *kqd,
 483                                   struct request *rq)
 484 {
 485         unsigned int sched_domain;
 486         int nr;
 487
 488         nr = rq_get_domain_token(rq);
 489         if (nr != -1) {
 490                 sched_domain = kyber_sched_domain(rq->cmd_flags);
 491                 sbitmap_queue_clear(&kqd->domain_tokens[sched_domain], nr,
 492                                     rq->mq_ctx->cpu);
 493         }
 494 }
 495
 496 static void kyber_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
 497 {
 498         /*
 499          * We use the scheduler tags as per-hardware queue queueing tokens.
 500          * Async requests can be limited at this stage.
 501          */
 502         if (!op_is_sync(op)) {
 503                 struct kyber_queue_data *kqd = data->q->elevator->elevator_data;
 504
 505                 data->shallow_depth = kqd->async_depth;
 506         }
 507 }
 508
 509 static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
 510 {
 511         struct kyber_hctx_data *khd = hctx->sched_data;
 512         struct blk_mq_ctx *ctx = blk_mq_get_ctx(hctx->queue);
 513         struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw];
 514         unsigned int sched_domain = kyber_sched_domain(bio->bi_opf);
 515         struct list_head *rq_list = &kcq->rq_list[sched_domain];
 516         bool merged;
 517
 518         spin_lock(&kcq->lock);
 519         merged = blk_mq_bio_list_merge(hctx->queue, rq_list, bio);
 520         spin_unlock(&kcq->lock);
 521         blk_mq_put_ctx(ctx);
 522
 523         return merged;
 524 }
 525
 526 static void kyber_prepare_request(struct request *rq, struct bio *bio)
 527 {
 528         rq_set_domain_token(rq, -1);
 529 }
 530
 531 static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx,
 532                                   struct list_head *rq_list, bool at_head)
 533 {
 534         struct kyber_hctx_data *khd = hctx->sched_data;
 535         struct request *rq, *next;
 536
 537         list_for_each_entry_safe(rq, next, rq_list, queuelist) {
 538                 unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags);
 539                 struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw];
 540                 struct list_head *head = &kcq->rq_list[sched_domain];
 541
 542                 spin_lock(&kcq->lock);
 543                 if (at_head)
 544                         list_move(&rq->queuelist, head);
 545                 else
 546                         list_move_tail(&rq->queuelist, head);
 547                 sbitmap_set_bit(&khd->kcq_map[sched_domain],
 548                                 rq->mq_ctx->index_hw);
 549                 blk_mq_sched_request_inserted(rq);
 550                 spin_unlock(&kcq->lock);
 551         }
 552 }
 553
 554 static void kyber_finish_request(struct request *rq)
 555 {
 556         struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;
 557
 558         rq_clear_domain_token(kqd, rq);
 559 }
 560
 561 static void kyber_completed_request(struct request *rq)
 562 {
 563         struct request_queue *q = rq->q;
 564         struct kyber_queue_data *kqd = q->elevator->elevator_data;
 565         unsigned int sched_domain;
 566         u64 now, latency, target;
 567
 568         /*
 569          * Check if this request met our latency goal. If not, quickly gather
 570          * some statistics and start throttling.
 571          */
 572         sched_domain = kyber_sched_domain(rq->cmd_flags);
 573         switch (sched_domain) {
 574         case KYBER_READ:
 575                 target = kqd->read_lat_nsec;
 576                 break;
 577         case KYBER_SYNC_WRITE:
 578                 target = kqd->write_lat_nsec;
 579                 break;
 580         default:
 581                 return;
 582         }
 583
 584         /* If we are already monitoring latencies, don't check again. */
 585         if (blk_stat_is_active(kqd->cb))
 586                 return;
 587
 588         now = ktime_get_ns();
 589         if (now < rq->io_start_time_ns)
 590                 return;
 591
 592         latency = now - rq->io_start_time_ns;
 593
 594         if (latency > target)
 595                 blk_stat_activate_msecs(kqd->cb, 10);
 596 }
 597
 598 struct flush_kcq_data {
 599         struct kyber_hctx_data *khd;
 600         unsigned int sched_domain;
 601         struct list_head *list;
 602 };
 603
 604 static bool flush_busy_kcq(struct sbitmap *sb, unsigned int bitnr, void *data)
 605 {
 606         struct flush_kcq_data *flush_data = data;
 607         struct kyber_ctx_queue *kcq = &flush_data->khd->kcqs[bitnr];
 608
 609         spin_lock(&kcq->lock);
 610         list_splice_tail_init(&kcq->rq_list[flush_data->sched_domain],
 611                               flush_data->list);
 612         sbitmap_clear_bit(sb, bitnr);
 613         spin_unlock(&kcq->lock);
 614
 615         return true;
 616 }
 617
 618 static void kyber_flush_busy_kcqs(struct kyber_hctx_data *khd,
 619                                   unsigned int sched_domain,
 620                                   struct list_head *list)
 621 {
 622         struct flush_kcq_data data = {
 623                 .khd = khd,
 624                 .sched_domain = sched_domain,
 625                 .list = list,
 626         };
 627
 628         sbitmap_for_each_set(&khd->kcq_map[sched_domain],
 629                              flush_busy_kcq, &data);
 630 }
 631
 632 static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
 633                              void *key)
 634 {
 635         struct blk_mq_hw_ctx *hctx = READ_ONCE(wait->private);
 636
 637         list_del_init(&wait->entry);
 638         blk_mq_run_hw_queue(hctx, true);
 639         return 1;
 640 }
 641
 642 static int kyber_get_domain_token(struct kyber_queue_data *kqd,
 643                                   struct kyber_hctx_data *khd,
 644                                   struct blk_mq_hw_ctx *hctx)
 645 {
 646         unsigned int sched_domain = khd->cur_domain;
 647         struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain];
 648         wait_queue_entry_t *wait = &khd->domain_wait[sched_domain];
 649         struct sbq_wait_state *ws;
 650         int nr;
 651
 652         nr = __sbitmap_queue_get(domain_tokens);
 653
 654         /*
 655          * If we failed to get a domain token, make sure the hardware queue is
 656          * run when one becomes available. Note that this is serialized on
 657          * khd->lock, but we still need to be careful about the waker.
 658          */
 659         if (nr < 0 && list_empty_careful(&wait->entry)) {
 660                 ws = sbq_wait_ptr(domain_tokens,
 661                                   &khd->wait_index[sched_domain]);
 662                 khd->domain_ws[sched_domain] = ws;
 663                 add_wait_queue(&ws->wait, wait);
 664
 665                 /*
 666                  * Try again in case a token was freed before we got on the wait
 667                  * queue.
 668                  */
 669                 nr = __sbitmap_queue_get(domain_tokens);
 670         }
 671
 672         /*
 673          * If we got a token while we were on the wait queue, remove ourselves
 674          * from the wait queue to ensure that all wake ups make forward
 675          * progress. It's possible that the waker already deleted the entry
 676          * between the !list_empty_careful() check and us grabbing the lock, but
 677          * list_del_init() is okay with that.
 678          */
 679         if (nr >= 0 && !list_empty_careful(&wait->entry)) {
 680                 ws = khd->domain_ws[sched_domain];
 681                 spin_lock_irq(&ws->wait.lock);
 682                 list_del_init(&wait->entry);
 683                 spin_unlock_irq(&ws->wait.lock);
 684         }
 685
 686         return nr;
 687 }
 688
 689 static struct request *
 690 kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
 691                           struct kyber_hctx_data *khd,
 692                           struct blk_mq_hw_ctx *hctx)
 693 {
 694         struct list_head *rqs;
 695         struct request *rq;
 696         int nr;
 697
 698         rqs = &khd->rqs[khd->cur_domain];
 699
 700         /*
 701          * If we already have a flushed request, then we just need to get a
 702          * token for it. Otherwise, if there are pending requests in the kcqs,
 703          * flush the kcqs, but only if we can get a token. If not, we should
 704          * leave the requests in the kcqs so that they can be merged. Note that
 705          * khd->lock serializes the flushes, so if we observed any bit set in
 706          * the kcq_map, we will always get a request.
 707          */
 708         rq = list_first_entry_or_null(rqs, struct request, queuelist);
 709         if (rq) {
 710                 nr = kyber_get_domain_token(kqd, khd, hctx);
 711                 if (nr >= 0) {
 712                         khd->batching++;
 713                         rq_set_domain_token(rq, nr);
 714                         list_del_init(&rq->queuelist);
 715                         return rq;
 716                 }
 717         } else if (sbitmap_any_bit_set(&khd->kcq_map[khd->cur_domain])) {
 718                 nr = kyber_get_domain_token(kqd, khd, hctx);
 719                 if (nr >= 0) {
 720                         kyber_flush_busy_kcqs(khd, khd->cur_domain, rqs);
 721                         rq = list_first_entry(rqs, struct request, queuelist);
 722                         khd->batching++;
 723                         rq_set_domain_token(rq, nr);
 724                         list_del_init(&rq->queuelist);
 725                         return rq;
 726                 }
 727         }
 728
 729         /* There were either no pending requests or no tokens. */
 730         return NULL;
 731 }
 732
 733 static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
 734 {
 735         struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
 736         struct kyber_hctx_data *khd = hctx->sched_data;
 737         struct request *rq;
 738         int i;
 739
 740         spin_lock(&khd->lock);
 741
 742         /*
 743          * First, if we are still entitled to batch, try to dispatch a request
 744          * from the batch.
 745          */
 746         if (khd->batching < kyber_batch_size[khd->cur_domain]) {
 747                 rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
 748                 if (rq)
 749                         goto out;
 750         }
 751
 752         /*
 753          * Either,
 754          * 1. We were no longer entitled to a batch.
 755          * 2. The domain we were batching didn't have any requests.
 756          * 3. The domain we were batching was out of tokens.
 757          *
 758          * Start another batch. Note that this wraps back around to the original
 759          * domain if no other domains have requests or tokens.
 760          */
 761         khd->batching = 0;
 762         for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
 763                 if (khd->cur_domain == KYBER_NUM_DOMAINS - 1)
 764                         khd->cur_domain = 0;
 765                 else
 766                         khd->cur_domain++;
 767
 768                 rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
 769                 if (rq)
 770                         goto out;
 771         }
 772
 773         rq = NULL;
 774 out:
 775         spin_unlock(&khd->lock);
 776         return rq;
 777 }
 778
 779 static bool kyber_has_work(struct blk_mq_hw_ctx *hctx)
 780 {
 781         struct kyber_hctx_data *khd = hctx->sched_data;
 782         int i;
 783
 784         for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
 785                 if (!list_empty_careful(&khd->rqs[i]) ||
 786                     sbitmap_any_bit_set(&khd->kcq_map[i]))
 787                         return true;
 788         }
 789
 790         return false;
 791 }
 792
 793 #define KYBER_LAT_SHOW_STORE(op)                                        \
 794 static ssize_t kyber_##op##_lat_show(struct elevator_queue *e,          \
 795                                      char *page)                        \
 796 {                                                                       \
 797         struct kyber_queue_data *kqd = e->elevator_data;                \
 798                                                                         \
 799         return sprintf(page, "%llu\n", kqd->op##_lat_nsec);             \
 800 }                                                                       \
 801                                                                         \
 802 static ssize_t kyber_##op##_lat_store(struct elevator_queue *e,         \
 803                                       const char *page, size_t count)   \
 804 {                                                                       \
 805         struct kyber_queue_data *kqd = e->elevator_data;                \
 806         unsigned long long nsec;                                        \
 807         int ret;                                                        \
 808                                                                         \
 809         ret = kstrtoull(page, 10, &nsec);                               \
 810         if (ret)                                                        \
 811                 return ret;                                             \
 812                                                                         \
 813         kqd->op##_lat_nsec = nsec;                                      \
 814                                                                         \
 815         return count;                                                   \
 816 }
 817 KYBER_LAT_SHOW_STORE(read);
 818 KYBER_LAT_SHOW_STORE(write);
 819 #undef KYBER_LAT_SHOW_STORE
 820
 821 #define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store)
 822 static struct elv_fs_entry kyber_sched_attrs[] = {
 823         KYBER_LAT_ATTR(read),
 824         KYBER_LAT_ATTR(write),
 825         __ATTR_NULL
 826 };
 827 #undef KYBER_LAT_ATTR
 828
 829 #ifdef CONFIG_BLK_DEBUG_FS
 830 #define KYBER_DEBUGFS_DOMAIN_ATTRS(domain, name)                        \
 831 static int kyber_##name##_tokens_show(void *data, struct seq_file *m)   \
 832 {                                                                       \
 833         struct request_queue *q = data;                                 \
 834         struct kyber_queue_data *kqd = q->elevator->elevator_data;      \
 835                                                                         \
 836         sbitmap_queue_show(&kqd->domain_tokens[domain], m);             \
 837         return 0;                                                       \
 838 }                                                                       \
 839                                                                         \
 840 static void *kyber_##name##_rqs_start(struct seq_file *m, loff_t *pos)  \
 841         __acquires(&khd->lock)                                          \
 842 {                                                                       \
 843         struct blk_mq_hw_ctx *hctx = m->private;                        \
 844         struct kyber_hctx_data *khd = hctx->sched_data;                 \
 845                                                                         \
 846         spin_lock(&khd->lock);                                          \
 847         return seq_list_start(&khd->rqs[domain], *pos);                 \
 848 }                                                                       \
 849                                                                         \
 850 static void *kyber_##name##_rqs_next(struct seq_file *m, void *v,       \
 851                                      loff_t *pos)                       \
 852 {                                                                       \
 853         struct blk_mq_hw_ctx *hctx = m->private;                        \
 854         struct kyber_hctx_data *khd = hctx->sched_data;                 \
 855                                                                         \
 856         return seq_list_next(v, &khd->rqs[domain], pos);                \
 857 }                                                                       \
 858                                                                         \
 859 static void kyber_##name##_rqs_stop(struct seq_file *m, void *v)        \
 860         __releases(&khd->lock)                                          \
 861 {                                                                       \
 862         struct blk_mq_hw_ctx *hctx = m->private;                        \
 863         struct kyber_hctx_data *khd = hctx->sched_data;                 \
 864                                                                         \
 865         spin_unlock(&khd->lock);                                        \
 866 }                                                                       \
 867                                                                         \
 868 static const struct seq_operations kyber_##name##_rqs_seq_ops = {       \
 869         .start  = kyber_##name##_rqs_start,                             \
 870         .next   = kyber_##name##_rqs_next,                              \
 871         .stop   = kyber_##name##_rqs_stop,                              \
 872         .show   = blk_mq_debugfs_rq_show,                               \
 873 };                                                                      \
 874                                                                         \
 875 static int kyber_##name##_waiting_show(void *data, struct seq_file *m)  \
 876 {                                                                       \
 877         struct blk_mq_hw_ctx *hctx = data;                              \
 878         struct kyber_hctx_data *khd = hctx->sched_data;                 \
 879         wait_queue_entry_t *wait = &khd->domain_wait[domain];           \
 880                                                                         \
 881         seq_printf(m, "%d\n", !list_empty_careful(&wait->entry));       \
 882         return 0;                                                       \
 883 }
 884 KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_READ, read)
 885 KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_SYNC_WRITE, sync_write)
 886 KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_OTHER, other)
 887 #undef KYBER_DEBUGFS_DOMAIN_ATTRS
 888
 889 static int kyber_async_depth_show(void *data, struct seq_file *m)
 890 {
 891         struct request_queue *q = data;
 892         struct kyber_queue_data *kqd = q->elevator->elevator_data;
 893
 894         seq_printf(m, "%u\n", kqd->async_depth);
 895         return 0;
 896 }
 897
 898 static int kyber_cur_domain_show(void *data, struct seq_file *m)
 899 {
 900         struct blk_mq_hw_ctx *hctx = data;
 901         struct kyber_hctx_data *khd = hctx->sched_data;
 902
 903         switch (khd->cur_domain) {
 904         case KYBER_READ:
 905                 seq_puts(m, "READ\n");
 906                 break;
 907         case KYBER_SYNC_WRITE:
 908                 seq_puts(m, "SYNC_WRITE\n");
 909                 break;
 910         case KYBER_OTHER:
 911                 seq_puts(m, "OTHER\n");
 912                 break;
 913         default:
 914                 seq_printf(m, "%u\n", khd->cur_domain);
 915                 break;
 916         }
 917         return 0;
 918 }
 919
 920 static int kyber_batching_show(void *data, struct seq_file *m)
 921 {
 922         struct blk_mq_hw_ctx *hctx = data;
 923         struct kyber_hctx_data *khd = hctx->sched_data;
 924
 925         seq_printf(m, "%u\n", khd->batching);
 926         return 0;
 927 }
 928
 929 #define KYBER_QUEUE_DOMAIN_ATTRS(name)  \
 930         {#name "_tokens", 0400, kyber_##name##_tokens_show}
 931 static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = {
 932         KYBER_QUEUE_DOMAIN_ATTRS(read),
 933         KYBER_QUEUE_DOMAIN_ATTRS(sync_write),
 934         KYBER_QUEUE_DOMAIN_ATTRS(other),
 935         {"async_depth", 0400, kyber_async_depth_show},
 936         {},
 937 };
 938 #undef KYBER_QUEUE_DOMAIN_ATTRS
 939
 940 #define KYBER_HCTX_DOMAIN_ATTRS(name)                                   \
 941         {#name "_rqs", 0400, .seq_ops = &kyber_##name##_rqs_seq_ops},   \
 942         {#name "_waiting", 0400, kyber_##name##_waiting_show}
 943 static const struct blk_mq_debugfs_attr kyber_hctx_debugfs_attrs[] = {
 944         KYBER_HCTX_DOMAIN_ATTRS(read),
 945         KYBER_HCTX_DOMAIN_ATTRS(sync_write),
 946         KYBER_HCTX_DOMAIN_ATTRS(other),
 947         {"cur_domain", 0400, kyber_cur_domain_show},
 948         {"batching", 0400, kyber_batching_show},
 949         {},
 950 };
 951 #undef KYBER_HCTX_DOMAIN_ATTRS
 952 #endif
 953
 954 static struct elevator_type kyber_sched = {
 955         .ops.mq = {
 956                 .init_sched = kyber_init_sched,
 957                 .exit_sched = kyber_exit_sched,
 958                 .init_hctx = kyber_init_hctx,
 959                 .exit_hctx = kyber_exit_hctx,
 960                 .limit_depth = kyber_limit_depth,
 961                 .bio_merge = kyber_bio_merge,
 962                 .prepare_request = kyber_prepare_request,
 963                 .insert_requests = kyber_insert_requests,
 964                 .finish_request = kyber_finish_request,
 965                 .requeue_request = kyber_finish_request,
 966                 .completed_request = kyber_completed_request,
 967                 .dispatch_request = kyber_dispatch_request,
 968                 .has_work = kyber_has_work,
 969         },
 970         .uses_mq = true,
 971 #ifdef CONFIG_BLK_DEBUG_FS
 972         .queue_debugfs_attrs = kyber_queue_debugfs_attrs,
 973         .hctx_debugfs_attrs = kyber_hctx_debugfs_attrs,
 974 #endif
 975         .elevator_attrs = kyber_sched_attrs,
 976         .elevator_name = "kyber",
 977         .elevator_owner = THIS_MODULE,
 978 };
 979
 980 static int __init kyber_init(void)
 981 {
 982         return elv_register(&kyber_sched);
 983 }
 984
 985 static void __exit kyber_exit(void)
 986 {
 987         elv_unregister(&kyber_sched);
 988 }
 989
 990 module_init(kyber_init);
 991 module_exit(kyber_exit);
 992
 993 MODULE_AUTHOR("Omar Sandoval");
 994 MODULE_LICENSE("GPL");
 995 MODULE_DESCRIPTION("Kyber I/O scheduler");