Merge tag 'selinux-pr-20180629' of git://git.kernel.org/pub/scm/linux/kernel/git...
[platform/kernel/linux-rpi.git] / block / kyber-iosched.c
1 /*
2  * The Kyber I/O scheduler. Controls latency by throttling queue depths using
3  * scalable techniques.
4  *
5  * Copyright (C) 2017 Facebook
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public
9  * License v2 as published by the Free Software Foundation.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program.  If not, see <https://www.gnu.org/licenses/>.
18  */
19
20 #include <linux/kernel.h>
21 #include <linux/blkdev.h>
22 #include <linux/blk-mq.h>
23 #include <linux/elevator.h>
24 #include <linux/module.h>
25 #include <linux/sbitmap.h>
26
27 #include "blk.h"
28 #include "blk-mq.h"
29 #include "blk-mq-debugfs.h"
30 #include "blk-mq-sched.h"
31 #include "blk-mq-tag.h"
32 #include "blk-stat.h"
33
/* Scheduling domains: each request is classified into exactly one. */
enum {
	KYBER_READ,
	KYBER_SYNC_WRITE,
	KYBER_OTHER, /* Async writes, discard, etc. */
	KYBER_NUM_DOMAINS, /* Number of domains, not a domain itself. */
};
41
enum {
	/*
	 * Lower bound on the device-wide number of scheduler tokens; used
	 * when the hardware queue depth is small.
	 */
	KYBER_MIN_DEPTH = 256,

	/*
	 * In order to prevent starvation of synchronous requests by a flood of
	 * asynchronous requests, we reserve 25% of requests for synchronous
	 * operations.
	 */
	KYBER_ASYNC_PERCENT = 75,
};
52
/*
 * Initial device-wide depths for each scheduling domain.
 *
 * Even for fast devices with lots of tags like NVMe, you can saturate
 * the device with only a fraction of the maximum possible queue depth.
 * So, we cap these to a reasonable value.
 *
 * These are also the upper bounds that the depth-adjustment heuristics
 * clamp to.
 */
static const unsigned int kyber_depth[] = {
	[KYBER_READ] = 256,
	[KYBER_SYNC_WRITE] = 128,
	[KYBER_OTHER] = 64,
};
65
/*
 * Scheduling domain batch sizes. We favor reads: up to this many requests
 * are dispatched from a domain before rotating to the next one.
 */
static const unsigned int kyber_batch_size[] = {
	[KYBER_READ] = 16,
	[KYBER_SYNC_WRITE] = 8,
	[KYBER_OTHER] = 8,
};
74
/*
 * There is a one-to-one mapping between ctx & hctx and between kcq & khd;
 * we use request->mq_ctx->index_hw to index the kcq in khd.
 */
struct kyber_ctx_queue {
	/*
	 * Used to ensure operations on rq_list and kcq_map are atomic.
	 * Also protects the rqs on rq_list during merge.
	 */
	spinlock_t lock;
	/* Pending requests staged per scheduling domain. */
	struct list_head rq_list[KYBER_NUM_DOMAINS];
} ____cacheline_aligned_in_smp;
87
/* Device-wide (per-request_queue) Kyber state. */
struct kyber_queue_data {
	struct request_queue *q;

	/* Latency statistics callback; fires kyber_stat_timer_fn. */
	struct blk_stat_callback *cb;

	/*
	 * The device is divided into multiple scheduling domains based on the
	 * request type. Each domain has a fixed number of in-flight requests of
	 * that type device-wide, limited by these tokens.
	 */
	struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];

	/*
	 * Async request percentage, converted to per-word depth for
	 * sbitmap_get_shallow().
	 */
	unsigned int async_depth;

	/* Target latencies in nanoseconds. */
	u64 read_lat_nsec, write_lat_nsec;
};
109
/* Per-hardware-queue Kyber state. */
struct kyber_hctx_data {
	/* Serializes dispatch: protects rqs, cur_domain, and batching. */
	spinlock_t lock;
	/* Requests already flushed from the kcqs, ready to dispatch. */
	struct list_head rqs[KYBER_NUM_DOMAINS];
	/* Domain we are currently dispatching a batch from. */
	unsigned int cur_domain;
	/* Requests dispatched so far in the current batch. */
	unsigned int batching;
	/* Per-software-queue staging areas, indexed by ctx->index_hw. */
	struct kyber_ctx_queue *kcqs;
	/* Which kcqs have pending requests, one bitmap per domain. */
	struct sbitmap kcq_map[KYBER_NUM_DOMAINS];
	/* Wait entries used to re-run the queue when a token frees up. */
	wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS];
	/* Wait queue each domain_wait entry was last added to. */
	struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS];
	atomic_t wait_index[KYBER_NUM_DOMAINS];
};
121
122 static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
123                              void *key);
124
125 static unsigned int kyber_sched_domain(unsigned int op)
126 {
127         if ((op & REQ_OP_MASK) == REQ_OP_READ)
128                 return KYBER_READ;
129         else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op))
130                 return KYBER_SYNC_WRITE;
131         else
132                 return KYBER_OTHER;
133 }
134
/*
 * Latency verdicts relative to a domain's target: positive is meeting the
 * target, negative is missing it, NONE means no samples were gathered.
 */
enum {
	NONE = 0,
	GOOD = 1,
	GREAT = 2,
	BAD = -1,
	AWFUL = -2,
};

/* A status is "good" when positive and "bad" when negative; NONE is neither. */
#define IS_GOOD(status) ((status) > 0)
#define IS_BAD(status) ((status) < 0)
145
146 static int kyber_lat_status(struct blk_stat_callback *cb,
147                             unsigned int sched_domain, u64 target)
148 {
149         u64 latency;
150
151         if (!cb->stat[sched_domain].nr_samples)
152                 return NONE;
153
154         latency = cb->stat[sched_domain].mean;
155         if (latency >= 2 * target)
156                 return AWFUL;
157         else if (latency > target)
158                 return BAD;
159         else if (latency <= target / 2)
160                 return GREAT;
161         else /* (latency <= target) */
162                 return GOOD;
163 }
164
/*
 * Adjust the read or synchronous write depth given the status of reads and
 * writes. The goal is that the latencies of the two domains are fair (i.e., if
 * one is good, then the other is good).
 */
static void kyber_adjust_rw_depth(struct kyber_queue_data *kqd,
				  unsigned int sched_domain, int this_status,
				  int other_status)
{
	unsigned int orig_depth, depth;

	/*
	 * If this domain had no samples, or reads and writes are both good or
	 * both bad, don't adjust the depth.
	 */
	if (this_status == NONE ||
	    (IS_GOOD(this_status) && IS_GOOD(other_status)) ||
	    (IS_BAD(this_status) && IS_BAD(other_status)))
		return;

	orig_depth = depth = kqd->domain_tokens[sched_domain].sb.depth;

	if (other_status == NONE) {
		/* The other domain is idle; this one can grow freely. */
		depth++;
	} else {
		/*
		 * This domain and the other disagree: throttle this one if
		 * it is doing well (GOOD/GREAT), grow it if it is doing
		 * badly (BAD/AWFUL). The step size scales with how far
		 * apart the two verdicts are.
		 */
		switch (this_status) {
		case GOOD:
			if (other_status == AWFUL)
				depth -= max(depth / 4, 1U);
			else
				depth -= max(depth / 8, 1U);
			break;
		case GREAT:
			if (other_status == AWFUL)
				depth /= 2;
			else
				depth -= max(depth / 4, 1U);
			break;
		case BAD:
			depth++;
			break;
		case AWFUL:
			if (other_status == GREAT)
				depth += 2;
			else
				depth++;
			break;
		}
	}

	/* Never go below one token or above the domain's initial depth. */
	depth = clamp(depth, 1U, kyber_depth[sched_domain]);
	if (depth != orig_depth)
		sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth);
}
219
/*
 * Adjust the depth of other requests given the status of reads and synchronous
 * writes. As long as either domain is doing fine, we don't throttle, but if
 * both domains are doing badly, we throttle heavily.
 */
static void kyber_adjust_other_depth(struct kyber_queue_data *kqd,
				     int read_status, int write_status,
				     bool have_samples)
{
	unsigned int orig_depth, depth;
	int status;

	orig_depth = depth = kqd->domain_tokens[KYBER_OTHER].sb.depth;

	if (read_status == NONE && write_status == NONE) {
		/* No sync activity at all: let async traffic grow. */
		depth += 2;
	} else if (have_samples) {
		/* Judge by the better of the two sync verdicts available. */
		if (read_status == NONE)
			status = write_status;
		else if (write_status == NONE)
			status = read_status;
		else
			status = max(read_status, write_status);
		switch (status) {
		case GREAT:
			depth += 2;
			break;
		case GOOD:
			depth++;
			break;
		case BAD:
			depth -= max(depth / 4, 1U);
			break;
		case AWFUL:
			depth /= 2;
			break;
		}
	}

	/* Never go below one token or above the domain's initial depth. */
	depth = clamp(depth, 1U, kyber_depth[KYBER_OTHER]);
	if (depth != orig_depth)
		sbitmap_queue_resize(&kqd->domain_tokens[KYBER_OTHER], depth);
}
263
/*
 * Apply heuristics for limiting queue depths based on gathered latency
 * statistics. Runs as the blk-stat callback timer function.
 */
static void kyber_stat_timer_fn(struct blk_stat_callback *cb)
{
	struct kyber_queue_data *kqd = cb->data;
	int read_status, write_status;

	read_status = kyber_lat_status(cb, KYBER_READ, kqd->read_lat_nsec);
	write_status = kyber_lat_status(cb, KYBER_SYNC_WRITE, kqd->write_lat_nsec);

	kyber_adjust_rw_depth(kqd, KYBER_READ, read_status, write_status);
	kyber_adjust_rw_depth(kqd, KYBER_SYNC_WRITE, write_status, read_status);
	kyber_adjust_other_depth(kqd, read_status, write_status,
				 cb->stat[KYBER_OTHER].nr_samples != 0);

	/*
	 * Continue monitoring latencies if we aren't hitting the targets or
	 * we're still throttling other requests.
	 */
	if (!blk_stat_is_active(kqd->cb) &&
	    ((IS_BAD(read_status) || IS_BAD(write_status) ||
	      kqd->domain_tokens[KYBER_OTHER].sb.depth < kyber_depth[KYBER_OTHER])))
		blk_stat_activate_msecs(kqd->cb, 100);
}
290
/* Per-word shift of the scheduler tag sbitmap, used to size async_depth. */
static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd)
{
	/*
	 * All of the hardware queues have the same depth, so we can just grab
	 * the shift of the first one.
	 */
	return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
}
299
/* blk-stat bucket function: bucket completions by scheduling domain. */
static int kyber_bucket_fn(const struct request *rq)
{
	return kyber_sched_domain(rq->cmd_flags);
}
304
/*
 * Allocate and initialize the device-wide scheduler data: the latency
 * stat callback, one token sbitmap per domain, the async depth, and the
 * default latency targets. Returns an ERR_PTR() on failure.
 */
static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
{
	struct kyber_queue_data *kqd;
	unsigned int max_tokens;
	unsigned int shift;
	int ret = -ENOMEM;
	int i;

	kqd = kmalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
	if (!kqd)
		goto err;
	kqd->q = q;

	/* One latency bucket per domain; fires kyber_stat_timer_fn. */
	kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, kyber_bucket_fn,
					  KYBER_NUM_DOMAINS, kqd);
	if (!kqd->cb)
		goto err_kqd;

	/*
	 * The maximum number of tokens for any scheduling domain is at least
	 * the queue depth of a single hardware queue. If the hardware doesn't
	 * have many tags, still provide a reasonable number.
	 */
	max_tokens = max_t(unsigned int, q->tag_set->queue_depth,
			   KYBER_MIN_DEPTH);
	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
		WARN_ON(!kyber_depth[i]);
		WARN_ON(!kyber_batch_size[i]);
		ret = sbitmap_queue_init_node(&kqd->domain_tokens[i],
					      max_tokens, -1, false, GFP_KERNEL,
					      q->node);
		if (ret) {
			/* Unwind the domains initialized so far. */
			while (--i >= 0)
				sbitmap_queue_free(&kqd->domain_tokens[i]);
			goto err_cb;
		}
		/* Allocated at max_tokens, but start at the initial depth. */
		sbitmap_queue_resize(&kqd->domain_tokens[i], kyber_depth[i]);
	}

	shift = kyber_sched_tags_shift(kqd);
	kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;

	/* Default targets: 2 ms for reads, 10 ms for synchronous writes. */
	kqd->read_lat_nsec = 2000000ULL;
	kqd->write_lat_nsec = 10000000ULL;

	return kqd;

err_cb:
	blk_stat_free_callback(kqd->cb);
err_kqd:
	kfree(kqd);
err:
	return ERR_PTR(ret);
}
359
/* Elevator init: attach Kyber to a request queue. Returns 0 or -errno. */
static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
{
	struct kyber_queue_data *kqd;
	struct elevator_queue *eq;

	eq = elevator_alloc(q, e);
	if (!eq)
		return -ENOMEM;

	kqd = kyber_queue_data_alloc(q);
	if (IS_ERR(kqd)) {
		/* Drop the elevator reference taken by elevator_alloc(). */
		kobject_put(&eq->kobj);
		return PTR_ERR(kqd);
	}

	eq->elevator_data = kqd;
	q->elevator = eq;

	/* Start collecting per-domain latency statistics. */
	blk_stat_add_callback(q, kqd->cb);

	return 0;
}
382
/* Elevator exit: undo kyber_init_sched() and free all device-wide state. */
static void kyber_exit_sched(struct elevator_queue *e)
{
	struct kyber_queue_data *kqd = e->elevator_data;
	struct request_queue *q = kqd->q;
	int i;

	/* Detach the stat callback before tearing anything down. */
	blk_stat_remove_callback(q, kqd->cb);

	for (i = 0; i < KYBER_NUM_DOMAINS; i++)
		sbitmap_queue_free(&kqd->domain_tokens[i]);
	blk_stat_free_callback(kqd->cb);
	kfree(kqd);
}
396
397 static void kyber_ctx_queue_init(struct kyber_ctx_queue *kcq)
398 {
399         unsigned int i;
400
401         spin_lock_init(&kcq->lock);
402         for (i = 0; i < KYBER_NUM_DOMAINS; i++)
403                 INIT_LIST_HEAD(&kcq->rq_list[i]);
404 }
405
/*
 * Per-hardware-queue init: allocate the khd, one kcq per software queue,
 * the per-domain kcq bitmaps, and the domain wait entries. On failure,
 * everything allocated so far is unwound. Returns 0 or -ENOMEM.
 */
static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
	struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
	struct kyber_hctx_data *khd;
	int i;

	khd = kmalloc_node(sizeof(*khd), GFP_KERNEL, hctx->numa_node);
	if (!khd)
		return -ENOMEM;

	/* One staging queue per software queue mapped to this hctx. */
	khd->kcqs = kmalloc_array_node(hctx->nr_ctx,
				       sizeof(struct kyber_ctx_queue),
				       GFP_KERNEL, hctx->numa_node);
	if (!khd->kcqs)
		goto err_khd;

	for (i = 0; i < hctx->nr_ctx; i++)
		kyber_ctx_queue_init(&khd->kcqs[i]);

	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
		if (sbitmap_init_node(&khd->kcq_map[i], hctx->nr_ctx,
				      ilog2(8), GFP_KERNEL, hctx->numa_node)) {
			/* Unwind the bitmaps initialized so far. */
			while (--i >= 0)
				sbitmap_free(&khd->kcq_map[i]);
			goto err_kcqs;
		}
	}

	spin_lock_init(&khd->lock);

	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
		INIT_LIST_HEAD(&khd->rqs[i]);
		/* Woken by sbitmap_queue when a domain token is freed. */
		init_waitqueue_func_entry(&khd->domain_wait[i],
					  kyber_domain_wake);
		khd->domain_wait[i].private = hctx;
		INIT_LIST_HEAD(&khd->domain_wait[i].entry);
		atomic_set(&khd->wait_index[i], 0);
	}

	khd->cur_domain = 0;
	khd->batching = 0;

	hctx->sched_data = khd;
	/* Make sure shallow async allocations can always make progress. */
	sbitmap_queue_min_shallow_depth(&hctx->sched_tags->bitmap_tags,
					kqd->async_depth);

	return 0;

err_kcqs:
	kfree(khd->kcqs);
err_khd:
	kfree(khd);
	return -ENOMEM;
}
460
461 static void kyber_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
462 {
463         struct kyber_hctx_data *khd = hctx->sched_data;
464         int i;
465
466         for (i = 0; i < KYBER_NUM_DOMAINS; i++)
467                 sbitmap_free(&khd->kcq_map[i]);
468         kfree(khd->kcqs);
469         kfree(hctx->sched_data);
470 }
471
/* Read the domain token stashed in the request; -1 means "no token". */
static int rq_get_domain_token(struct request *rq)
{
	return (long)rq->elv.priv[0];
}
476
/* Stash a domain token (or -1) in the request's elevator private data. */
static void rq_set_domain_token(struct request *rq, int token)
{
	rq->elv.priv[0] = (void *)(long)token;
}
481
/* Return the request's domain token to its sbitmap, if it holds one. */
static void rq_clear_domain_token(struct kyber_queue_data *kqd,
				  struct request *rq)
{
	unsigned int sched_domain;
	int nr;

	nr = rq_get_domain_token(rq);
	if (nr != -1) {
		sched_domain = kyber_sched_domain(rq->cmd_flags);
		sbitmap_queue_clear(&kqd->domain_tokens[sched_domain], nr,
				    rq->mq_ctx->cpu);
	}
}
495
/* Cap the scheduler tag depth for async requests to reserve tags for sync. */
static void kyber_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
{
	/*
	 * We use the scheduler tags as per-hardware queue queueing tokens.
	 * Async requests can be limited at this stage.
	 */
	if (!op_is_sync(op)) {
		struct kyber_queue_data *kqd = data->q->elevator->elevator_data;

		data->shallow_depth = kqd->async_depth;
	}
}
508
/*
 * Try to merge a bio into a request already staged on this CPU's kcq for
 * the bio's scheduling domain. Returns true if the bio was merged.
 */
static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
{
	struct kyber_hctx_data *khd = hctx->sched_data;
	struct blk_mq_ctx *ctx = blk_mq_get_ctx(hctx->queue);
	struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw];
	unsigned int sched_domain = kyber_sched_domain(bio->bi_opf);
	struct list_head *rq_list = &kcq->rq_list[sched_domain];
	bool merged;

	/* kcq->lock protects the staged requests while we probe them. */
	spin_lock(&kcq->lock);
	merged = blk_mq_bio_list_merge(hctx->queue, rq_list, bio);
	spin_unlock(&kcq->lock);
	/* Pairs with blk_mq_get_ctx() above. */
	blk_mq_put_ctx(ctx);

	return merged;
}
525
/* New request: mark it as holding no domain token yet. */
static void kyber_prepare_request(struct request *rq, struct bio *bio)
{
	rq_set_domain_token(rq, -1);
}
530
/*
 * Insert requests into the per-ctx staging queues (kcqs), marking each
 * kcq as busy in the per-domain bitmap so dispatch knows to flush it.
 */
static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx,
				  struct list_head *rq_list, bool at_head)
{
	struct kyber_hctx_data *khd = hctx->sched_data;
	struct request *rq, *next;

	list_for_each_entry_safe(rq, next, rq_list, queuelist) {
		unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags);
		struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw];
		struct list_head *head = &kcq->rq_list[sched_domain];

		spin_lock(&kcq->lock);
		if (at_head)
			list_move(&rq->queuelist, head);
		else
			list_move_tail(&rq->queuelist, head);
		/* Record that this kcq has pending work for the domain. */
		sbitmap_set_bit(&khd->kcq_map[sched_domain],
				rq->mq_ctx->index_hw);
		blk_mq_sched_request_inserted(rq);
		spin_unlock(&kcq->lock);
	}
}
553
/* Request is done with the scheduler: return its domain token, if any. */
static void kyber_finish_request(struct request *rq)
{
	struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;

	rq_clear_domain_token(kqd, rq);
}
560
/*
 * Completion hook: if a read or sync write missed its latency target,
 * kick off statistics gathering so the depth heuristics can react.
 */
static void kyber_completed_request(struct request *rq)
{
	struct request_queue *q = rq->q;
	struct kyber_queue_data *kqd = q->elevator->elevator_data;
	unsigned int sched_domain;
	u64 now, latency, target;

	/*
	 * Check if this request met our latency goal. If not, quickly gather
	 * some statistics and start throttling.
	 */
	sched_domain = kyber_sched_domain(rq->cmd_flags);
	switch (sched_domain) {
	case KYBER_READ:
		target = kqd->read_lat_nsec;
		break;
	case KYBER_SYNC_WRITE:
		target = kqd->write_lat_nsec;
		break;
	default:
		/* KYBER_OTHER has no latency target. */
		return;
	}

	/* If we are already monitoring latencies, don't check again. */
	if (blk_stat_is_active(kqd->cb))
		return;

	now = ktime_get_ns();
	/* Guard against clock skew making the elapsed time go negative. */
	if (now < rq->io_start_time_ns)
		return;

	latency = now - rq->io_start_time_ns;

	if (latency > target)
		blk_stat_activate_msecs(kqd->cb, 10);
}
597
/* Context passed through sbitmap_for_each_set() to flush_busy_kcq(). */
struct flush_kcq_data {
	struct kyber_hctx_data *khd;
	unsigned int sched_domain;
	/* Destination list the flushed requests are spliced onto. */
	struct list_head *list;
};
603
/*
 * sbitmap callback: splice one busy kcq's requests for a domain onto the
 * destination list and clear its busy bit. Always returns true so the
 * iteration continues over all set bits.
 */
static bool flush_busy_kcq(struct sbitmap *sb, unsigned int bitnr, void *data)
{
	struct flush_kcq_data *flush_data = data;
	struct kyber_ctx_queue *kcq = &flush_data->khd->kcqs[bitnr];

	spin_lock(&kcq->lock);
	list_splice_tail_init(&kcq->rq_list[flush_data->sched_domain],
			      flush_data->list);
	sbitmap_clear_bit(sb, bitnr);
	spin_unlock(&kcq->lock);

	return true;
}
617
618 static void kyber_flush_busy_kcqs(struct kyber_hctx_data *khd,
619                                   unsigned int sched_domain,
620                                   struct list_head *list)
621 {
622         struct flush_kcq_data data = {
623                 .khd = khd,
624                 .sched_domain = sched_domain,
625                 .list = list,
626         };
627
628         sbitmap_for_each_set(&khd->kcq_map[sched_domain],
629                              flush_busy_kcq, &data);
630 }
631
/*
 * Wait-queue wake function: a domain token was freed, so remove the wait
 * entry and re-run the hardware queue. Returns 1 (entry handled).
 */
static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
			     void *key)
{
	struct blk_mq_hw_ctx *hctx = READ_ONCE(wait->private);

	list_del_init(&wait->entry);
	blk_mq_run_hw_queue(hctx, true);
	return 1;
}
641
/*
 * Try to grab a token for the hctx's current scheduling domain. On
 * failure, arm a wait entry so the queue is re-run when a token frees
 * up, then retry once. Returns the token number or a negative value.
 */
static int kyber_get_domain_token(struct kyber_queue_data *kqd,
				  struct kyber_hctx_data *khd,
				  struct blk_mq_hw_ctx *hctx)
{
	unsigned int sched_domain = khd->cur_domain;
	struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain];
	wait_queue_entry_t *wait = &khd->domain_wait[sched_domain];
	struct sbq_wait_state *ws;
	int nr;

	nr = __sbitmap_queue_get(domain_tokens);

	/*
	 * If we failed to get a domain token, make sure the hardware queue is
	 * run when one becomes available. Note that this is serialized on
	 * khd->lock, but we still need to be careful about the waker.
	 */
	if (nr < 0 && list_empty_careful(&wait->entry)) {
		ws = sbq_wait_ptr(domain_tokens,
				  &khd->wait_index[sched_domain]);
		/* Remember which wait queue we joined, for removal below. */
		khd->domain_ws[sched_domain] = ws;
		add_wait_queue(&ws->wait, wait);

		/*
		 * Try again in case a token was freed before we got on the wait
		 * queue.
		 */
		nr = __sbitmap_queue_get(domain_tokens);
	}

	/*
	 * If we got a token while we were on the wait queue, remove ourselves
	 * from the wait queue to ensure that all wake ups make forward
	 * progress. It's possible that the waker already deleted the entry
	 * between the !list_empty_careful() check and us grabbing the lock, but
	 * list_del_init() is okay with that.
	 */
	if (nr >= 0 && !list_empty_careful(&wait->entry)) {
		ws = khd->domain_ws[sched_domain];
		spin_lock_irq(&ws->wait.lock);
		list_del_init(&wait->entry);
		spin_unlock_irq(&ws->wait.lock);
	}

	return nr;
}
688
/*
 * Dispatch one request from the current domain: either a previously
 * flushed request from khd->rqs, or (after flushing) one freshly pulled
 * from the kcqs. A domain token must be acquired first; returns NULL if
 * there is no pending request or no token. Caller holds khd->lock.
 */
static struct request *
kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
			  struct kyber_hctx_data *khd,
			  struct blk_mq_hw_ctx *hctx)
{
	struct list_head *rqs;
	struct request *rq;
	int nr;

	rqs = &khd->rqs[khd->cur_domain];

	/*
	 * If we already have a flushed request, then we just need to get a
	 * token for it. Otherwise, if there are pending requests in the kcqs,
	 * flush the kcqs, but only if we can get a token. If not, we should
	 * leave the requests in the kcqs so that they can be merged. Note that
	 * khd->lock serializes the flushes, so if we observed any bit set in
	 * the kcq_map, we will always get a request.
	 */
	rq = list_first_entry_or_null(rqs, struct request, queuelist);
	if (rq) {
		nr = kyber_get_domain_token(kqd, khd, hctx);
		if (nr >= 0) {
			khd->batching++;
			rq_set_domain_token(rq, nr);
			list_del_init(&rq->queuelist);
			return rq;
		}
	} else if (sbitmap_any_bit_set(&khd->kcq_map[khd->cur_domain])) {
		nr = kyber_get_domain_token(kqd, khd, hctx);
		if (nr >= 0) {
			kyber_flush_busy_kcqs(khd, khd->cur_domain, rqs);
			rq = list_first_entry(rqs, struct request, queuelist);
			khd->batching++;
			rq_set_domain_token(rq, nr);
			list_del_init(&rq->queuelist);
			return rq;
		}
	}

	/* There were either no pending requests or no tokens. */
	return NULL;
}
732
/*
 * Main dispatch entry point: finish the current domain's batch if still
 * entitled, otherwise rotate through the domains starting a new batch.
 * Returns the next request to dispatch, or NULL if none is available.
 */
static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
	struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
	struct kyber_hctx_data *khd = hctx->sched_data;
	struct request *rq;
	int i;

	spin_lock(&khd->lock);

	/*
	 * First, if we are still entitled to batch, try to dispatch a request
	 * from the batch.
	 */
	if (khd->batching < kyber_batch_size[khd->cur_domain]) {
		rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
		if (rq)
			goto out;
	}

	/*
	 * Either,
	 * 1. We were no longer entitled to a batch.
	 * 2. The domain we were batching didn't have any requests.
	 * 3. The domain we were batching was out of tokens.
	 *
	 * Start another batch. Note that this wraps back around to the original
	 * domain if no other domains have requests or tokens.
	 */
	khd->batching = 0;
	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
		/* Advance to the next domain, wrapping around. */
		if (khd->cur_domain == KYBER_NUM_DOMAINS - 1)
			khd->cur_domain = 0;
		else
			khd->cur_domain++;

		rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
		if (rq)
			goto out;
	}

	rq = NULL;
out:
	spin_unlock(&khd->lock);
	return rq;
}
778
779 static bool kyber_has_work(struct blk_mq_hw_ctx *hctx)
780 {
781         struct kyber_hctx_data *khd = hctx->sched_data;
782         int i;
783
784         for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
785                 if (!list_empty_careful(&khd->rqs[i]) ||
786                     sbitmap_any_bit_set(&khd->kcq_map[i]))
787                         return true;
788         }
789
790         return false;
791 }
792
/*
 * Generate the sysfs show/store pair for the read_lat_nsec and
 * write_lat_nsec tunables. The store parses an unsigned decimal
 * nanosecond value; the new target is read by subsequent latency checks.
 */
#define KYBER_LAT_SHOW_STORE(op)                                        \
static ssize_t kyber_##op##_lat_show(struct elevator_queue *e,          \
                                     char *page)                        \
{                                                                       \
        struct kyber_queue_data *kqd = e->elevator_data;                \
                                                                        \
        return sprintf(page, "%llu\n", kqd->op##_lat_nsec);             \
}                                                                       \
                                                                        \
static ssize_t kyber_##op##_lat_store(struct elevator_queue *e,         \
                                      const char *page, size_t count)   \
{                                                                       \
        struct kyber_queue_data *kqd = e->elevator_data;                \
        unsigned long long nsec;                                        \
        int ret;                                                        \
                                                                        \
        ret = kstrtoull(page, 10, &nsec);                               \
        if (ret)                                                        \
                return ret;                                             \
                                                                        \
        kqd->op##_lat_nsec = nsec;                                      \
                                                                        \
        return count;                                                   \
}
KYBER_LAT_SHOW_STORE(read);
KYBER_LAT_SHOW_STORE(write);
#undef KYBER_LAT_SHOW_STORE
820
/* Per-op latency sysfs attributes: world-readable, owner-writable (0644). */
#define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store)
static struct elv_fs_entry kyber_sched_attrs[] = {
	KYBER_LAT_ATTR(read),
	KYBER_LAT_ATTR(write),
	__ATTR_NULL
};
#undef KYBER_LAT_ATTR
828
829 #ifdef CONFIG_BLK_DEBUG_FS
/*
 * Generate the debugfs plumbing for one scheduling domain:
 *
 *  - kyber_<name>_tokens_show: queue-level; dumps the state of the domain's
 *    token sbitmap_queue (kqd->domain_tokens[domain]).
 *  - kyber_<name>_rqs_{start,next,stop} + seq_ops: hctx-level; iterate the
 *    per-hctx list of requests pending for this domain (khd->rqs[domain]).
 *    khd->lock is taken in start and dropped in stop, so it is held across
 *    the whole seq_file traversal (hence the __acquires/__releases
 *    annotations for sparse).
 *  - kyber_<name>_waiting_show: hctx-level; prints 1 if this hctx currently
 *    has its wait entry queued on the domain's token waitqueue, else 0.
 *
 * Instantiated once per domain below; the macro is #undef'd afterwards.
 */
#define KYBER_DEBUGFS_DOMAIN_ATTRS(domain, name)                        \
static int kyber_##name##_tokens_show(void *data, struct seq_file *m)   \
{                                                                       \
	struct request_queue *q = data;                                 \
	struct kyber_queue_data *kqd = q->elevator->elevator_data;      \
									\
	sbitmap_queue_show(&kqd->domain_tokens[domain], m);             \
	return 0;                                                       \
}                                                                       \
									\
static void *kyber_##name##_rqs_start(struct seq_file *m, loff_t *pos)  \
	__acquires(&khd->lock)                                          \
{                                                                       \
	struct blk_mq_hw_ctx *hctx = m->private;                        \
	struct kyber_hctx_data *khd = hctx->sched_data;                 \
									\
	spin_lock(&khd->lock);                                          \
	return seq_list_start(&khd->rqs[domain], *pos);                 \
}                                                                       \
									\
static void *kyber_##name##_rqs_next(struct seq_file *m, void *v,       \
				     loff_t *pos)                       \
{                                                                       \
	struct blk_mq_hw_ctx *hctx = m->private;                        \
	struct kyber_hctx_data *khd = hctx->sched_data;                 \
									\
	return seq_list_next(v, &khd->rqs[domain], pos);                \
}                                                                       \
									\
static void kyber_##name##_rqs_stop(struct seq_file *m, void *v)        \
	__releases(&khd->lock)                                          \
{                                                                       \
	struct blk_mq_hw_ctx *hctx = m->private;                        \
	struct kyber_hctx_data *khd = hctx->sched_data;                 \
									\
	spin_unlock(&khd->lock);                                        \
}                                                                       \
									\
static const struct seq_operations kyber_##name##_rqs_seq_ops = {       \
	.start	= kyber_##name##_rqs_start,                             \
	.next	= kyber_##name##_rqs_next,                              \
	.stop	= kyber_##name##_rqs_stop,                              \
	.show	= blk_mq_debugfs_rq_show,                               \
};                                                                      \
									\
static int kyber_##name##_waiting_show(void *data, struct seq_file *m)  \
{                                                                       \
	struct blk_mq_hw_ctx *hctx = data;                              \
	struct kyber_hctx_data *khd = hctx->sched_data;                 \
	wait_queue_entry_t *wait = &khd->domain_wait[domain];           \
									\
	seq_printf(m, "%d\n", !list_empty_careful(&wait->entry));       \
	return 0;                                                       \
}
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_READ, read)
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_SYNC_WRITE, sync_write)
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_OTHER, other)
#undef KYBER_DEBUGFS_DOMAIN_ATTRS
888
889 static int kyber_async_depth_show(void *data, struct seq_file *m)
890 {
891         struct request_queue *q = data;
892         struct kyber_queue_data *kqd = q->elevator->elevator_data;
893
894         seq_printf(m, "%u\n", kqd->async_depth);
895         return 0;
896 }
897
898 static int kyber_cur_domain_show(void *data, struct seq_file *m)
899 {
900         struct blk_mq_hw_ctx *hctx = data;
901         struct kyber_hctx_data *khd = hctx->sched_data;
902
903         switch (khd->cur_domain) {
904         case KYBER_READ:
905                 seq_puts(m, "READ\n");
906                 break;
907         case KYBER_SYNC_WRITE:
908                 seq_puts(m, "SYNC_WRITE\n");
909                 break;
910         case KYBER_OTHER:
911                 seq_puts(m, "OTHER\n");
912                 break;
913         default:
914                 seq_printf(m, "%u\n", khd->cur_domain);
915                 break;
916         }
917         return 0;
918 }
919
920 static int kyber_batching_show(void *data, struct seq_file *m)
921 {
922         struct blk_mq_hw_ctx *hctx = data;
923         struct kyber_hctx_data *khd = hctx->sched_data;
924
925         seq_printf(m, "%u\n", khd->batching);
926         return 0;
927 }
928
/* Expand to one read-only "<name>_tokens" entry showing the domain's token pool. */
#define KYBER_QUEUE_DOMAIN_ATTRS(name)	\
	{#name "_tokens", 0400, kyber_##name##_tokens_show}
/* Per-request_queue debugfs attributes. */
static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = {
	KYBER_QUEUE_DOMAIN_ATTRS(read),
	KYBER_QUEUE_DOMAIN_ATTRS(sync_write),
	KYBER_QUEUE_DOMAIN_ATTRS(other),
	{"async_depth", 0400, kyber_async_depth_show},
	{},	/* table terminator */
};
#undef KYBER_QUEUE_DOMAIN_ATTRS

/* Expand to the "<name>_rqs" seq_file entry and the "<name>_waiting" flag. */
#define KYBER_HCTX_DOMAIN_ATTRS(name)					\
	{#name "_rqs", 0400, .seq_ops = &kyber_##name##_rqs_seq_ops},	\
	{#name "_waiting", 0400, kyber_##name##_waiting_show}
/* Per-hardware-context debugfs attributes. */
static const struct blk_mq_debugfs_attr kyber_hctx_debugfs_attrs[] = {
	KYBER_HCTX_DOMAIN_ATTRS(read),
	KYBER_HCTX_DOMAIN_ATTRS(sync_write),
	KYBER_HCTX_DOMAIN_ATTRS(other),
	{"cur_domain", 0400, kyber_cur_domain_show},
	{"batching", 0400, kyber_batching_show},
	{},	/* table terminator */
};
#undef KYBER_HCTX_DOMAIN_ATTRS
952 #endif
953
/*
 * Elevator descriptor registered with the block layer: wires up Kyber's
 * blk-mq scheduler callbacks, sysfs tunables, and (when enabled) debugfs
 * attributes under the elevator name "kyber".
 */
static struct elevator_type kyber_sched = {
	.ops.mq = {
		.init_sched = kyber_init_sched,
		.exit_sched = kyber_exit_sched,
		.init_hctx = kyber_init_hctx,
		.exit_hctx = kyber_exit_hctx,
		.limit_depth = kyber_limit_depth,
		.bio_merge = kyber_bio_merge,
		.prepare_request = kyber_prepare_request,
		.insert_requests = kyber_insert_requests,
		.finish_request = kyber_finish_request,
		/* a requeue is handled exactly like a finished request */
		.requeue_request = kyber_finish_request,
		.completed_request = kyber_completed_request,
		.dispatch_request = kyber_dispatch_request,
		.has_work = kyber_has_work,
	},
	.uses_mq = true,
#ifdef CONFIG_BLK_DEBUG_FS
	.queue_debugfs_attrs = kyber_queue_debugfs_attrs,
	.hctx_debugfs_attrs = kyber_hctx_debugfs_attrs,
#endif
	.elevator_attrs = kyber_sched_attrs,
	.elevator_name = "kyber",
	.elevator_owner = THIS_MODULE,
};
979
/* Module init: register the Kyber elevator with the block layer. */
static int __init kyber_init(void)
{
	return elv_register(&kyber_sched);
}
984
/* Module exit: unregister the Kyber elevator. */
static void __exit kyber_exit(void)
{
	elv_unregister(&kyber_sched);
}
989
module_init(kyber_init);
module_exit(kyber_exit);

MODULE_AUTHOR("Omar Sandoval");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Kyber I/O scheduler");