block/mq-deadline: add I/O priority support, per-priority statistics and an async_depth limit
[platform/kernel/linux-starfive.git] block/mq-deadline.c
index 8eea2cb..7f3c393 100644 (file)
@@ -35,41 +35,122 @@ static const int writes_starved = 2;    /* max times reads can starve a write */
 static const int fifo_batch = 16;       /* # of sequential requests treated as one
                                     by the above parameters. For throughput. */
 
+enum dd_data_dir {
+       DD_READ         = READ,
+       DD_WRITE        = WRITE,
+};
+
+enum { DD_DIR_COUNT = 2 };
+
+enum dd_prio {
+       DD_RT_PRIO      = 0,
+       DD_BE_PRIO      = 1,
+       DD_IDLE_PRIO    = 2,
+       DD_PRIO_MAX     = 2,
+};
+
+enum { DD_PRIO_COUNT = 3 };
+
+/* I/O statistics per I/O priority. */
+struct io_stats_per_prio {
+       local_t inserted;
+       local_t merged;
+       local_t dispatched;
+       local_t completed;
+};
+
+/* I/O statistics for all I/O priorities (enum dd_prio). */
+struct io_stats {
+       struct io_stats_per_prio stats[DD_PRIO_COUNT];
+};
+
+/*
+ * Deadline scheduler data per I/O priority (enum dd_prio). Requests are
+ * present on both sort_list[] and fifo_list[].
+ */
+struct dd_per_prio {
+       struct list_head dispatch;
+       struct rb_root sort_list[DD_DIR_COUNT];
+       struct list_head fifo_list[DD_DIR_COUNT];
+       /* Next request in sort order. Read, write or both are NULL. */
+       struct request *next_rq[DD_DIR_COUNT];
+};
+
 struct deadline_data {
        /*
         * run time data
         */
 
-       /*
-        * requests (deadline_rq s) are present on both sort_list and fifo_list
-        */
-       struct rb_root sort_list[2];
-       struct list_head fifo_list[2];
+       struct dd_per_prio per_prio[DD_PRIO_COUNT];
 
-       /*
-        * next in sort order. read, write or both are NULL
-        */
-       struct request *next_rq[2];
+       /* Data direction of latest dispatched request. */
+       enum dd_data_dir last_dir;
        unsigned int batching;          /* number of sequential requests made */
        unsigned int starved;           /* times reads have starved writes */
 
+       struct io_stats __percpu *stats;
+
        /*
         * settings that change how the i/o scheduler behaves
         */
-       int fifo_expire[2];
+       int fifo_expire[DD_DIR_COUNT];
        int fifo_batch;
        int writes_starved;
        int front_merges;
+       u32 async_depth;
 
        spinlock_t lock;
        spinlock_t zone_lock;
-       struct list_head dispatch;
+};
+
+/* Count one event of type 'event_type' with I/O priority 'prio'. */
+#define dd_count(dd, event_type, prio) do {                            \
+       struct io_stats *io_stats = get_cpu_ptr((dd)->stats);           \
+                                                                       \
+       BUILD_BUG_ON(!__same_type((dd), struct deadline_data *));       \
+       BUILD_BUG_ON(!__same_type((prio), enum dd_prio));               \
+       local_inc(&io_stats->stats[(prio)].event_type);                 \
+       put_cpu_ptr(io_stats);                                          \
+} while (0)
+
+/*
+ * Returns the total number of dd_count(dd, event_type, prio) calls across all
+ * CPUs. No locking or barriers since it is fine if the returned sum is slightly
+ * outdated.
+ */
+#define dd_sum(dd, event_type, prio) ({                                        \
+       unsigned int cpu;                                               \
+       u32 sum = 0;                                                    \
+                                                                       \
+       BUILD_BUG_ON(!__same_type((dd), struct deadline_data *));       \
+       BUILD_BUG_ON(!__same_type((prio), enum dd_prio));               \
+       for_each_present_cpu(cpu)                                       \
+               sum += local_read(&per_cpu_ptr((dd)->stats, cpu)->      \
+                                 stats[(prio)].event_type);            \
+       sum;                                                            \
+})
+
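+
The dd_count()/dd_sum() pair above implements lockless per-CPU event counting: each CPU increments its own local_t slot, and readers add up all slots without any synchronization, accepting a slightly stale sum. Below is a minimal userspace analogue of the same pattern, using C11 atomics in place of local_t; the names and the CPU/priority counts are illustrative, not taken from the kernel.

#include <stdatomic.h>
#include <stdio.h>

#define NCPU	4	/* stand-in for the number of possible CPUs */
#define NPRIO	3	/* DD_PRIO_COUNT */

/* One counter per (cpu, prio) pair; mirrors the per-CPU struct io_stats. */
static atomic_uint inserted[NCPU][NPRIO];

/* Analogue of dd_count(dd, inserted, prio), executed on a given CPU. */
static void count_inserted(int cpu, int prio)
{
	atomic_fetch_add_explicit(&inserted[cpu][prio], 1,
				  memory_order_relaxed);
}

/* Analogue of dd_sum(dd, inserted, prio): no locking, may be stale. */
static unsigned int sum_inserted(int prio)
{
	unsigned int sum = 0;

	for (int cpu = 0; cpu < NCPU; cpu++)
		sum += atomic_load_explicit(&inserted[cpu][prio],
					    memory_order_relaxed);
	return sum;
}

int main(void)
{
	count_inserted(0, 1);	/* two events counted on different CPUs */
	count_inserted(2, 1);
	printf("%u\n", sum_inserted(1));	/* prints 2 */
	return 0;
}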
+/* Maps an I/O priority class to a deadline scheduler priority. */
+static const enum dd_prio ioprio_class_to_prio[] = {
+       [IOPRIO_CLASS_NONE]     = DD_BE_PRIO,
+       [IOPRIO_CLASS_RT]       = DD_RT_PRIO,
+       [IOPRIO_CLASS_BE]       = DD_BE_PRIO,
+       [IOPRIO_CLASS_IDLE]     = DD_IDLE_PRIO,
 };
 
 static inline struct rb_root *
-deadline_rb_root(struct deadline_data *dd, struct request *rq)
+deadline_rb_root(struct dd_per_prio *per_prio, struct request *rq)
 {
-       return &dd->sort_list[rq_data_dir(rq)];
+       return &per_prio->sort_list[rq_data_dir(rq)];
+}
+
+/*
+ * Returns the I/O priority class (IOPRIO_CLASS_*) that has been assigned to a
+ * request.
+ */
+static u8 dd_rq_ioclass(struct request *rq)
+{
+       return IOPRIO_PRIO_CLASS(req_get_ioprio(rq));
 }
 
 /*
@@ -87,38 +168,38 @@ deadline_latter_request(struct request *rq)
 }
 
 static void
-deadline_add_rq_rb(struct deadline_data *dd, struct request *rq)
+deadline_add_rq_rb(struct dd_per_prio *per_prio, struct request *rq)
 {
-       struct rb_root *root = deadline_rb_root(dd, rq);
+       struct rb_root *root = deadline_rb_root(per_prio, rq);
 
        elv_rb_add(root, rq);
 }
 
 static inline void
-deadline_del_rq_rb(struct deadline_data *dd, struct request *rq)
+deadline_del_rq_rb(struct dd_per_prio *per_prio, struct request *rq)
 {
-       const int data_dir = rq_data_dir(rq);
+       const enum dd_data_dir data_dir = rq_data_dir(rq);
 
-       if (dd->next_rq[data_dir] == rq)
-               dd->next_rq[data_dir] = deadline_latter_request(rq);
+       if (per_prio->next_rq[data_dir] == rq)
+               per_prio->next_rq[data_dir] = deadline_latter_request(rq);
 
-       elv_rb_del(deadline_rb_root(dd, rq), rq);
+       elv_rb_del(deadline_rb_root(per_prio, rq), rq);
 }
 
 /*
  * remove rq from rbtree and fifo.
  */
-static void deadline_remove_request(struct request_queue *q, struct request *rq)
+static void deadline_remove_request(struct request_queue *q,
+                                   struct dd_per_prio *per_prio,
+                                   struct request *rq)
 {
-       struct deadline_data *dd = q->elevator->elevator_data;
-
        list_del_init(&rq->queuelist);
 
        /*
         * We might not be on the rbtree, if we are doing an insert merge
         */
        if (!RB_EMPTY_NODE(&rq->rb_node))
-               deadline_del_rq_rb(dd, rq);
+               deadline_del_rq_rb(per_prio, rq);
 
        elv_rqhash_del(q, rq);
        if (q->last_merge == rq)
@@ -129,19 +210,31 @@ static void dd_request_merged(struct request_queue *q, struct request *req,
                              enum elv_merge type)
 {
        struct deadline_data *dd = q->elevator->elevator_data;
+       const u8 ioprio_class = dd_rq_ioclass(req);
+       const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
+       struct dd_per_prio *per_prio = &dd->per_prio[prio];
 
        /*
         * if the merge was a front merge, we need to reposition request
         */
        if (type == ELEVATOR_FRONT_MERGE) {
-               elv_rb_del(deadline_rb_root(dd, req), req);
-               deadline_add_rq_rb(dd, req);
+               elv_rb_del(deadline_rb_root(per_prio, req), req);
+               deadline_add_rq_rb(per_prio, req);
        }
 }
 
+/*
+ * Callback function that is invoked after @next has been merged into @req.
+ */
 static void dd_merged_requests(struct request_queue *q, struct request *req,
                               struct request *next)
 {
+       struct deadline_data *dd = q->elevator->elevator_data;
+       const u8 ioprio_class = dd_rq_ioclass(next);
+       const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
+
+       dd_count(dd, merged, prio);
+
        /*
         * if next expires before rq, assign its expire time to rq
         * and move into next position (next will be deleted) in fifo
@@ -157,34 +250,34 @@ static void dd_merged_requests(struct request_queue *q, struct request *req,
        /*
         * kill knowledge of next, this one is a goner
         */
-       deadline_remove_request(q, next);
+       deadline_remove_request(q, &dd->per_prio[prio], next);
 }
 
 /*
  * move an entry to dispatch queue
  */
 static void
-deadline_move_request(struct deadline_data *dd, struct request *rq)
+deadline_move_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
+                     struct request *rq)
 {
-       const int data_dir = rq_data_dir(rq);
+       const enum dd_data_dir data_dir = rq_data_dir(rq);
 
-       dd->next_rq[READ] = NULL;
-       dd->next_rq[WRITE] = NULL;
-       dd->next_rq[data_dir] = deadline_latter_request(rq);
+       per_prio->next_rq[data_dir] = deadline_latter_request(rq);
 
        /*
         * take it off the sort and fifo list
         */
-       deadline_remove_request(rq->q, rq);
+       deadline_remove_request(rq->q, per_prio, rq);
 }
 
 /*
  * deadline_check_fifo returns 0 if there are no expired requests on the fifo,
  * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
  */
-static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
+static inline int deadline_check_fifo(struct dd_per_prio *per_prio,
+                                     enum dd_data_dir data_dir)
 {
-       struct request *rq = rq_entry_fifo(dd->fifo_list[ddir].next);
+       struct request *rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next);
 
        /*
         * rq is expired!
@@ -200,19 +293,17 @@ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
  * dispatch using arrival ordered lists.
  */
 static struct request *
-deadline_fifo_request(struct deadline_data *dd, int data_dir)
+deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
+                     enum dd_data_dir data_dir)
 {
        struct request *rq;
        unsigned long flags;
 
-       if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
-               return NULL;
-
-       if (list_empty(&dd->fifo_list[data_dir]))
+       if (list_empty(&per_prio->fifo_list[data_dir]))
                return NULL;
 
-       rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
-       if (data_dir == READ || !blk_queue_is_zoned(rq->q))
+       rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next);
+       if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q))
                return rq;
 
        /*
@@ -220,7 +311,7 @@ deadline_fifo_request(struct deadline_data *dd, int data_dir)
         * an unlocked target zone.
         */
        spin_lock_irqsave(&dd->zone_lock, flags);
-       list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) {
+       list_for_each_entry(rq, &per_prio->fifo_list[DD_WRITE], queuelist) {
                if (blk_req_can_dispatch_to_zone(rq))
                        goto out;
        }
@@ -236,19 +327,17 @@ out:
  * dispatch using sector position sorted lists.
  */
 static struct request *
-deadline_next_request(struct deadline_data *dd, int data_dir)
+deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
+                     enum dd_data_dir data_dir)
 {
        struct request *rq;
        unsigned long flags;
 
-       if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
-               return NULL;
-
-       rq = dd->next_rq[data_dir];
+       rq = per_prio->next_rq[data_dir];
        if (!rq)
                return NULL;
 
-       if (data_dir == READ || !blk_queue_is_zoned(rq->q))
+       if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q))
                return rq;
 
        /*
@@ -270,28 +359,27 @@ deadline_next_request(struct deadline_data *dd, int data_dir)
  * deadline_dispatch_requests selects the best request according to
  * read/write expire, fifo_batch, etc
  */
-static struct request *__dd_dispatch_request(struct deadline_data *dd)
+static struct request *__dd_dispatch_request(struct deadline_data *dd,
+                                            struct dd_per_prio *per_prio)
 {
        struct request *rq, *next_rq;
-       bool reads, writes;
-       int data_dir;
+       enum dd_data_dir data_dir;
+       enum dd_prio prio;
+       u8 ioprio_class;
 
-       if (!list_empty(&dd->dispatch)) {
-               rq = list_first_entry(&dd->dispatch, struct request, queuelist);
+       lockdep_assert_held(&dd->lock);
+
+       if (!list_empty(&per_prio->dispatch)) {
+               rq = list_first_entry(&per_prio->dispatch, struct request,
+                                     queuelist);
                list_del_init(&rq->queuelist);
                goto done;
        }
 
-       reads = !list_empty(&dd->fifo_list[READ]);
-       writes = !list_empty(&dd->fifo_list[WRITE]);
-
        /*
         * batches are currently reads XOR writes
         */
-       rq = deadline_next_request(dd, WRITE);
-       if (!rq)
-               rq = deadline_next_request(dd, READ);
-
+       rq = deadline_next_request(dd, per_prio, dd->last_dir);
        if (rq && dd->batching < dd->fifo_batch)
                /* we have a next request and are still entitled to batch */
                goto dispatch_request;
@@ -301,14 +389,14 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd)
         * data direction (read / write)
         */
 
-       if (reads) {
-               BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
+       if (!list_empty(&per_prio->fifo_list[DD_READ])) {
+               BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_READ]));
 
-               if (deadline_fifo_request(dd, WRITE) &&
+               if (deadline_fifo_request(dd, per_prio, DD_WRITE) &&
                    (dd->starved++ >= dd->writes_starved))
                        goto dispatch_writes;
 
-               data_dir = READ;
+               data_dir = DD_READ;
 
                goto dispatch_find_request;
        }
@@ -317,13 +405,13 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd)
         * there are either no reads or writes have been starved
         */
 
-       if (writes) {
+       if (!list_empty(&per_prio->fifo_list[DD_WRITE])) {
 dispatch_writes:
-               BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE]));
+               BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_WRITE]));
 
                dd->starved = 0;
 
-               data_dir = WRITE;
+               data_dir = DD_WRITE;
 
                goto dispatch_find_request;
        }
@@ -334,14 +422,14 @@ dispatch_find_request:
        /*
         * we are not running a batch, find best request for selected data_dir
         */
-       next_rq = deadline_next_request(dd, data_dir);
-       if (deadline_check_fifo(dd, data_dir) || !next_rq) {
+       next_rq = deadline_next_request(dd, per_prio, data_dir);
+       if (deadline_check_fifo(per_prio, data_dir) || !next_rq) {
                /*
                 * A deadline has expired, the last request was in the other
                 * direction, or we have run out of higher-sectored requests.
                 * Start again from the request with the earliest expiry time.
                 */
-               rq = deadline_fifo_request(dd, data_dir);
+               rq = deadline_fifo_request(dd, per_prio, data_dir);
        } else {
                /*
                 * The last req was the same dir and we have a next request in
@@ -357,6 +445,7 @@ dispatch_find_request:
        if (!rq)
                return NULL;
 
+       dd->last_dir = data_dir;
        dd->batching = 0;
 
 dispatch_request:
@@ -364,8 +453,11 @@ dispatch_request:
         * rq is the selected appropriate request.
         */
        dd->batching++;
-       deadline_move_request(dd, rq);
+       deadline_move_request(dd, per_prio, rq);
 done:
+       ioprio_class = dd_rq_ioclass(rq);
+       prio = ioprio_class_to_prio[ioprio_class];
+       dd_count(dd, dispatched, prio);
        /*
         * If the request needs its target zone locked, do it.
         */
@@ -375,6 +467,8 @@ done:
 }
 
 /*
+ * Called from blk_mq_run_hw_queue() -> __blk_mq_sched_dispatch_requests().
+ *
  * One confusing aspect here is that we get called for a specific
  * hardware queue, but we may return a request that is for a
  * different hardware queue. This is because mq-deadline has shared
@@ -384,20 +478,70 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
 {
        struct deadline_data *dd = hctx->queue->elevator->elevator_data;
        struct request *rq;
+       enum dd_prio prio;
 
        spin_lock(&dd->lock);
-       rq = __dd_dispatch_request(dd);
+       for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
+               rq = __dd_dispatch_request(dd, &dd->per_prio[prio]);
+               if (rq)
+                       break;
+       }
        spin_unlock(&dd->lock);
 
        return rq;
 }
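
The loop above scans the priorities in increasing numeric order, which yields strict priority dispatch: best-effort requests are only considered once no real-time request can be dispatched, and idle requests only once both higher levels are empty. A hedged userspace sketch of the class-to-priority mapping and the resulting scan order follows; the constants mirror include/linux/ioprio.h and the enums added above, but nothing below is kernel code.

#include <stdio.h>

enum { IOPRIO_CLASS_NONE, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE };
enum dd_prio { DD_RT_PRIO, DD_BE_PRIO, DD_IDLE_PRIO, DD_PRIO_MAX = 2 };

static const enum dd_prio class_to_prio[] = {
	[IOPRIO_CLASS_NONE]	= DD_BE_PRIO,
	[IOPRIO_CLASS_RT]	= DD_RT_PRIO,
	[IOPRIO_CLASS_BE]	= DD_BE_PRIO,
	[IOPRIO_CLASS_IDLE]	= DD_IDLE_PRIO,
};

int main(void)
{
	int pending[DD_PRIO_MAX + 1] = { 0 };

	pending[class_to_prio[IOPRIO_CLASS_IDLE]]++;	/* one idle request */
	pending[class_to_prio[IOPRIO_CLASS_RT]]++;	/* one RT request */

	/* Same scan order as dd_dispatch_request(): lowest value wins. */
	for (int prio = DD_RT_PRIO; prio <= DD_PRIO_MAX; prio++) {
		if (pending[prio]) {
			printf("dispatch prio %d first\n", prio); /* 0 == RT */
			break;
		}
	}
	return 0;
}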
 
-static void dd_exit_queue(struct elevator_queue *e)
+/*
+ * Called by __blk_mq_alloc_request(). The shallow_depth value set by this
+ * function is used by __blk_mq_get_tag().
+ */
+static void dd_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
+{
+       struct deadline_data *dd = data->q->elevator->elevator_data;
+
+       /* Do not throttle synchronous reads. */
+       if (op_is_sync(op) && !op_is_write(op))
+               return;
+
+       /*
+        * Throttle asynchronous requests and writes such that these requests
+        * do not block the allocation of synchronous requests.
+        */
+       data->shallow_depth = dd->async_depth;
+}
+
+/* Called by blk_mq_update_nr_requests(). */
+static void dd_depth_updated(struct blk_mq_hw_ctx *hctx)
+{
+       struct request_queue *q = hctx->queue;
+       struct deadline_data *dd = q->elevator->elevator_data;
+       struct blk_mq_tags *tags = hctx->sched_tags;
+
+       dd->async_depth = max(1UL, 3 * q->nr_requests / 4);
+
+       sbitmap_queue_min_shallow_depth(tags->bitmap_tags, dd->async_depth);
+}
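+
As a worked example of the formula above: with q->nr_requests == 64, async_depth becomes max(1UL, 3 * 64 / 4) = 48, so asynchronous requests and writes can occupy at most 48 of the 64 scheduler tags and at least 16 tags stay available for synchronous reads. A small illustrative check in plain C (not kernel code):

#include <assert.h>

/* Illustrative re-statement of the dd_depth_updated() formula. */
static unsigned long async_depth(unsigned long nr_requests)
{
	unsigned long depth = 3 * nr_requests / 4;

	return depth ? depth : 1;	/* max(1UL, ...) */
}

int main(void)
{
	assert(async_depth(64) == 48);	/* 16 tags left for sync reads */
	assert(async_depth(1) == 1);	/* never throttled to zero tags */
	return 0;
}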
+
+/* Called by blk_mq_init_hctx() and blk_mq_init_sched(). */
+static int dd_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
+{
+       dd_depth_updated(hctx);
+       return 0;
+}
+
+static void dd_exit_sched(struct elevator_queue *e)
 {
        struct deadline_data *dd = e->elevator_data;
+       enum dd_prio prio;
 
-       BUG_ON(!list_empty(&dd->fifo_list[READ]));
-       BUG_ON(!list_empty(&dd->fifo_list[WRITE]));
+       for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
+               struct dd_per_prio *per_prio = &dd->per_prio[prio];
+
+               WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_READ]));
+               WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_WRITE]));
+       }
+
+       free_percpu(dd->stats);
 
        kfree(dd);
 }
@@ -405,55 +549,82 @@ static void dd_exit_queue(struct elevator_queue *e)
 /*
  * initialize elevator private data (deadline_data).
  */
-static int dd_init_queue(struct request_queue *q, struct elevator_type *e)
+static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
 {
        struct deadline_data *dd;
        struct elevator_queue *eq;
+       enum dd_prio prio;
+       int ret = -ENOMEM;
 
        eq = elevator_alloc(q, e);
        if (!eq)
-               return -ENOMEM;
+               return ret;
 
        dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
-       if (!dd) {
-               kobject_put(&eq->kobj);
-               return -ENOMEM;
-       }
+       if (!dd)
+               goto put_eq;
+
        eq->elevator_data = dd;
 
-       INIT_LIST_HEAD(&dd->fifo_list[READ]);
-       INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
-       dd->sort_list[READ] = RB_ROOT;
-       dd->sort_list[WRITE] = RB_ROOT;
-       dd->fifo_expire[READ] = read_expire;
-       dd->fifo_expire[WRITE] = write_expire;
+       dd->stats = alloc_percpu_gfp(typeof(*dd->stats),
+                                    GFP_KERNEL | __GFP_ZERO);
+       if (!dd->stats)
+               goto free_dd;
+
+       for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
+               struct dd_per_prio *per_prio = &dd->per_prio[prio];
+
+               INIT_LIST_HEAD(&per_prio->dispatch);
+               INIT_LIST_HEAD(&per_prio->fifo_list[DD_READ]);
+               INIT_LIST_HEAD(&per_prio->fifo_list[DD_WRITE]);
+               per_prio->sort_list[DD_READ] = RB_ROOT;
+               per_prio->sort_list[DD_WRITE] = RB_ROOT;
+       }
+       dd->fifo_expire[DD_READ] = read_expire;
+       dd->fifo_expire[DD_WRITE] = write_expire;
        dd->writes_starved = writes_starved;
        dd->front_merges = 1;
+       dd->last_dir = DD_WRITE;
        dd->fifo_batch = fifo_batch;
        spin_lock_init(&dd->lock);
        spin_lock_init(&dd->zone_lock);
-       INIT_LIST_HEAD(&dd->dispatch);
 
        q->elevator = eq;
        return 0;
+
+free_dd:
+       kfree(dd);
+
+put_eq:
+       kobject_put(&eq->kobj);
+       return ret;
 }
 
+/*
+ * Try to merge @bio into an existing request. If @bio has been merged into
+ * an existing request, store the pointer to that request into *@rq.
+ */
 static int dd_request_merge(struct request_queue *q, struct request **rq,
                            struct bio *bio)
 {
        struct deadline_data *dd = q->elevator->elevator_data;
+       const u8 ioprio_class = IOPRIO_PRIO_CLASS(bio->bi_ioprio);
+       const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
+       struct dd_per_prio *per_prio = &dd->per_prio[prio];
        sector_t sector = bio_end_sector(bio);
        struct request *__rq;
 
        if (!dd->front_merges)
                return ELEVATOR_NO_MERGE;
 
-       __rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector);
+       __rq = elv_rb_find(&per_prio->sort_list[bio_data_dir(bio)], sector);
        if (__rq) {
                BUG_ON(sector != blk_rq_pos(__rq));
 
                if (elv_bio_merge_ok(__rq, bio)) {
                        *rq = __rq;
+                       if (blk_discard_mergable(__rq))
+                               return ELEVATOR_DISCARD_MERGE;
                        return ELEVATOR_FRONT_MERGE;
                }
        }
@@ -461,6 +632,10 @@ static int dd_request_merge(struct request_queue *q, struct request **rq,
        return ELEVATOR_NO_MERGE;
 }
 
+/*
+ * Attempt to merge a bio into an existing request. This function is called
+ * before @bio is associated with a request.
+ */
 static bool dd_bio_merge(struct request_queue *q, struct bio *bio,
                unsigned int nr_segs)
 {
@@ -486,7 +661,14 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 {
        struct request_queue *q = hctx->queue;
        struct deadline_data *dd = q->elevator->elevator_data;
-       const int data_dir = rq_data_dir(rq);
+       const enum dd_data_dir data_dir = rq_data_dir(rq);
+       u16 ioprio = req_get_ioprio(rq);
+       u8 ioprio_class = IOPRIO_PRIO_CLASS(ioprio);
+       struct dd_per_prio *per_prio;
+       enum dd_prio prio;
+       LIST_HEAD(free);
+
+       lockdep_assert_held(&dd->lock);
 
        /*
         * This may be a requeue of a write request that has locked its
@@ -494,15 +676,22 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
         */
        blk_req_zone_write_unlock(rq);
 
-       if (blk_mq_sched_try_insert_merge(q, rq))
+       prio = ioprio_class_to_prio[ioprio_class];
+       dd_count(dd, inserted, prio);
+       rq->elv.priv[0] = (void *)(uintptr_t)1;
+
+       if (blk_mq_sched_try_insert_merge(q, rq, &free)) {
+               blk_mq_free_requests(&free);
                return;
+       }
 
        trace_block_rq_insert(rq);
 
+       per_prio = &dd->per_prio[prio];
        if (at_head) {
-               list_add(&rq->queuelist, &dd->dispatch);
+               list_add(&rq->queuelist, &per_prio->dispatch);
        } else {
-               deadline_add_rq_rb(dd, rq);
+               deadline_add_rq_rb(per_prio, rq);
 
                if (rq_mergeable(rq)) {
                        elv_rqhash_add(q, rq);
@@ -514,10 +703,13 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
                 * set expire time and add to fifo list
                 */
                rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
-               list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]);
+               list_add_tail(&rq->queuelist, &per_prio->fifo_list[data_dir]);
        }
 }
 
+/*
+ * Called from blk_mq_sched_insert_request() or blk_mq_sched_insert_requests().
+ */
 static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
                               struct list_head *list, bool at_head)
 {
@@ -535,15 +727,15 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
        spin_unlock(&dd->lock);
 }
 
-/*
- * Nothing to do here. This is defined only to ensure that .finish_request
- * method is called upon request completion.
- */
+/* Callback from inside blk_mq_rq_ctx_init(). */
 static void dd_prepare_request(struct request *rq)
 {
+       rq->elv.priv[0] = NULL;
 }
 
 /*
+ * Callback from inside blk_mq_free_request().
+ *
  * For zoned block devices, write unlock the target zone of
  * completed write requests. Do this while holding the zone lock
  * spinlock so that the zone is never unlocked while deadline_fifo_request()
@@ -560,83 +752,99 @@ static void dd_prepare_request(struct request *rq)
 static void dd_finish_request(struct request *rq)
 {
        struct request_queue *q = rq->q;
+       struct deadline_data *dd = q->elevator->elevator_data;
+       const u8 ioprio_class = dd_rq_ioclass(rq);
+       const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
+       struct dd_per_prio *per_prio = &dd->per_prio[prio];
+
+       /*
+        * The block layer core may call dd_finish_request() without having
+        * called dd_insert_requests(). Hence only update statistics for
+        * requests for which dd_insert_requests() has been called. See also
+        * blk_mq_request_bypass_insert().
+        */
+       if (rq->elv.priv[0])
+               dd_count(dd, completed, prio);
 
        if (blk_queue_is_zoned(q)) {
-               struct deadline_data *dd = q->elevator->elevator_data;
                unsigned long flags;
 
                spin_lock_irqsave(&dd->zone_lock, flags);
                blk_req_zone_write_unlock(rq);
-               if (!list_empty(&dd->fifo_list[WRITE]))
+               if (!list_empty(&per_prio->fifo_list[DD_WRITE]))
                        blk_mq_sched_mark_restart_hctx(rq->mq_hctx);
                spin_unlock_irqrestore(&dd->zone_lock, flags);
        }
 }
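
A summary of the rq->elv.priv[0] marker that ties dd_prepare_request(), dd_insert_request() and dd_finish_request() together (descriptive sketch of the flow implemented above):

/*
 * dd_prepare_request(rq)  ->  rq->elv.priv[0] = NULL   (request allocated)
 * dd_insert_request(rq)   ->  rq->elv.priv[0] = 1      ("inserted" counted)
 * dd_finish_request(rq)   ->  if (rq->elv.priv[0])     ("completed" counted)
 *
 * Requests inserted via blk_mq_request_bypass_insert() never pass through
 * dd_insert_request(), keep priv[0] == NULL and are therefore excluded
 * from the completion statistics.
 */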
 
+static bool dd_has_work_for_prio(struct dd_per_prio *per_prio)
+{
+       return !list_empty_careful(&per_prio->dispatch) ||
+               !list_empty_careful(&per_prio->fifo_list[DD_READ]) ||
+               !list_empty_careful(&per_prio->fifo_list[DD_WRITE]);
+}
+
 static bool dd_has_work(struct blk_mq_hw_ctx *hctx)
 {
        struct deadline_data *dd = hctx->queue->elevator->elevator_data;
+       enum dd_prio prio;
 
-       return !list_empty_careful(&dd->dispatch) ||
-               !list_empty_careful(&dd->fifo_list[0]) ||
-               !list_empty_careful(&dd->fifo_list[1]);
+       for (prio = 0; prio <= DD_PRIO_MAX; prio++)
+               if (dd_has_work_for_prio(&dd->per_prio[prio]))
+                       return true;
+
+       return false;
 }
 
 /*
  * sysfs parts below
  */
-static ssize_t
-deadline_var_show(int var, char *page)
-{
-       return sprintf(page, "%d\n", var);
-}
-
-static void
-deadline_var_store(int *var, const char *page)
-{
-       char *p = (char *) page;
-
-       *var = simple_strtol(p, &p, 10);
-}
-
-#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)                           \
+#define SHOW_INT(__FUNC, __VAR)                                                \
 static ssize_t __FUNC(struct elevator_queue *e, char *page)            \
 {                                                                      \
        struct deadline_data *dd = e->elevator_data;                    \
-       int __data = __VAR;                                             \
-       if (__CONV)                                                     \
-               __data = jiffies_to_msecs(__data);                      \
-       return deadline_var_show(__data, (page));                       \
-}
-SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[READ], 1);
-SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[WRITE], 1);
-SHOW_FUNCTION(deadline_writes_starved_show, dd->writes_starved, 0);
-SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0);
-SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0);
-#undef SHOW_FUNCTION
+                                                                       \
+       return sysfs_emit(page, "%d\n", __VAR);                         \
+}
+#define SHOW_JIFFIES(__FUNC, __VAR) SHOW_INT(__FUNC, jiffies_to_msecs(__VAR))
+SHOW_JIFFIES(deadline_read_expire_show, dd->fifo_expire[DD_READ]);
+SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]);
+SHOW_INT(deadline_writes_starved_show, dd->writes_starved);
+SHOW_INT(deadline_front_merges_show, dd->front_merges);
+SHOW_INT(deadline_async_depth_show, dd->async_depth);
+SHOW_INT(deadline_fifo_batch_show, dd->fifo_batch);
+#undef SHOW_INT
+#undef SHOW_JIFFIES
 
 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)                        \
 static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)        \
 {                                                                      \
        struct deadline_data *dd = e->elevator_data;                    \
-       int __data;                                                     \
-       deadline_var_store(&__data, (page));                            \
+       int __data, __ret;                                              \
+                                                                       \
+       __ret = kstrtoint(page, 0, &__data);                            \
+       if (__ret < 0)                                                  \
+               return __ret;                                           \
        if (__data < (MIN))                                             \
                __data = (MIN);                                         \
        else if (__data > (MAX))                                        \
                __data = (MAX);                                         \
-       if (__CONV)                                                     \
-               *(__PTR) = msecs_to_jiffies(__data);                    \
-       else                                                            \
-               *(__PTR) = __data;                                      \
+       *(__PTR) = __CONV(__data);                                      \
        return count;                                                   \
 }
-STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1);
-STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1);
-STORE_FUNCTION(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0);
-STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0);
-STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0);
+#define STORE_INT(__FUNC, __PTR, MIN, MAX)                             \
+       STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, )
+#define STORE_JIFFIES(__FUNC, __PTR, MIN, MAX)                         \
+       STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, msecs_to_jiffies)
+STORE_JIFFIES(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX);
+STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX);
+STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX);
+STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1);
+STORE_INT(deadline_async_depth_store, &dd->async_depth, 1, INT_MAX);
+STORE_INT(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX);
 #undef STORE_FUNCTION
+#undef STORE_INT
+#undef STORE_JIFFIES
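+
Once mq-deadline is the active scheduler, the attributes above appear under /sys/block/<disk>/queue/iosched/. A hedged usage sketch from userspace follows; "sda" is an assumed example device, and writing async_depth relies on the store pointing at dd->async_depth as fixed above.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char *attr = "/sys/block/sda/queue/iosched/async_depth";
	char buf[32];
	ssize_t n;
	int fd;

	/* Store: the value is clamped to [1, INT_MAX] by STORE_INT above. */
	fd = open(attr, O_WRONLY);
	if (fd >= 0) {
		if (write(fd, "32", 2) != 2)
			perror("write");
		close(fd);
	}

	/* Show: SHOW_INT emits the value followed by a newline. */
	fd = open(attr, O_RDONLY);
	if (fd >= 0 && (n = read(fd, buf, sizeof(buf) - 1)) > 0) {
		buf[n] = '\0';
		printf("async_depth = %s", buf);
	}
	if (fd >= 0)
		close(fd);
	return 0;
}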
 
 #define DD_ATTR(name) \
        __ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store)
@@ -646,21 +854,23 @@ static struct elv_fs_entry deadline_attrs[] = {
        DD_ATTR(write_expire),
        DD_ATTR(writes_starved),
        DD_ATTR(front_merges),
+       DD_ATTR(async_depth),
        DD_ATTR(fifo_batch),
        __ATTR_NULL
 };
 
 #ifdef CONFIG_BLK_DEBUG_FS
-#define DEADLINE_DEBUGFS_DDIR_ATTRS(ddir, name)                                \
+#define DEADLINE_DEBUGFS_DDIR_ATTRS(prio, data_dir, name)              \
 static void *deadline_##name##_fifo_start(struct seq_file *m,          \
                                          loff_t *pos)                  \
        __acquires(&dd->lock)                                           \
 {                                                                      \
        struct request_queue *q = m->private;                           \
        struct deadline_data *dd = q->elevator->elevator_data;          \
+       struct dd_per_prio *per_prio = &dd->per_prio[prio];             \
                                                                        \
        spin_lock(&dd->lock);                                           \
-       return seq_list_start(&dd->fifo_list[ddir], *pos);              \
+       return seq_list_start(&per_prio->fifo_list[data_dir], *pos);    \
 }                                                                      \
                                                                        \
 static void *deadline_##name##_fifo_next(struct seq_file *m, void *v,  \
@@ -668,8 +878,9 @@ static void *deadline_##name##_fifo_next(struct seq_file *m, void *v,       \
 {                                                                      \
        struct request_queue *q = m->private;                           \
        struct deadline_data *dd = q->elevator->elevator_data;          \
+       struct dd_per_prio *per_prio = &dd->per_prio[prio];             \
                                                                        \
-       return seq_list_next(v, &dd->fifo_list[ddir], pos);             \
+       return seq_list_next(v, &per_prio->fifo_list[data_dir], pos);   \
 }                                                                      \
                                                                        \
 static void deadline_##name##_fifo_stop(struct seq_file *m, void *v)   \
@@ -693,14 +904,20 @@ static int deadline_##name##_next_rq_show(void *data,                     \
 {                                                                      \
        struct request_queue *q = data;                                 \
        struct deadline_data *dd = q->elevator->elevator_data;          \
-       struct request *rq = dd->next_rq[ddir];                         \
+       struct dd_per_prio *per_prio = &dd->per_prio[prio];             \
+       struct request *rq = per_prio->next_rq[data_dir];               \
                                                                        \
        if (rq)                                                         \
                __blk_mq_debugfs_rq_show(m, rq);                        \
        return 0;                                                       \
 }
-DEADLINE_DEBUGFS_DDIR_ATTRS(READ, read)
-DEADLINE_DEBUGFS_DDIR_ATTRS(WRITE, write)
+
+DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_READ, read0);
+DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_WRITE, write0);
+DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_READ, read1);
+DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_WRITE, write1);
+DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_READ, read2);
+DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_WRITE, write2);
 #undef DEADLINE_DEBUGFS_DDIR_ATTRS
 
 static int deadline_batching_show(void *data, struct seq_file *m)
@@ -721,49 +938,120 @@ static int deadline_starved_show(void *data, struct seq_file *m)
        return 0;
 }
 
-static void *deadline_dispatch_start(struct seq_file *m, loff_t *pos)
-       __acquires(&dd->lock)
+static int dd_async_depth_show(void *data, struct seq_file *m)
 {
-       struct request_queue *q = m->private;
+       struct request_queue *q = data;
        struct deadline_data *dd = q->elevator->elevator_data;
 
-       spin_lock(&dd->lock);
-       return seq_list_start(&dd->dispatch, *pos);
+       seq_printf(m, "%u\n", dd->async_depth);
+       return 0;
+}
+
+/* Number of requests queued for a given priority level. */
+static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio)
+{
+       return dd_sum(dd, inserted, prio) - dd_sum(dd, completed, prio);
 }
 
-static void *deadline_dispatch_next(struct seq_file *m, void *v, loff_t *pos)
+static int dd_queued_show(void *data, struct seq_file *m)
 {
-       struct request_queue *q = m->private;
+       struct request_queue *q = data;
        struct deadline_data *dd = q->elevator->elevator_data;
 
-       return seq_list_next(v, &dd->dispatch, pos);
+       seq_printf(m, "%u %u %u\n", dd_queued(dd, DD_RT_PRIO),
+                  dd_queued(dd, DD_BE_PRIO),
+                  dd_queued(dd, DD_IDLE_PRIO));
+       return 0;
+}
+
+/* Number of requests owned by the block driver for a given priority. */
+static u32 dd_owned_by_driver(struct deadline_data *dd, enum dd_prio prio)
+{
+       return dd_sum(dd, dispatched, prio) + dd_sum(dd, merged, prio)
+               - dd_sum(dd, completed, prio);
 }
 
-static void deadline_dispatch_stop(struct seq_file *m, void *v)
-       __releases(&dd->lock)
+static int dd_owned_by_driver_show(void *data, struct seq_file *m)
 {
-       struct request_queue *q = m->private;
+       struct request_queue *q = data;
        struct deadline_data *dd = q->elevator->elevator_data;
 
-       spin_unlock(&dd->lock);
+       seq_printf(m, "%u %u %u\n", dd_owned_by_driver(dd, DD_RT_PRIO),
+                  dd_owned_by_driver(dd, DD_BE_PRIO),
+                  dd_owned_by_driver(dd, DD_IDLE_PRIO));
+       return 0;
 }
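
Worked example of the two derived statistics: with inserted = 100, merged = 5, dispatched = 80 and completed = 70 for one priority level, dd_queued() reports 100 - 70 = 30 requests inside scheduler plus driver, while dd_owned_by_driver() reports 80 + 5 - 70 = 15 requests currently owned by the driver (illustrative numbers):

#include <assert.h>

int main(void)
{
	unsigned int inserted = 100, merged = 5;
	unsigned int dispatched = 80, completed = 70;

	assert(inserted - completed == 30);		/* dd_queued() */
	assert(dispatched + merged - completed == 15);	/* dd_owned_by_driver() */
	return 0;
}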
 
-static const struct seq_operations deadline_dispatch_seq_ops = {
-       .start  = deadline_dispatch_start,
-       .next   = deadline_dispatch_next,
-       .stop   = deadline_dispatch_stop,
-       .show   = blk_mq_debugfs_rq_show,
-};
+#define DEADLINE_DISPATCH_ATTR(prio)                                   \
+static void *deadline_dispatch##prio##_start(struct seq_file *m,       \
+                                            loff_t *pos)               \
+       __acquires(&dd->lock)                                           \
+{                                                                      \
+       struct request_queue *q = m->private;                           \
+       struct deadline_data *dd = q->elevator->elevator_data;          \
+       struct dd_per_prio *per_prio = &dd->per_prio[prio];             \
+                                                                       \
+       spin_lock(&dd->lock);                                           \
+       return seq_list_start(&per_prio->dispatch, *pos);               \
+}                                                                      \
+                                                                       \
+static void *deadline_dispatch##prio##_next(struct seq_file *m,                \
+                                           void *v, loff_t *pos)       \
+{                                                                      \
+       struct request_queue *q = m->private;                           \
+       struct deadline_data *dd = q->elevator->elevator_data;          \
+       struct dd_per_prio *per_prio = &dd->per_prio[prio];             \
+                                                                       \
+       return seq_list_next(v, &per_prio->dispatch, pos);              \
+}                                                                      \
+                                                                       \
+static void deadline_dispatch##prio##_stop(struct seq_file *m, void *v)        \
+       __releases(&dd->lock)                                           \
+{                                                                      \
+       struct request_queue *q = m->private;                           \
+       struct deadline_data *dd = q->elevator->elevator_data;          \
+                                                                       \
+       spin_unlock(&dd->lock);                                         \
+}                                                                      \
+                                                                       \
+static const struct seq_operations deadline_dispatch##prio##_seq_ops = { \
+       .start  = deadline_dispatch##prio##_start,                      \
+       .next   = deadline_dispatch##prio##_next,                       \
+       .stop   = deadline_dispatch##prio##_stop,                       \
+       .show   = blk_mq_debugfs_rq_show,                               \
+}
+
+DEADLINE_DISPATCH_ATTR(0);
+DEADLINE_DISPATCH_ATTR(1);
+DEADLINE_DISPATCH_ATTR(2);
+#undef DEADLINE_DISPATCH_ATTR
 
-#define DEADLINE_QUEUE_DDIR_ATTRS(name)                                                \
-       {#name "_fifo_list", 0400, .seq_ops = &deadline_##name##_fifo_seq_ops}, \
+#define DEADLINE_QUEUE_DDIR_ATTRS(name)                                        \
+       {#name "_fifo_list", 0400,                                      \
+                       .seq_ops = &deadline_##name##_fifo_seq_ops}
+#define DEADLINE_NEXT_RQ_ATTR(name)                                    \
        {#name "_next_rq", 0400, deadline_##name##_next_rq_show}
 static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = {
-       DEADLINE_QUEUE_DDIR_ATTRS(read),
-       DEADLINE_QUEUE_DDIR_ATTRS(write),
+       DEADLINE_QUEUE_DDIR_ATTRS(read0),
+       DEADLINE_QUEUE_DDIR_ATTRS(write0),
+       DEADLINE_QUEUE_DDIR_ATTRS(read1),
+       DEADLINE_QUEUE_DDIR_ATTRS(write1),
+       DEADLINE_QUEUE_DDIR_ATTRS(read2),
+       DEADLINE_QUEUE_DDIR_ATTRS(write2),
+       DEADLINE_NEXT_RQ_ATTR(read0),
+       DEADLINE_NEXT_RQ_ATTR(write0),
+       DEADLINE_NEXT_RQ_ATTR(read1),
+       DEADLINE_NEXT_RQ_ATTR(write1),
+       DEADLINE_NEXT_RQ_ATTR(read2),
+       DEADLINE_NEXT_RQ_ATTR(write2),
        {"batching", 0400, deadline_batching_show},
        {"starved", 0400, deadline_starved_show},
-       {"dispatch", 0400, .seq_ops = &deadline_dispatch_seq_ops},
+       {"async_depth", 0400, dd_async_depth_show},
+       {"dispatch0", 0400, .seq_ops = &deadline_dispatch0_seq_ops},
+       {"dispatch1", 0400, .seq_ops = &deadline_dispatch1_seq_ops},
+       {"dispatch2", 0400, .seq_ops = &deadline_dispatch2_seq_ops},
+       {"owned_by_driver", 0400, dd_owned_by_driver_show},
+       {"queued", 0400, dd_queued_show},
        {},
 };
 #undef DEADLINE_QUEUE_DDIR_ATTRS
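
With CONFIG_BLK_DEBUG_FS enabled, the table above materializes one file per entry under the blk-mq scheduler debugfs directory; the paths below assume the standard blk-mq debugfs layout, and the listing is abbreviated:

/*
 * /sys/kernel/debug/block/<disk>/sched/read0_fifo_list   (RT reads)
 * /sys/kernel/debug/block/<disk>/sched/write1_next_rq    (BE writes)
 * /sys/kernel/debug/block/<disk>/sched/dispatch2         (idle dispatch list)
 * /sys/kernel/debug/block/<disk>/sched/queued            ("RT BE IDLE" counts)
 * ... and so on for the remaining read*/write* and dispatch* entries.
 */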
@@ -771,6 +1059,8 @@ static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = {
 
 static struct elevator_type mq_deadline = {
        .ops = {
+               .depth_updated          = dd_depth_updated,
+               .limit_depth            = dd_limit_depth,
                .insert_requests        = dd_insert_requests,
                .dispatch_request       = dd_dispatch_request,
                .prepare_request        = dd_prepare_request,
@@ -782,8 +1072,9 @@ static struct elevator_type mq_deadline = {
                .requests_merged        = dd_merged_requests,
                .request_merged         = dd_request_merged,
                .has_work               = dd_has_work,
-               .init_sched             = dd_init_queue,
-               .exit_sched             = dd_exit_queue,
+               .init_sched             = dd_init_sched,
+               .exit_sched             = dd_exit_sched,
+               .init_hctx              = dd_init_hctx,
        },
 
 #ifdef CONFIG_BLK_DEBUG_FS
@@ -810,6 +1101,6 @@ static void __exit deadline_exit(void)
 module_init(deadline_init);
 module_exit(deadline_exit);
 
-MODULE_AUTHOR("Jens Axboe");
+MODULE_AUTHOR("Jens Axboe, Damien Le Moal and Bart Van Assche");
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("MQ deadline IO scheduler");