block, bfq: inject I/O to underutilized actuators

author Davide Zini <davidezini2@gmail.com>

Tue, 3 Jan 2023 14:55:02 +0000 (15:55 +0100)

committer Jens Axboe <axboe@kernel.dk>

Sun, 29 Jan 2023 22:18:33 +0000 (15:18 -0700)
author Davide Zini <davidezini2@gmail.com>
Tue, 3 Jan 2023 14:55:02 +0000 (15:55 +0100)
committer Jens Axboe <axboe@kernel.dk>
Sun, 29 Jan 2023 22:18:33 +0000 (15:18 -0700)
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c

index 5f081f4..b42956a 100644 (file)
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -706,7 +706,7 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
                 bfq_activate_bfqq(bfqd, bfqq);
         }
  
-       if (!bfqd->in_service_queue && !bfqd->rq_in_driver)
+       if (!bfqd->in_service_queue && !bfqd->tot_rq_in_driver)
                 bfq_schedule_dispatch(bfqd);
         /* release extra ref taken above, bfqq may happen to be freed now */
         bfq_put_queue(bfqq);
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c

index 56486f2..d42a229 100644 (file)
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -2259,9 +2259,9 @@ static void bfq_add_request(struct request *rq)
                  *   elapsed.
                  */
                 if (bfqq == bfqd->in_service_queue &&
-                   (bfqd->rq_in_driver == 0 ||
+                   (bfqd->tot_rq_in_driver == 0 ||
                      (bfqq->last_serv_time_ns > 0 &&
-                     bfqd->rqs_injected && bfqd->rq_in_driver > 0)) &&
+                     bfqd->rqs_injected && bfqd->tot_rq_in_driver > 0)) &&
                     time_is_before_eq_jiffies(bfqq->decrease_time_jif +
                                               msecs_to_jiffies(10))) {
                         bfqd->last_empty_occupied_ns = ktime_get_ns();
@@ -2285,7 +2285,7 @@ static void bfq_add_request(struct request *rq)
                          * will be set in case injection is performed
                          * on bfqq before rq is completed).
                          */
-                       if (bfqd->rq_in_driver == 0)
+                       if (bfqd->tot_rq_in_driver == 0)
                                 bfqd->rqs_injected = false;
                 }
         }
@@ -2650,11 +2650,14 @@ void bfq_end_wr_async_queues(struct bfq_data *bfqd,
  static void bfq_end_wr(struct bfq_data *bfqd)
  {
         struct bfq_queue *bfqq;
+       int i;
  
         spin_lock_irq(&bfqd->lock);
  
-       list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
-               bfq_bfqq_end_wr(bfqq);
+       for (i = 0; i < bfqd->num_actuators; i++) {
+               list_for_each_entry(bfqq, &bfqd->active_list[i], bfqq_list)
+                       bfq_bfqq_end_wr(bfqq);
+       }
         list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
                 bfq_bfqq_end_wr(bfqq);
         bfq_end_wr_async(bfqd);
@@ -3611,13 +3614,13 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq)
          * - start a new observation interval with this dispatch
          */
         if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC &&
-           bfqd->rq_in_driver == 0)
+           bfqd->tot_rq_in_driver == 0)
                 goto update_rate_and_reset;
  
         /* Update sampling information */
         bfqd->peak_rate_samples++;
  
-       if ((bfqd->rq_in_driver > 0 ||
+       if ((bfqd->tot_rq_in_driver > 0 ||
                 now_ns - bfqd->last_completion < BFQ_MIN_TT)
             && !BFQ_RQ_SEEKY(bfqd, bfqd->last_position, rq))
                 bfqd->sequential_samples++;
@@ -3882,10 +3885,8 @@ static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd,
                 return false;
  
         return (bfqq->wr_coeff > 1 &&
-               (bfqd->wr_busy_queues <
-                tot_busy_queues ||
-                bfqd->rq_in_driver >=
-                bfqq->dispatched + 4)) ||
+               (bfqd->wr_busy_queues < tot_busy_queues ||
+                bfqd->tot_rq_in_driver >= bfqq->dispatched + 4)) ||
                 bfq_asymmetric_scenario(bfqd, bfqq) ||
                 tot_busy_queues == 1;
  }
@@ -4656,6 +4657,8 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd)
  {
         struct bfq_queue *bfqq, *in_serv_bfqq = bfqd->in_service_queue;
         unsigned int limit = in_serv_bfqq->inject_limit;
+       int i;
+
         /*
          * If
          * - bfqq is not weight-raised and therefore does not carry
@@ -4687,7 +4690,7 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd)
                 )
                 limit = 1;
  
-       if (bfqd->rq_in_driver >= limit)
+       if (bfqd->tot_rq_in_driver >= limit)
                 return NULL;
  
         /*
@@ -4702,11 +4705,12 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd)
          *   (and re-added only if it gets new requests, but then it
          *   is assigned again enough budget for its new backlog).
          */
-       list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
-               if (!RB_EMPTY_ROOT(&bfqq->sort_list) &&
-                   (in_serv_always_inject || bfqq->wr_coeff > 1) &&
-                   bfq_serv_to_charge(bfqq->next_rq, bfqq) <=
-                   bfq_bfqq_budget_left(bfqq)) {
+       for (i = 0; i < bfqd->num_actuators; i++) {
+               list_for_each_entry(bfqq, &bfqd->active_list[i], bfqq_list)
+                       if (!RB_EMPTY_ROOT(&bfqq->sort_list) &&
+                               (in_serv_always_inject || bfqq->wr_coeff > 1) &&
+                               bfq_serv_to_charge(bfqq->next_rq, bfqq) <=
+                               bfq_bfqq_budget_left(bfqq)) {
                         /*
                          * Allow for only one large in-flight request
                          * on non-rotational devices, for the
@@ -4731,22 +4735,69 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd)
                         else
                                 limit = in_serv_bfqq->inject_limit;
  
-                       if (bfqd->rq_in_driver < limit) {
+                       if (bfqd->tot_rq_in_driver < limit) {
                                 bfqd->rqs_injected = true;
                                 return bfqq;
                         }
                 }
+       }
+
+       return NULL;
+}
+
+static struct bfq_queue *
+bfq_find_active_bfqq_for_actuator(struct bfq_data *bfqd, int idx)
+{
+       struct bfq_queue *bfqq;
+
+       if (bfqd->in_service_queue &&
+           bfqd->in_service_queue->actuator_idx == idx)
+               return bfqd->in_service_queue;
+
+       list_for_each_entry(bfqq, &bfqd->active_list[idx], bfqq_list) {
+               if (!RB_EMPTY_ROOT(&bfqq->sort_list) &&
+                       bfq_serv_to_charge(bfqq->next_rq, bfqq) <=
+                               bfq_bfqq_budget_left(bfqq)) {
+                       return bfqq;
+               }
+       }
  
         return NULL;
  }
  
  /*
+ * Perform a linear scan of the actuators, until an actuator is found
+ * for which the following two conditions hold: the load of the
+ * actuator is below the threshold (see comments on actuator_load_threshold
+ * for details), and there is a queue that contains I/O for that
+ * actuator. On success, return that queue.
+ */
+static struct bfq_queue *
+bfq_find_bfqq_for_underused_actuator(struct bfq_data *bfqd)
+{
+       int i;
+
+       for (i = 0 ; i < bfqd->num_actuators; i++) {
+               if (bfqd->rq_in_driver[i] < bfqd->actuator_load_threshold) {
+                       struct bfq_queue *bfqq =
+                               bfq_find_active_bfqq_for_actuator(bfqd, i);
+
+                       if (bfqq)
+                               return bfqq;
+               }
+       }
+
+       return NULL;
+}
+
+
+/*
   * Select a queue for service.  If we have a current queue in service,
   * check whether to continue servicing it, or retrieve and set a new one.
   */
  static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
  {
-       struct bfq_queue *bfqq;
+       struct bfq_queue *bfqq, *inject_bfqq;
         struct request *next_rq;
         enum bfqq_expiration reason = BFQQE_BUDGET_TIMEOUT;
  
@@ -4769,6 +4820,15 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
  
  check_queue:
         /*
+        *  If some actuator is underutilized, but the in-service
+        *  queue does not contain I/O for that actuator, then try to
+        *  inject I/O for that actuator.
+        */
+       inject_bfqq = bfq_find_bfqq_for_underused_actuator(bfqd);
+       if (inject_bfqq && inject_bfqq != bfqq)
+               return inject_bfqq;
+
+       /*
          * This loop is rarely executed more than once. Even when it
          * happens, it is much more convenient to re-execute this loop
          * than to return NULL and trigger a new dispatch to get a
@@ -5123,11 +5183,11 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
  
                 /*
                  * We exploit the bfq_finish_requeue_request hook to
-                * decrement rq_in_driver, but
+                * decrement tot_rq_in_driver, but
                  * bfq_finish_requeue_request will not be invoked on
                  * this request. So, to avoid unbalance, just start
-                * this request, without incrementing rq_in_driver. As
-                * a negative consequence, rq_in_driver is deceptively
+                * this request, without incrementing tot_rq_in_driver. As
+                * a negative consequence, tot_rq_in_driver is deceptively
                  * lower than it should be while this request is in
                  * service. This may cause bfq_schedule_dispatch to be
                  * invoked uselessly.
@@ -5136,7 +5196,7 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
                  * bfq_finish_requeue_request hook, if defined, is
                  * probably invoked also on this request. So, by
                  * exploiting this hook, we could 1) increment
-                * rq_in_driver here, and 2) decrement it in
+                * tot_rq_in_driver here, and 2) decrement it in
                  * bfq_finish_requeue_request. Such a solution would
                  * let the value of the counter be always accurate,
                  * but it would entail using an extra interface
@@ -5165,7 +5225,7 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
          * Of course, serving one request at a time may cause loss of
          * throughput.
          */
-       if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0)
+       if (bfqd->strict_guarantees && bfqd->tot_rq_in_driver > 0)
                 goto exit;
  
         bfqq = bfq_select_queue(bfqd);
@@ -5176,7 +5236,8 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
  
         if (rq) {
  inc_in_driver_start_rq:
-               bfqd->rq_in_driver++;
+               bfqd->rq_in_driver[bfqq->actuator_idx]++;
+               bfqd->tot_rq_in_driver++;
  start_rq:
                 rq->rq_flags |= RQF_STARTED;
         }
@@ -6243,7 +6304,7 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd)
         struct bfq_queue *bfqq = bfqd->in_service_queue;
  
         bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver,
-                                      bfqd->rq_in_driver);
+                                      bfqd->tot_rq_in_driver);
  
         if (bfqd->hw_tag == 1)
                 return;
@@ -6254,7 +6315,7 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd)
          * sum is not exact, as it's not taking into account deactivated
          * requests.
          */
-       if (bfqd->rq_in_driver + bfqd->queued <= BFQ_HW_QUEUE_THRESHOLD)
+       if (bfqd->tot_rq_in_driver + bfqd->queued <= BFQ_HW_QUEUE_THRESHOLD)
                 return;
  
         /*
@@ -6265,7 +6326,7 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd)
         if (bfqq && bfq_bfqq_has_short_ttime(bfqq) &&
             bfqq->dispatched + bfqq->queued[0] + bfqq->queued[1] <
             BFQ_HW_QUEUE_THRESHOLD &&
-           bfqd->rq_in_driver < BFQ_HW_QUEUE_THRESHOLD)
+           bfqd->tot_rq_in_driver < BFQ_HW_QUEUE_THRESHOLD)
                 return;
  
         if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
@@ -6286,7 +6347,8 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
  
         bfq_update_hw_tag(bfqd);
  
-       bfqd->rq_in_driver--;
+       bfqd->rq_in_driver[bfqq->actuator_idx]--;
+       bfqd->tot_rq_in_driver--;
         bfqq->dispatched--;
  
         if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) {
@@ -6406,7 +6468,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
                                         BFQQE_NO_MORE_REQUESTS);
         }
  
-       if (!bfqd->rq_in_driver)
+       if (!bfqd->tot_rq_in_driver)
                 bfq_schedule_dispatch(bfqd);
  }
  
@@ -6537,13 +6599,13 @@ static void bfq_update_inject_limit(struct bfq_data *bfqd,
          * conditions to do it, or we can lower the last base value
          * computed.
          *
-        * NOTE: (bfqd->rq_in_driver == 1) means that there is no I/O
+        * NOTE: (bfqd->tot_rq_in_driver == 1) means that there is no I/O
          * request in flight, because this function is in the code
          * path that handles the completion of a request of bfqq, and,
          * in particular, this function is executed before
-        * bfqd->rq_in_driver is decremented in such a code path.
+        * bfqd->tot_rq_in_driver is decremented in such a code path.
          */
-       if ((bfqq->last_serv_time_ns == 0 && bfqd->rq_in_driver == 1) ||
+       if ((bfqq->last_serv_time_ns == 0 && bfqd->tot_rq_in_driver == 1) ||
             tot_time_ns < bfqq->last_serv_time_ns) {
                 if (bfqq->last_serv_time_ns == 0) {
                         /*
@@ -6553,7 +6615,7 @@ static void bfq_update_inject_limit(struct bfq_data *bfqd,
                         bfqq->inject_limit = max_t(unsigned int, 1, old_limit);
                 }
                 bfqq->last_serv_time_ns = tot_time_ns;
-       } else if (!bfqd->rqs_injected && bfqd->rq_in_driver == 1)
+       } else if (!bfqd->rqs_injected && bfqd->tot_rq_in_driver == 1)
                 /*
                  * No I/O injected and no request still in service in
                  * the drive: these are the exact conditions for
@@ -7208,7 +7270,8 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
         bfqd->num_groups_with_pending_reqs = 0;
  #endif
  
-       INIT_LIST_HEAD(&bfqd->active_list);
+       INIT_LIST_HEAD(&bfqd->active_list[0]);
+       INIT_LIST_HEAD(&bfqd->active_list[1]);
         INIT_LIST_HEAD(&bfqd->idle_list);
         INIT_HLIST_HEAD(&bfqd->burst_list);
  
@@ -7253,6 +7316,9 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
                 ref_wr_duration[blk_queue_nonrot(bfqd->queue)];
         bfqd->peak_rate = ref_rate[blk_queue_nonrot(bfqd->queue)] * 2 / 3;
  
+       /* see comments on the definition of this field in struct bfq_data */
+       bfqd->actuator_load_threshold = 4;
+
         spin_lock_init(&bfqd->lock);
  
         /*
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h

index ba2ece8..2b4893a 100644 (file)
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -590,7 +590,12 @@ struct bfq_data {
         /* number of queued requests */
         int queued;
         /* number of requests dispatched and waiting for completion */
-       int rq_in_driver;
+       int tot_rq_in_driver;
+       /*
+        * number of requests dispatched and waiting for completion
+        * for each actuator
+        */
+       int rq_in_driver[BFQ_MAX_ACTUATORS];
  
         /* true if the device is non rotational and performs queueing */
         bool nonrot_with_queueing;
@@ -684,8 +689,13 @@ struct bfq_data {
         /* maximum budget allotted to a bfq_queue before rescheduling */
         int bfq_max_budget;
  
-       /* list of all the bfq_queues active on the device */
-       struct list_head active_list;
+       /*
+        * List of all the bfq_queues active for a specific actuator
+        * on the device. Keeping active queues separate on a
+        * per-actuator basis helps implementing per-actuator
+        * injection more efficiently.
+        */
+       struct list_head active_list[BFQ_MAX_ACTUATORS];
         /* list of all the bfq_queues idle on the device */
         struct list_head idle_list;
  
@@ -821,6 +831,29 @@ struct bfq_data {
         sector_t sector[BFQ_MAX_ACTUATORS];
         sector_t nr_sectors[BFQ_MAX_ACTUATORS];
         struct blk_independent_access_range ia_ranges[BFQ_MAX_ACTUATORS];
+
+       /*
+        * If the number of I/O requests queued in the device for a
+        * given actuator is below this threshold, then the actuator
+        * is deemed underutilized. If this condition is found to
+        * hold for some actuator upon a dispatch, but (i) the
+        * in-service queue does not contain I/O for that actuator,
+        * while (ii) some other queue does contain I/O for that
+        * actuator, then the head I/O request of the latter queue is
+        * returned (injected), instead of the head request of the
+        * currently in-service queue.
+        *
+        * We set the threshold, empirically, to the minimum possible
+        * value for which an actuator is fully utilized, or close to
+        * being fully utilized. By doing so, injected I/O 'steals' as
+        * few drive-queue slots as possible from the in-service
+        * queue. This reduces as much as possible the probability
+        * that the service of I/O from the in-service bfq_queue gets
+        * delayed because of slot exhaustion, i.e., because all the
+        * slots of the drive queue are filled with I/O injected from
+        * other queues (NCQ provides for 32 slots).
+        */
+       unsigned int actuator_load_threshold;
  };
  
  enum bfqq_state_flags {
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c

index ea4c3d7..7941b6f 100644 (file)
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -493,7 +493,7 @@ static void bfq_active_insert(struct bfq_service_tree *st,
         bfq_update_active_tree(node);
  
         if (bfqq)
-               list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
+               list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list[bfqq->actuator_idx]);
  
         bfq_inc_active_entities(entity);
  }
author	Davide Zini <davidezini2@gmail.com>
	Tue, 3 Jan 2023 14:55:02 +0000 (15:55 +0100)
committer	Jens Axboe <axboe@kernel.dk>
	Sun, 29 Jan 2023 22:18:33 +0000 (15:18 -0700)
block/bfq-cgroup.c		patch \| blob \| history
block/bfq-iosched.c		patch \| blob \| history
block/bfq-iosched.h		patch \| blob \| history
block/bfq-wf2q.c		patch \| blob \| history