blk-iocost: revamp donation amount determination
authorTejun Heo <tj@kernel.org>
Tue, 1 Sep 2020 18:52:49 +0000 (14:52 -0400)
committerJens Axboe <axboe@kernel.dk>
Wed, 2 Sep 2020 01:38:32 +0000 (19:38 -0600)
iocost has various safety nets to combat inuse adjustment calculation
inaccuracies. With Andy's method implemented in transfer_surpluses(), inuse
adjustment calculations are now accurate and we can make donation amount
determinations accurate too.

* Stop keeping track of past usage history and using the maximum. Act on the
  immediate usage information.

* Remove donation constraints defined by SURPLUS_* constants. Donate
  whatever isn't used.

* Determine the donation amount so that the iocg will end up with
  MARGIN_TARGET_PCT budget at the end of the coming period assuming the same
  usage as the previous period. TARGET is set at 50% of period, which is the
  previous maximum. This provides smooth convergence for most repetitive IO
  patterns.

* Apply donation logic early at 20% budget. There's no risk in doing so as
  the calculation is based on the delta between the current budget and the
  target budget at the end of the coming period.

* Remove preemptive iocg activation for zero cost IOs. As donation can reach
  near zero now, the mere activation doesn't provide any protection anymore.
  In the unlikely case that this becomes a problem, the right solution is
  assigning appropriate costs for such IOs.

This significantly improves the donation determination logic while also
simplifying it. Now all donations are immediate, exact and smooth.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Andy Newell <newella@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
block/blk-iocost.c

index ecc23b827e5d2f35d2b1fdec056147951e579015..694f1487208aca0f23ffb2cc95c51b3546032cff 100644 (file)
@@ -217,12 +217,14 @@ enum {
        MAX_PERIOD              = USEC_PER_SEC,
 
        /*
-        * A cgroup's vtime can run 50% behind the device vtime, which
+        * iocg->vtime is targeted at 50% behind the device vtime, which
         * serves as its IO credit buffer.  Surplus weight adjustment is
         * immediately canceled if the vtime margin runs below 10%.
         */
        MARGIN_MIN_PCT          = 10,
-       MARGIN_MAX_PCT          = 50,
+       MARGIN_LOW_PCT          = 20,
+       MARGIN_TARGET_PCT       = 50,
+       MARGIN_MAX_PCT          = 100,
 
        /* Have some play in timer operations */
        TIMER_SLACK_PCT         = 1,
@@ -234,17 +236,6 @@ enum {
         */
        VTIME_VALID_DUR         = 300 * USEC_PER_SEC,
 
-       /*
-        * Remember the past three non-zero usages and use the max for
-        * surplus calculation.  Three slots guarantee that we remember one
-        * full period usage from the last active stretch even after
-        * partial deactivation and re-activation periods.  Don't start
-        * giving away weight before collecting two data points to prevent
-        * hweight adjustments based on one partial activation period.
-        */
-       NR_USAGE_SLOTS          = 3,
-       MIN_VALID_USAGES        = 2,
-
        /* 1/64k is granular enough and can easily be handled w/ u32 */
        WEIGHT_ONE              = 1 << 16,
 
@@ -280,14 +271,6 @@ enum {
        /* don't let cmds which take a very long time pin lagging for too long */
        MAX_LAGGING_PERIODS     = 10,
 
-       /*
-        * If usage% * 1.25 + 2% is lower than hweight% by more than 3%,
-        * donate the surplus.
-        */
-       SURPLUS_SCALE_PCT       = 125,                  /* * 125% */
-       SURPLUS_SCALE_ABS       = WEIGHT_ONE / 50,      /* + 2% */
-       SURPLUS_MIN_ADJ_DELTA   = WEIGHT_ONE / 33,      /* 3% */
-
        /* switch iff the conditions are met for longer than this */
        AUTOP_CYCLE_NSEC        = 10LLU * NSEC_PER_SEC,
 
@@ -376,6 +359,8 @@ struct ioc_params {
 
 struct ioc_margins {
        s64                             min;
+       s64                             low;
+       s64                             target;
        s64                             max;
 };
 
@@ -514,11 +499,7 @@ struct ioc_gq {
        struct iocg_stat                desc_stat;
        struct iocg_stat                last_stat;
        u64                             last_stat_abs_vusage;
-
-       /* usage is recorded as fractions of WEIGHT_ONE */
-       u32                             usage_delta_us;
-       int                             usage_idx;
-       u32                             usages[NR_USAGE_SLOTS];
+       u64                             usage_delta_us;
 
        /* this iocg's depth in the hierarchy and ancestors including self */
        int                             level;
@@ -737,6 +718,8 @@ static void ioc_refresh_margins(struct ioc *ioc)
        u64 vrate = atomic64_read(&ioc->vtime_rate);
 
        margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate;
+       margins->low = (period_us * MARGIN_LOW_PCT / 100) * vrate;
+       margins->target = (period_us * MARGIN_TARGET_PCT / 100) * vrate;
        margins->max = (period_us * MARGIN_MAX_PCT / 100) * vrate;
 }
 
@@ -1228,7 +1211,7 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now)
                return false;
        }
        if (!atomic_read(&blkg->use_delay) &&
-           time_before_eq64(vtime, now->vnow + ioc->margins.max))
+           time_before_eq64(vtime, now->vnow + ioc->margins.target))
                return false;
 
        /* use delay */
@@ -1527,7 +1510,7 @@ static u32 hweight_after_donation(struct ioc_gq *iocg, u32 hwm, u32 usage,
 {
        struct ioc *ioc = iocg->ioc;
        u64 vtime = atomic64_read(&iocg->vtime);
-       s64 excess;
+       s64 excess, delta, target, new_hwi;
 
        /* see whether minimum margin requirement is met */
        if (waitqueue_active(&iocg->waitq) ||
@@ -1542,15 +1525,28 @@ static u32 hweight_after_donation(struct ioc_gq *iocg, u32 hwm, u32 usage,
                vtime += excess;
        }
 
-       /* add margin */
-       usage = DIV_ROUND_UP(usage * SURPLUS_SCALE_PCT, 100);
-       usage += SURPLUS_SCALE_ABS;
-
-       /* don't bother if the surplus is too small */
-       if (usage + SURPLUS_MIN_ADJ_DELTA > hwm)
-               return hwm;
+       /*
+        * Let's say the distance between iocg's and device's vtimes as a
+        * fraction of period duration is delta. Assuming that the iocg will
+        * consume the usage determined above, we want to determine new_hwi so
+        * that delta equals MARGIN_TARGET at the end of the next period.
+        *
+        * We need to execute usage worth of IOs while spending the sum of the
+        * new budget (1 - MARGIN_TARGET) and the leftover from the last period
+        * (delta):
+        *
+        *   usage = (1 - MARGIN_TARGET + delta) * new_hwi
+        *
+        * Therefore, the new_hwi is:
+        *
+        *   new_hwi = usage / (1 - MARGIN_TARGET + delta)
+        */
+       delta = div64_s64(WEIGHT_ONE * (now->vnow - vtime),
+                         now->vnow - ioc->period_at_vtime);
+       target = WEIGHT_ONE * MARGIN_TARGET_PCT / 100;
+       new_hwi = div64_s64(WEIGHT_ONE * usage, WEIGHT_ONE - target + delta);
 
-       return usage;
+       return clamp_t(s64, new_hwi, 1, hwm);
 }
 
 /*
@@ -1812,7 +1808,7 @@ static void ioc_timer_fn(struct timer_list *timer)
        u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
        u32 missed_ppm[2], rq_wait_pct;
        u64 period_vtime;
-       int prev_busy_level, i;
+       int prev_busy_level;
 
        /* how were the latencies during the period? */
        ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
@@ -1857,11 +1853,10 @@ static void ioc_timer_fn(struct timer_list *timer)
        }
        commit_weights(ioc);
 
-       /* calc usages and see whether some weights need to be moved around */
+       /* calc usage and see whether some weights need to be moved around */
        list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
-               u64 vdone, vtime, usage_us;
-               u32 hw_active, hw_inuse, usage;
-               int uidx, nr_valid;
+               u64 vdone, vtime, usage_us, usage_dur;
+               u32 usage, hw_active, hw_inuse;
 
                /*
                 * Collect unused and wind vtime closer to vnow to prevent
@@ -1886,15 +1881,11 @@ static void ioc_timer_fn(struct timer_list *timer)
                        nr_lagging++;
 
                /*
-                * Determine absolute usage factoring in pending and in-flight
-                * IOs to avoid stalls and high-latency completions appearing as
-                * idle.
+                * Determine absolute usage factoring in in-flight IOs to avoid
+                * high-latency completions appearing as idle.
                 */
                usage_us = iocg->usage_delta_us;
-               if (waitqueue_active(&iocg->waitq) && time_before64(vtime, now.vnow))
-                       usage_us += DIV64_U64_ROUND_UP(
-                               cost_to_abs_cost(now.vnow - vtime, hw_inuse),
-                               now.vrate);
+
                if (vdone != vtime) {
                        u64 inflight_us = DIV64_U64_ROUND_UP(
                                cost_to_abs_cost(vtime - vdone, hw_inuse),
@@ -1902,43 +1893,22 @@ static void ioc_timer_fn(struct timer_list *timer)
                        usage_us = max(usage_us, inflight_us);
                }
 
-               /* convert to hweight based usage ratio and record */
-               uidx = (iocg->usage_idx + 1) % NR_USAGE_SLOTS;
-
-               if (time_after64(vtime, now.vnow - ioc->margins.min)) {
-                       iocg->usage_idx = uidx;
-                       iocg->usages[uidx] = WEIGHT_ONE;
-               } else if (usage_us) {
-                       u64 started_at, dur;
-
-                       if (time_after64(iocg->activated_at, ioc->period_at))
-                               started_at = iocg->activated_at;
-                       else
-                               started_at = ioc->period_at;
-
-                       dur = max_t(u64, now.now - started_at, 1);
+               /* convert to hweight based usage ratio */
+               if (time_after64(iocg->activated_at, ioc->period_at))
+                       usage_dur = max_t(u64, now.now - iocg->activated_at, 1);
+               else
+                       usage_dur = max_t(u64, now.now - ioc->period_at, 1);
 
-                       iocg->usage_idx = uidx;
-                       iocg->usages[uidx] = clamp_t(u32,
-                               DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE, dur),
+               usage = clamp_t(u32,
+                               DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE,
+                                                  usage_dur),
                                1, WEIGHT_ONE);
-               }
-
-               /* base the decision on max historical usage */
-               for (i = 0, usage = 0, nr_valid = 0; i < NR_USAGE_SLOTS; i++) {
-                       if (iocg->usages[i]) {
-                               usage = max(usage, iocg->usages[i]);
-                               nr_valid++;
-                       }
-               }
-               if (nr_valid < MIN_VALID_USAGES)
-                       usage = WEIGHT_ONE;
 
                /* see whether there's surplus vtime */
                WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
                if (hw_inuse < hw_active ||
                    (!waitqueue_active(&iocg->waitq) &&
-                    time_before64(vtime, now.vnow - ioc->margins.max))) {
+                    time_before64(vtime, now.vnow - ioc->margins.low))) {
                        u32 hwa, hwm, new_hwi;
 
                        /*
@@ -2175,15 +2145,14 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
        if (!ioc->enabled || !iocg->level)
                return;
 
-       /* always activate so that even 0 cost IOs get protected to some level */
-       if (!iocg_activate(iocg, &now))
-               return;
-
        /* calculate the absolute vtime cost */
        abs_cost = calc_vtime_cost(bio, iocg, false);
        if (!abs_cost)
                return;
 
+       if (!iocg_activate(iocg, &now))
+               return;
+
        iocg->cursor = bio_end_sector(bio);
 
        vtime = atomic64_read(&iocg->vtime);