MARGIN_MIN_PCT = 10,
MARGIN_LOW_PCT = 20,
MARGIN_TARGET_PCT = 50,
- MARGIN_MAX_PCT = 100,
INUSE_ADJ_STEP_PCT = 25,
/* Have some play in timer operations */
TIMER_SLACK_PCT = 1,
- /*
- * vtime can wrap well within a reasonable uptime when vrate is
- * consistently raised. Don't trust recorded cgroup vtime if the
- * period counter indicates that it's older than 5mins.
- */
- VTIME_VALID_DUR = 300 * USEC_PER_SEC,
-
/* 1/64k is granular enough and can easily be handled w/ u32 */
WEIGHT_ONE = 1 << 16,
s64 min;
s64 low;
s64 target;
- s64 max;
};
struct ioc_missed {
enum ioc_running running;
atomic64_t vtime_rate;
+ u64 vtime_base_rate;
+ s64 vtime_err;
seqcount_spinlock_t period_seqcount;
u64 period_at; /* wallclock starttime */
{
struct ioc_margins *margins = &ioc->margins;
u32 period_us = ioc->period_us;
- u64 vrate = atomic64_read(&ioc->vtime_rate);
+ u64 vrate = ioc->vtime_base_rate;
margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate;
margins->low = (period_us * MARGIN_LOW_PCT / 100) * vrate;
margins->target = (period_us * MARGIN_TARGET_PCT / 100) * vrate;
- margins->max = (period_us * MARGIN_MAX_PCT / 100) * vrate;
}
/* latency Qos params changed, update period_us and all the dependent params */
return idx;
/* step up/down based on the vrate */
- vrate_pct = div64_u64(atomic64_read(&ioc->vtime_rate) * 100,
- VTIME_PER_USEC);
+ vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC);
now_ns = ktime_get_ns();
if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
return true;
}
+/*
+ * When an iocg accumulates too much vtime or gets deactivated, we throw away
+ * some vtime, which lowers the overall device utilization. As the exact amount
+ * which is being thrown away is known, we can compensate by accelerating the
+ * vrate accordingly so that the extra vtime generated in the current period
+ * matches what got lost.
+ *
+ * @ioc: device state; reads period_at, period_us, vtime_base_rate and
+ *       vtime_err, writes vtime_err and (when time is left) vtime_rate
+ * @now: time snapshot; only now->now is read here
+ *
+ * Must be called with ioc->lock held.
+ *
+ * The compensation is spread over the time remaining in the current period
+ * and is clamped so that the resulting vtime_rate stays between roughly half
+ * and twice vtime_base_rate. Whatever could not be compensated stays in
+ * vtime_err for a later call, bounded to one period's worth of vtime at the
+ * base rate in either direction.
+ *
+ * If no time is left in the period, vtime_rate is left untouched and only
+ * the bound on vtime_err is applied.
+ */
+static void ioc_refresh_vrate(struct ioc *ioc, struct ioc_now *now)
+{
+ s64 pleft = ioc->period_at + ioc->period_us - now->now; /* time left in this period; presumably usecs like period_us - confirm */
+ s64 vperiod = ioc->period_us * ioc->vtime_base_rate; /* vtime generated over one period at the base rate */
+ s64 vcomp, vcomp_min, vcomp_max;
+
+ lockdep_assert_held(&ioc->lock);
+
+ /* we need some time left in this period */
+ if (pleft <= 0)
+ goto done;
+
+ /*
+ * Calculate how much vrate should be adjusted to offset the error.
+ * Limit the amount of adjustment and deduct the adjusted amount from
+ * the error.
+ *
+ * A negative vtime_err yields a positive vcomp, i.e. a vrate above the
+ * base rate. vcomp is limited to [-base_rate / 2, +base_rate]. The
+ * bounds are kept in s64 locals so that both clamp() limits have the
+ * same type as vcomp.
+ */
+ vcomp = -div64_s64(ioc->vtime_err, pleft);
+ vcomp_min = -(ioc->vtime_base_rate >> 1);
+ vcomp_max = ioc->vtime_base_rate;
+ vcomp = clamp(vcomp, vcomp_min, vcomp_max);
+
+ ioc->vtime_err += vcomp * pleft; /* amount the adjusted rate will make up over the rest of the period */
+
+ atomic64_set(&ioc->vtime_rate, ioc->vtime_base_rate + vcomp); /* effective rate = base rate + compensation */
+done:
+ /* bound how much error can accumulate (applied on the early-exit path as well) */
+ ioc->vtime_err = clamp(ioc->vtime_err, -vperiod, vperiod);
+}
+
/* take a snapshot of the current [v]time and vrate */
static void ioc_now(struct ioc *ioc, struct ioc_now *now)
{
static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
{
struct ioc *ioc = iocg->ioc;
- u64 last_period, cur_period, max_period_delta;
- u64 vtime, vmin;
+ u64 last_period, cur_period;
+ u64 vtime, vtarget;
int i;
/*
goto fail_unlock;
/*
- * vtime may wrap when vrate is raised substantially due to
- * underestimated IO costs. Look at the period and ignore its
- * vtime if the iocg has been idle for too long. Also, cap the
- * budget it can start with to the margin.
+ * Always start with the target budget. On deactivation, we throw away
+ * anything above it.
*/
- max_period_delta = DIV64_U64_ROUND_UP(VTIME_VALID_DUR, ioc->period_us);
+ vtarget = now->vnow - ioc->margins.target;
vtime = atomic64_read(&iocg->vtime);
- vmin = now->vnow - ioc->margins.max;
- if (last_period + max_period_delta < cur_period ||
- time_before64(vtime, vmin)) {
- atomic64_add(vmin - vtime, &iocg->vtime);
- atomic64_add(vmin - vtime, &iocg->done_vtime);
- vtime = vmin;
- }
+ atomic64_add(vtarget - vtime, &iocg->vtime);
+ atomic64_add(vtarget - vtime, &iocg->done_vtime);
+ vtime = vtarget;
/*
* Activate, propagate weight and start period timer if not
current_hweight(iocg, &hwa, NULL);
vover = atomic64_read(&iocg->vtime) +
abs_cost_to_cost(iocg->abs_vdebt, hwa) - now->vnow;
- vover_pct = div64_s64(100 * vover, ioc->period_us * now->vrate);
+ vover_pct = div64_s64(100 * vover,
+ ioc->period_us * ioc->vtime_base_rate);
if (vover_pct <= MIN_DELAY_THR_PCT)
new_delay = 0;
/* determine next wakeup, add a timer margin to guarantee chunking */
vshortage = -ctx.vbudget;
expires = now->now_ns +
- DIV64_U64_ROUND_UP(vshortage, now->vrate) * NSEC_PER_USEC;
+ DIV64_U64_ROUND_UP(vshortage, ioc->vtime_base_rate) *
+ NSEC_PER_USEC;
expires += ioc->timer_slack_ns;
/* if already active and close enough, don't bother */
/* collect per-cpu counters and propagate the deltas to the parent */
static void iocg_flush_stat_one(struct ioc_gq *iocg, struct ioc_now *now)
{
+ struct ioc *ioc = iocg->ioc;
struct iocg_stat new_stat;
u64 abs_vusage = 0;
u64 vusage_delta;
vusage_delta = abs_vusage - iocg->last_stat_abs_vusage;
iocg->last_stat_abs_vusage = abs_vusage;
- iocg->usage_delta_us = div64_u64(vusage_delta, now->vrate);
+ iocg->usage_delta_us = div64_u64(vusage_delta, ioc->vtime_base_rate);
iocg->local_stat.usage_us += iocg->usage_delta_us;
new_stat.usage_us =
* capacity. @hwm is the upper bound and used to signal no donation. This
* function also throws away @iocg's excess budget.
*/
-static u32 hweight_after_donation(struct ioc_gq *iocg, u32 hwm, u32 usage,
- struct ioc_now *now)
+static u32 hweight_after_donation(struct ioc_gq *iocg, u32 old_hwi, u32 hwm,
+ u32 usage, struct ioc_now *now)
{
struct ioc *ioc = iocg->ioc;
u64 vtime = atomic64_read(&iocg->vtime);
time_after64(vtime, now->vnow - ioc->margins.min))
return hwm;
- /* throw away excess above max */
- excess = now->vnow - vtime - ioc->margins.max;
+ /* throw away excess above target */
+ excess = now->vnow - vtime - ioc->margins.target;
if (excess > 0) {
atomic64_add(excess, &iocg->vtime);
atomic64_add(excess, &iocg->done_vtime);
vtime += excess;
+ ioc->vtime_err -= div64_u64(excess * old_hwi, WEIGHT_ONE);
}
/*
nr_debtors++;
} else if (iocg_is_idle(iocg)) {
/* no waiter and idle, deactivate */
+ u64 vtime = atomic64_read(&iocg->vtime);
+ s64 excess;
+
+ /*
+ * @iocg has been inactive for a full duration and will
+ * have a high budget. Account anything above target as
+ * error and throw away. On reactivation, it'll start
+ * with the target budget.
+ */
+ excess = now.vnow - vtime - ioc->margins.target;
+ if (excess > 0) {
+ u32 old_hwi;
+
+ current_hweight(iocg, NULL, &old_hwi);
+ ioc->vtime_err -= div64_u64(excess * old_hwi,
+ WEIGHT_ONE);
+ }
+
__propagate_weights(iocg, 0, 0, false, &now);
list_del_init(&iocg->active_list);
}
if (vdone != vtime) {
u64 inflight_us = DIV64_U64_ROUND_UP(
cost_to_abs_cost(vtime - vdone, hw_inuse),
- now.vrate);
+ ioc->vtime_base_rate);
usage_us = max(usage_us, inflight_us);
}
if (hw_inuse < hw_active ||
(!waitqueue_active(&iocg->waitq) &&
time_before64(vtime, now.vnow - ioc->margins.low))) {
- u32 hwa, hwm, new_hwi;
+ u32 hwa, old_hwi, hwm, new_hwi;
/*
* Already donating or accumulated enough to start.
* Determine the donation amount.
*/
- current_hweight(iocg, &hwa, NULL);
+ current_hweight(iocg, &hwa, &old_hwi);
hwm = current_hweight_max(iocg);
- new_hwi = hweight_after_donation(iocg, hwm, usage,
- &now);
+ new_hwi = hweight_after_donation(iocg, old_hwi, hwm,
+ usage, &now);
if (new_hwi < hwm) {
iocg->hweight_donating = hwa;
iocg->hweight_after_donation = new_hwi;
ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) {
- u64 vrate = atomic64_read(&ioc->vtime_rate);
+ u64 vrate = ioc->vtime_base_rate;
u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
/* rq_wait signal is always reliable, ignore user vrate_min */
trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
nr_lagging, nr_shortages);
- atomic64_set(&ioc->vtime_rate, vrate);
+ ioc->vtime_base_rate = vrate;
ioc_refresh_margins(ioc);
} else if (ioc->busy_level != prev_busy_level || nr_lagging) {
trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
ioc_start_period(ioc, &now);
} else {
ioc->busy_level = 0;
+ ioc->vtime_err = 0;
ioc->running = IOC_IDLE;
}
+
+ ioc_refresh_vrate(ioc, &now);
}
spin_unlock_irq(&ioc->lock);
INIT_LIST_HEAD(&ioc->active_iocgs);
ioc->running = IOC_IDLE;
+ ioc->vtime_base_rate = VTIME_PER_USEC;
atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock);
ioc->period_at = ktime_to_us(ktime_get());
if (iocg->level == 0) {
unsigned vp10k = DIV64_U64_ROUND_CLOSEST(
- atomic64_read(&ioc->vtime_rate) * 10000,
+ ioc->vtime_base_rate * 10000,
VTIME_PER_USEC);
pos += scnprintf(buf + pos, size - pos, " cost.vrate=%u.%02u",
vp10k / 100, vp10k % 100);