sched/eevdf: Fix vruntime adjustment on reweight
authorAbel Wu <wuyun.abel@bytedance.com>
Tue, 7 Nov 2023 09:05:07 +0000 (17:05 +0800)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Sun, 3 Dec 2023 06:33:02 +0000 (07:33 +0100)
[ Upstream commit eab03c23c2a162085b13200d7942fc5a00b5ccc8 ]

vruntime of the (on_rq && !0-lag) entity needs to be adjusted when
it gets re-weighted, and the calculations can be simplified based
on the fact that re-weight won't change the w-average of all the
entities. Please check the proofs in comments.

But adjusting vruntime can also cause position change in RB-tree
hence require re-queue to fix up which might be costly. This might
be avoided by deferring adjustment to the time the entity actually
leaves tree (dequeue/pick), but that will negatively affect task
selection and probably not good enough either.

Fixes: 147f3efaa241 ("sched/fair: Implement an EEVDF-like scheduling policy")
Signed-off-by: Abel Wu <wuyun.abel@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20231107090510.71322-2-wuyun.abel@bytedance.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
kernel/sched/fair.c

index 1795f6f..0351320 100644 (file)
@@ -3626,41 +3626,140 @@ static inline void
 dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
 #endif
 
+static void reweight_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *se,
+                          unsigned long weight)
+{
+       unsigned long old_weight = se->load.weight;
+       u64 avruntime = avg_vruntime(cfs_rq);
+       s64 vlag, vslice;
+
+       /*
+        * VRUNTIME
+        * ========
+        *
+        * COROLLARY #1: The virtual runtime of the entity needs to be
+        * adjusted if re-weight at !0-lag point.
+        *
+        * Proof: For contradiction assume this is not true, so we can
+        * re-weight without changing vruntime at !0-lag point.
+        *
+        *             Weight   VRuntime   Avg-VRuntime
+        *     before    w          v            V
+        *      after    w'         v'           V'
+        *
+        * Since lag needs to be preserved through re-weight:
+        *
+        *      lag = (V - v)*w = (V'- v')*w', where v = v'
+        *      ==>     V' = (V - v)*w/w' + v           (1)
+        *
+        * Let W be the total weight of the entities before reweight,
+        * since V' is the new weighted average of entities:
+        *
+        *      V' = (WV + w'v - wv) / (W + w' - w)     (2)
+        *
+        * by using (1) & (2) we obtain:
+        *
+        *      (WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v
+        *      ==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v
+        *      ==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v
+        *      ==>     (V - v)*W/(W + w' - w) = (V - v)*w/w' (3)
+        *
+        * Since we are doing at !0-lag point which means V != v, we
+        * can simplify (3):
+        *
+        *      ==>     W / (W + w' - w) = w / w'
+        *      ==>     Ww' = Ww + ww' - ww
+        *      ==>     W * (w' - w) = w * (w' - w)
+        *      ==>     W = w   (re-weight indicates w' != w)
+        *
+        * So the cfs_rq contains only one entity, hence vruntime of
+        * the entity @v should always equal to the cfs_rq's weighted
+        * average vruntime @V, which means we will always re-weight
+        * at 0-lag point, thus breach assumption. Proof completed.
+        *
+        *
+        * COROLLARY #2: Re-weight does NOT affect weighted average
+        * vruntime of all the entities.
+        *
+        * Proof: According to corollary #1, Eq. (1) should be:
+        *
+        *      (V - v)*w = (V' - v')*w'
+        *      ==>    v' = V' - (V - v)*w/w'           (4)
+        *
+        * According to the weighted average formula, we have:
+        *
+        *      V' = (WV - wv + w'v') / (W - w + w')
+        *         = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w')
+        *         = (WV - wv + w'V' - Vw + wv) / (W - w + w')
+        *         = (WV + w'V' - Vw) / (W - w + w')
+        *
+        *      ==>  V'*(W - w + w') = WV + w'V' - Vw
+        *      ==>     V' * (W - w) = (W - w) * V      (5)
+        *
+        * If the entity is the only one in the cfs_rq, then reweight
+        * always occurs at 0-lag point, so V won't change. Or else
+        * there are other entities, hence W != w, then Eq. (5) turns
+        * into V' = V. So V won't change in either case, proof done.
+        *
+        *
+        * So according to corollary #1 & #2, the effect of re-weight
+        * on vruntime should be:
+        *
+        *      v' = V' - (V - v) * w / w'              (4)
+        *         = V  - (V - v) * w / w'
+        *         = V  - vl * w / w'
+        *         = V  - vl'
+        */
+       if (avruntime != se->vruntime) {
+               vlag = (s64)(avruntime - se->vruntime);
+               vlag = div_s64(vlag * old_weight, weight);
+               se->vruntime = avruntime - vlag;
+       }
+
+       /*
+        * DEADLINE
+        * ========
+        *
+        * When the weight changes, the virtual time slope changes and
+        * we should adjust the relative virtual deadline accordingly.
+        *
+        *      d' = v' + (d - v)*w/w'
+        *         = V' - (V - v)*w/w' + (d - v)*w/w'
+        *         = V  - (V - v)*w/w' + (d - v)*w/w'
+        *         = V  + (d - V)*w/w'
+        */
+       vslice = (s64)(se->deadline - avruntime);
+       vslice = div_s64(vslice * old_weight, weight);
+       se->deadline = avruntime + vslice;
+}
+
 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
                            unsigned long weight)
 {
-       unsigned long old_weight = se->load.weight;
+       bool curr = cfs_rq->curr == se;
 
        if (se->on_rq) {
                /* commit outstanding execution time */
-               if (cfs_rq->curr == se)
+               if (curr)
                        update_curr(cfs_rq);
                else
-                       avg_vruntime_sub(cfs_rq, se);
+                       __dequeue_entity(cfs_rq, se);
                update_load_sub(&cfs_rq->load, se->load.weight);
        }
        dequeue_load_avg(cfs_rq, se);
 
-       update_load_set(&se->load, weight);
-
        if (!se->on_rq) {
                /*
                 * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
                 * we need to scale se->vlag when w_i changes.
                 */
-               se->vlag = div_s64(se->vlag * old_weight, weight);
+               se->vlag = div_s64(se->vlag * se->load.weight, weight);
        } else {
-               s64 deadline = se->deadline - se->vruntime;
-               /*
-                * When the weight changes, the virtual time slope changes and
-                * we should adjust the relative virtual deadline accordingly.
-                */
-               deadline = div_s64(deadline * old_weight, weight);
-               se->deadline = se->vruntime + deadline;
-               if (se != cfs_rq->curr)
-                       min_deadline_cb_propagate(&se->run_node, NULL);
+               reweight_eevdf(cfs_rq, se, weight);
        }
 
+       update_load_set(&se->load, weight);
+
 #ifdef CONFIG_SMP
        do {
                u32 divider = get_pelt_divider(&se->avg);
@@ -3672,8 +3771,17 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
        enqueue_load_avg(cfs_rq, se);
        if (se->on_rq) {
                update_load_add(&cfs_rq->load, se->load.weight);
-               if (cfs_rq->curr != se)
-                       avg_vruntime_add(cfs_rq, se);
+               if (!curr) {
+                       /*
+                        * The entity's vruntime has been adjusted, so let's check
+                        * whether the rq-wide min_vruntime needs updated too. Since
+                        * the calculations above require stable min_vruntime rather
+                        * than up-to-date one, we do the update at the end of the
+                        * reweight process.
+                        */
+                       __enqueue_entity(cfs_rq, se);
+                       update_min_vruntime(cfs_rq);
+               }
        }
 }
 
@@ -3817,14 +3925,11 @@ static void update_cfs_group(struct sched_entity *se)
 
 #ifndef CONFIG_SMP
        shares = READ_ONCE(gcfs_rq->tg->shares);
-
-       if (likely(se->load.weight == shares))
-               return;
 #else
-       shares   = calc_group_shares(gcfs_rq);
+       shares = calc_group_shares(gcfs_rq);
 #endif
-
-       reweight_entity(cfs_rq_of(se), se, shares);
+       if (unlikely(se->load.weight != shares))
+               reweight_entity(cfs_rq_of(se), se, shares);
 }
 
 #else /* CONFIG_FAIR_GROUP_SCHED */