Merge branch 'for-4.15' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0ae69af..4037e19 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
  *
@@ -32,6 +33,7 @@
 #include <linux/mempolicy.h>
 #include <linux/migrate.h>
 #include <linux/task_work.h>
+#include <linux/sched/isolation.h>
 
 #include <trace/events/sched.h>
 
@@ -716,13 +718,8 @@ void init_entity_runnable_average(struct sched_entity *se)
 {
        struct sched_avg *sa = &se->avg;
 
-       sa->last_update_time = 0;
-       /*
-        * sched_avg's period_contrib should be strictly less then 1024, so
-        * we give it 1023 to make sure it is almost a period (1024us), and
-        * will definitely be update (after enqueue).
-        */
-       sa->period_contrib = 1023;
+       memset(sa, 0, sizeof(*sa));
+
        /*
         * Tasks are initialized with full load to be seen as heavy tasks until
         * they get a chance to stabilize to their real load level.
@@ -730,13 +727,10 @@ void init_entity_runnable_average(struct sched_entity *se)
         * nothing has been attached to the task group yet.
         */
        if (entity_is_task(se))
-               sa->load_avg = scale_load_down(se->load.weight);
-       sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
-       /*
-        * At this point, util_avg won't be used in select_task_rq_fair anyway
-        */
-       sa->util_avg = 0;
-       sa->util_sum = 0;
+               sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight);
+
+       se->runnable_weight = se->load.weight;
+
        /* when this task is enqueued, it will contribute to its cfs_rq's load_avg */
 }
 
@@ -784,7 +778,6 @@ void post_init_entity_util_avg(struct sched_entity *se)
                } else {
                        sa->util_avg = cap;
                }
-               sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
        }
 
        if (entity_is_task(se)) {
@@ -2025,7 +2018,7 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
                delta = runtime - p->last_sum_exec_runtime;
                *period = now - p->last_task_numa_placement;
        } else {
-               delta = p->se.avg.load_sum / p->se.load.weight;
+               delta = p->se.avg.load_sum;
                *period = LOAD_AVG_MAX;
        }
 
@@ -2692,18 +2685,226 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
        cfs_rq->nr_running--;
 }
 
+/*
+ * Signed add and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define add_positive(_ptr, _val) do {                           \
+       typeof(_ptr) ptr = (_ptr);                              \
+       typeof(_val) val = (_val);                              \
+       typeof(*ptr) res, var = READ_ONCE(*ptr);                \
+                                                               \
+       res = var + val;                                        \
+                                                               \
+       if (val < 0 && res > var)                               \
+               res = 0;                                        \
+                                                               \
+       WRITE_ONCE(*ptr, res);                                  \
+} while (0)
+
+/*
+ * Unsigned subtract and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define sub_positive(_ptr, _val) do {                          \
+       typeof(_ptr) ptr = (_ptr);                              \
+       typeof(*ptr) val = (_val);                              \
+       typeof(*ptr) res, var = READ_ONCE(*ptr);                \
+       res = var - val;                                        \
+       if (res > var)                                          \
+               res = 0;                                        \
+       WRITE_ONCE(*ptr, res);                                  \
+} while (0)
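
As an aside for readers new to these macros, the clamp is what keeps a racing reader from ever observing a wrapped-around value. A minimal userspace sketch (not kernel code: READ_ONCE()/WRITE_ONCE() are reduced to plain volatile accesses, and the value is an ordinary unsigned long):

    #include <stdio.h>

    /* Stand-ins for the kernel's READ_ONCE()/WRITE_ONCE() (GNU C, like the kernel). */
    #define READ_ONCE(x)        (*(volatile typeof(x) *)&(x))
    #define WRITE_ONCE(x, v)    (*(volatile typeof(x) *)&(x) = (v))

    #define sub_positive(_ptr, _val) do {                   \
            typeof(_ptr) ptr = (_ptr);                      \
            typeof(*ptr) val = (_val);                      \
            typeof(*ptr) res, var = READ_ONCE(*ptr);        \
            res = var - val;                                \
            if (res > var)     /* unsigned wrap-around */   \
                    res = 0;                                \
            WRITE_ONCE(*ptr, res);                          \
    } while (0)

    int main(void)
    {
            unsigned long avg = 100;

            sub_positive(&avg, 30);     /* 100 - 30 -> 70          */
            sub_positive(&avg, 500);    /* would wrap; clamps to 0 */
            printf("%lu\n", avg);       /* prints 0                */
            return 0;
    }
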
+
+#ifdef CONFIG_SMP
+/*
+ * XXX we want to get rid of these helpers and use the full load resolution.
+ */
+static inline long se_weight(struct sched_entity *se)
+{
+       return scale_load_down(se->load.weight);
+}
+
+static inline long se_runnable(struct sched_entity *se)
+{
+       return scale_load_down(se->runnable_weight);
+}
+
+static inline void
+enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+       cfs_rq->runnable_weight += se->runnable_weight;
+
+       cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg;
+       cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum;
+}
+
+static inline void
+dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+       cfs_rq->runnable_weight -= se->runnable_weight;
+
+       sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg);
+       sub_positive(&cfs_rq->avg.runnable_load_sum,
+                    se_runnable(se) * se->avg.runnable_load_sum);
+}
+
+static inline void
+enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+       cfs_rq->avg.load_avg += se->avg.load_avg;
+       cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
+}
+
+static inline void
+dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+       sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
+       sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
+}
+#else
+static inline void
+enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
+static inline void
+dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
+static inline void
+enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
+static inline void
+dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
+#endif
+
+static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
+                           unsigned long weight, unsigned long runnable)
+{
+       if (se->on_rq) {
+               /* commit outstanding execution time */
+               if (cfs_rq->curr == se)
+                       update_curr(cfs_rq);
+               account_entity_dequeue(cfs_rq, se);
+               dequeue_runnable_load_avg(cfs_rq, se);
+       }
+       dequeue_load_avg(cfs_rq, se);
+
+       se->runnable_weight = runnable;
+       update_load_set(&se->load, weight);
+
+#ifdef CONFIG_SMP
+       do {
+               u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;
+
+               se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
+               se->avg.runnable_load_avg =
+                       div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider);
+       } while (0);
+#endif
+
+       enqueue_load_avg(cfs_rq, se);
+       if (se->on_rq) {
+               account_entity_enqueue(cfs_rq, se);
+               enqueue_runnable_load_avg(cfs_rq, se);
+       }
+}
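
The divider used here is the saturation value of a PELT sum, LOAD_AVG_MAX - 1024 + period_contrib, so dividing the weight-free sum by it and multiplying by the (new) weight re-scales the average in one step. A standalone sketch with made-up numbers (only the constant and the arithmetic are taken from the hunk above; the helper name and inputs are illustrative):

    #include <stdio.h>

    #define LOAD_AVG_MAX 47742      /* ~ the largest value a PELT sum can reach */

    /* Re-scale an entity average from its weight-free sum, as reweight_entity() does. */
    static unsigned long recompute_avg(unsigned long weight, unsigned long load_sum,
                                       unsigned int period_contrib)
    {
            unsigned int divider = LOAD_AVG_MAX - 1024 + period_contrib;

            return weight * load_sum / divider;
    }

    int main(void)
    {
            /* hypothetical: an entity that has been runnable 100% of the time */
            unsigned long sum = 46718;  /* equals the divider when period_contrib == 0 */

            printf("nice  0 (weight 1024): load_avg = %lu\n", recompute_avg(1024, sum, 0));
            printf("nice -5 (weight 3121): load_avg = %lu\n", recompute_avg(3121, sum, 0));
            return 0;
    }
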
+
+void reweight_task(struct task_struct *p, int prio)
+{
+       struct sched_entity *se = &p->se;
+       struct cfs_rq *cfs_rq = cfs_rq_of(se);
+       struct load_weight *load = &se->load;
+       unsigned long weight = scale_load(sched_prio_to_weight[prio]);
+
+       reweight_entity(cfs_rq, se, weight, weight);
+       load->inv_weight = sched_prio_to_wmult[prio];
+}
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 # ifdef CONFIG_SMP
-static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
+/*
+ * All this does is approximate the hierarchical proportion which includes that
+ * global sum we all love to hate.
+ *
+ * That is, the weight of a group entity is the proportional share of the
+ * group weight based on the group runqueue weights. That is:
+ *
+ *                     tg->weight * grq->load.weight
+ *   ge->load.weight = -----------------------------               (1)
+ *                       \Sum grq->load.weight
+ *
+ * Now, because that sum is prohibitively expensive to compute (been there,
+ * done that) we approximate it with this average stuff. The average
+ * moves slower and therefore the approximation is cheaper and more stable.
+ *
+ * So instead of the above, we substitute:
+ *
+ *   grq->load.weight -> grq->avg.load_avg                         (2)
+ *
+ * which yields the following:
+ *
+ *                     tg->weight * grq->avg.load_avg
+ *   ge->load.weight = ------------------------------              (3)
+ *                             tg->load_avg
+ *
+ * Where: tg->load_avg ~= \Sum grq->avg.load_avg
+ *
+ * That is shares_avg, and it is right (given the approximation (2)).
+ *
+ * The problem with it is that because the average is slow -- it was designed
+ * to be exactly that of course -- this leads to transients in boundary
+ * conditions. Specifically, the case where the group was idle and we start
+ * one task. It takes time for our CPU's grq->avg.load_avg to build up,
+ * yielding bad latency etc..
+ *
+ * Now, in that special case (1) reduces to:
+ *
+ *                     tg->weight * grq->load.weight
+ *   ge->load.weight = ----------------------------- = tg->weight   (4)
+ *                         grq->load.weight
+ *
+ * That is, the sum collapses because all other CPUs are idle; the UP scenario.
+ *
+ * So what we do is modify our approximation (3) to approach (4) in the (near)
+ * UP case, like:
+ *
+ *   ge->load.weight =
+ *
+ *              tg->weight * grq->load.weight
+ *     ---------------------------------------------------         (5)
+ *     tg->load_avg - grq->avg.load_avg + grq->load.weight
+ *
+ * But because grq->load.weight can drop to 0, resulting in a divide by zero,
+ * we need to use grq->avg.load_avg as its lower bound, which then gives:
+ *
+ *
+ *                     tg->weight * grq->load.weight
+ *   ge->load.weight = -----------------------------              (6)
+ *                             tg_load_avg'
+ *
+ * Where:
+ *
+ *   tg_load_avg' = tg->load_avg - grq->avg.load_avg +
+ *                  max(grq->load.weight, grq->avg.load_avg)
+ *
+ * And that is shares_weight and is icky. In the (near) UP case it approaches
+ * (4) while in the normal case it approaches (3). It consistently
+ * overestimates the ge->load.weight and therefore:
+ *
+ *   \Sum ge->load.weight >= tg->weight
+ *
+ * hence icky!
+ */
+static long calc_group_shares(struct cfs_rq *cfs_rq)
 {
-       long tg_weight, load, shares;
+       long tg_weight, tg_shares, load, shares;
+       struct task_group *tg = cfs_rq->tg;
 
-       /*
-        * This really should be: cfs_rq->avg.load_avg, but instead we use
-        * cfs_rq->load.weight, which is its upper bound. This helps ramp up
-        * the shares for small weight interactive tasks.
-        */
-       load = scale_load_down(cfs_rq->load.weight);
+       tg_shares = READ_ONCE(tg->shares);
+
+       load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
 
        tg_weight = atomic_long_read(&tg->load_avg);
 
@@ -2711,7 +2912,7 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
        tg_weight -= cfs_rq->tg_load_avg_contrib;
        tg_weight += load;
 
-       shares = (tg->shares * load);
+       shares = (tg_shares * load);
        if (tg_weight)
                shares /= tg_weight;
 
@@ -2727,63 +2928,86 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
         * case no task is runnable on a CPU MIN_SHARES=2 should be returned
         * instead of 0.
         */
-       if (shares < MIN_SHARES)
-               shares = MIN_SHARES;
-       if (shares > tg->shares)
-               shares = tg->shares;
-
-       return shares;
+       return clamp_t(long, shares, MIN_SHARES, tg_shares);
 }
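
Taken out of context, formula (6) is small enough to experiment with. The following sketch mirrors calc_group_shares() with plain longs and hypothetical inputs (no scale_load_down(), no atomics):

    #include <stdio.h>

    #define MIN_SHARES 2

    static long clampl(long v, long lo, long hi)
    {
            return v < lo ? lo : (v > hi ? hi : v);
    }

    /* Formula (6): approximate the group entity weight on one CPU. */
    static long group_shares(long tg_shares, long tg_load_avg,
                             long grq_load_avg, long grq_weight)
    {
            long load = grq_weight > grq_load_avg ? grq_weight : grq_load_avg;
            long tg_weight = tg_load_avg - grq_load_avg + load;
            long shares = tg_shares * load;

            if (tg_weight)
                    shares /= tg_weight;

            return clampl(shares, MIN_SHARES, tg_shares);
    }

    int main(void)
    {
            /* near-UP case: the whole group load sits on this CPU */
            printf("%ld\n", group_shares(1024, 500, 500, 600));   /* -> 1024 (== tg->shares) */
            /* load spread over ~4 CPUs */
            printf("%ld\n", group_shares(1024, 2000, 500, 500));  /* -> 256 */
            return 0;
    }

With the whole group's load on one runqueue the result collapses to tg->shares, matching (4); spread over several CPUs it approaches (3).
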
-# else /* CONFIG_SMP */
-static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
-{
-       return tg->shares;
-}
-# endif /* CONFIG_SMP */
 
-static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
-                           unsigned long weight)
+/*
+ * This calculates the effective runnable weight for a group entity based on
+ * the group entity weight calculated above.
+ *
+ * Because of the above approximation (2), our group entity weight is
+ * a load_avg based ratio (3). This means that it includes blocked load and
+ * does not represent the runnable weight.
+ *
+ * Approximate the group entity's runnable weight per ratio from the group
+ * runqueue:
+ *
+ *                                          grq->avg.runnable_load_avg
+ *   ge->runnable_weight = ge->load.weight * -------------------------- (7)
+ *                                              grq->avg.load_avg
+ *
+ * However, analogous to above, since the avg numbers are slow, this leads to
+ * transients in the from-idle case. Instead we use:
+ *
+ *   ge->runnable_weight = ge->load.weight *
+ *
+ *             max(grq->avg.runnable_load_avg, grq->runnable_weight)
+ *             -----------------------------------------------------   (8)
+ *                   max(grq->avg.load_avg, grq->load.weight)
+ *
+ * Where these max() serve both to use the 'instant' values to fix the slow
+ * from-idle and avoid the /0 on to-idle, similar to (6).
+ */
+static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
 {
-       if (se->on_rq) {
-               /* commit outstanding execution time */
-               if (cfs_rq->curr == se)
-                       update_curr(cfs_rq);
-               account_entity_dequeue(cfs_rq, se);
-       }
+       long runnable, load_avg;
 
-       update_load_set(&se->load, weight);
+       load_avg = max(cfs_rq->avg.load_avg,
+                      scale_load_down(cfs_rq->load.weight));
 
-       if (se->on_rq)
-               account_entity_enqueue(cfs_rq, se);
+       runnable = max(cfs_rq->avg.runnable_load_avg,
+                      scale_load_down(cfs_rq->runnable_weight));
+
+       runnable *= shares;
+       if (load_avg)
+               runnable /= load_avg;
+
+       return clamp_t(long, runnable, MIN_SHARES, shares);
 }
+# endif /* CONFIG_SMP */
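
Formula (8) likewise, as a sketch with hypothetical inputs (the clamp to [MIN_SHARES, shares] matches the function above):

    #include <stdio.h>

    #define MIN_SHARES 2

    static long maxl(long a, long b) { return a > b ? a : b; }

    /* Formula (8): scale the group shares by the runnable fraction of the runqueue. */
    static long group_runnable(long shares,
                               long grq_runnable_load_avg, long grq_runnable_weight,
                               long grq_load_avg, long grq_load_weight)
    {
            long runnable = maxl(grq_runnable_load_avg, grq_runnable_weight);
            long load_avg = maxl(grq_load_avg, grq_load_weight);

            runnable *= shares;
            if (load_avg)
                    runnable /= load_avg;

            if (runnable < MIN_SHARES)
                    runnable = MIN_SHARES;
            if (runnable > shares)
                    runnable = shares;
            return runnable;
    }

    int main(void)
    {
            /* half of this CPU's group load is blocked: runnable weight is halved */
            printf("%ld\n", group_runnable(1024, 250, 250, 500, 500));  /* -> 512 */
            return 0;
    }
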
 
 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
 
-static void update_cfs_shares(struct sched_entity *se)
+/*
+ * Recomputes the group entity based on the current state of its group
+ * runqueue.
+ */
+static void update_cfs_group(struct sched_entity *se)
 {
-       struct cfs_rq *cfs_rq = group_cfs_rq(se);
-       struct task_group *tg;
-       long shares;
+       struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+       long shares, runnable;
 
-       if (!cfs_rq)
+       if (!gcfs_rq)
                return;
 
-       if (throttled_hierarchy(cfs_rq))
+       if (throttled_hierarchy(gcfs_rq))
                return;
 
-       tg = cfs_rq->tg;
-
 #ifndef CONFIG_SMP
-       if (likely(se->load.weight == tg->shares))
+       runnable = shares = READ_ONCE(gcfs_rq->tg->shares);
+
+       if (likely(se->load.weight == shares))
                return;
+#else
+       shares   = calc_group_shares(gcfs_rq);
+       runnable = calc_group_runnable(gcfs_rq, shares);
 #endif
-       shares = calc_cfs_shares(cfs_rq, tg);
 
-       reweight_entity(cfs_rq_of(se), se, shares);
+       reweight_entity(cfs_rq_of(se), se, shares, runnable);
 }
 
 #else /* CONFIG_FAIR_GROUP_SCHED */
-static inline void update_cfs_shares(struct sched_entity *se)
+static inline void update_cfs_group(struct sched_entity *se)
 {
 }
 #endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -2892,7 +3116,7 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
  */
 static __always_inline u32
 accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
-              unsigned long weight, int running, struct cfs_rq *cfs_rq)
+              unsigned long load, unsigned long runnable, int running)
 {
        unsigned long scale_freq, scale_cpu;
        u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
@@ -2909,10 +3133,8 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
         */
        if (periods) {
                sa->load_sum = decay_load(sa->load_sum, periods);
-               if (cfs_rq) {
-                       cfs_rq->runnable_load_sum =
-                               decay_load(cfs_rq->runnable_load_sum, periods);
-               }
+               sa->runnable_load_sum =
+                       decay_load(sa->runnable_load_sum, periods);
                sa->util_sum = decay_load((u64)(sa->util_sum), periods);
 
                /*
@@ -2925,11 +3147,10 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
        sa->period_contrib = delta;
 
        contrib = cap_scale(contrib, scale_freq);
-       if (weight) {
-               sa->load_sum += weight * contrib;
-               if (cfs_rq)
-                       cfs_rq->runnable_load_sum += weight * contrib;
-       }
+       if (load)
+               sa->load_sum += load * contrib;
+       if (runnable)
+               sa->runnable_load_sum += runnable * contrib;
        if (running)
                sa->util_sum += contrib * scale_cpu;
 
@@ -2965,8 +3186,8 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
  *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
  */
 static __always_inline int
-___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
-                 unsigned long weight, int running, struct cfs_rq *cfs_rq)
+___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
+                 unsigned long load, unsigned long runnable, int running)
 {
        u64 delta;
 
@@ -2999,8 +3220,8 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
         * this happens during idle_balance() which calls
         * update_blocked_averages()
         */
-       if (!weight)
-               running = 0;
+       if (!load)
+               runnable = running = 0;
 
        /*
         * Now we know we crossed measurement unit boundaries. The *_avg
@@ -3009,63 +3230,96 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
         * Step 1: accumulate *_sum since last_update_time. If we haven't
         * crossed period boundaries, finish.
         */
-       if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq))
+       if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
                return 0;
 
+       return 1;
+}
+
+static __always_inline void
+___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable)
+{
+       u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
+
        /*
         * Step 2: update *_avg.
         */
-       sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
-       if (cfs_rq) {
-               cfs_rq->runnable_load_avg =
-                       div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
-       }
-       sa->util_avg = sa->util_sum / (LOAD_AVG_MAX - 1024 + sa->period_contrib);
-
-       return 1;
+       sa->load_avg = div_u64(load * sa->load_sum, divider);
+       sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider);
+       sa->util_avg = sa->util_sum / divider;
 }
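
The two-step structure (accumulate the decayed sums, then divide once by the series maximum) is easier to see in floating point. A rough sketch, ignoring the period_contrib correction to the divider and the fixed-point details (y and LOAD_AVG_MAX are the usual PELT parameters: the series halves every 32 ms and sums to roughly 47742):

    #include <stdio.h>
    #include <math.h>

    #define LOAD_AVG_MAX 47742      /* ~ 1024 / (1 - y) */

    int main(void)
    {
            double y = pow(0.5, 1.0 / 32.0);  /* per-millisecond decay, halves every 32 ms */
            double sum = 0.0;
            int ms;

            /* Step 1: accumulate a decayed sum for a task runnable 100% of the time. */
            for (ms = 0; ms < 1000; ms++)
                    sum = sum * y + 1024.0;

            /* Step 2: turn the sum into an average by dividing out the series maximum. */
            printf("sum = %.0f (max %d), avg ~= %.0f / 1024\n",
                   sum, LOAD_AVG_MAX, sum / LOAD_AVG_MAX * 1024.0);
            return 0;
    }

An always-runnable task saturates near 1024; a task runnable half the time would settle near 512.
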
 
+/*
+ * sched_entity:
+ *
+ *   task:
+ *     se_runnable() == se_weight()
+ *
+ *   group: [ see update_cfs_group() ]
+ *     se_weight()   = tg->weight * grq->load_avg / tg->load_avg
+ *     se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg
+ *
+ *   load_sum := runnable_sum
+ *   load_avg = se_weight(se) * runnable_avg
+ *
+ *   runnable_load_sum := runnable_sum
+ *   runnable_load_avg = se_runnable(se) * runnable_avg
+ *
+ * XXX collapse load_sum and runnable_load_sum
+ *
+ * cfs_rq:
+ *
+ *   load_sum = \Sum se_weight(se) * se->avg.load_sum
+ *   load_avg = \Sum se->avg.load_avg
+ *
+ *   runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum
+ *   runnable_load_avg = \Sum se->avg.runnable_load_avg
+ */
+
 static int
 __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
 {
-       return ___update_load_avg(now, cpu, &se->avg, 0, 0, NULL);
+       if (entity_is_task(se))
+               se->runnable_weight = se->load.weight;
+
+       if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
+               ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
+               return 1;
+       }
+
+       return 0;
 }
 
 static int
 __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-       return ___update_load_avg(now, cpu, &se->avg,
-                                 se->on_rq * scale_load_down(se->load.weight),
-                                 cfs_rq->curr == se, NULL);
+       if (entity_is_task(se))
+               se->runnable_weight = se->load.weight;
+
+       if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
+                               cfs_rq->curr == se)) {
+
+               ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
+               return 1;
+       }
+
+       return 0;
 }
 
 static int
 __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
 {
-       return ___update_load_avg(now, cpu, &cfs_rq->avg,
-                       scale_load_down(cfs_rq->load.weight),
-                       cfs_rq->curr != NULL, cfs_rq);
-}
+       if (___update_load_sum(now, cpu, &cfs_rq->avg,
+                               scale_load_down(cfs_rq->load.weight),
+                               scale_load_down(cfs_rq->runnable_weight),
+                               cfs_rq->curr != NULL)) {
 
-/*
- * Signed add and clamp on underflow.
- *
- * Explicitly do a load-store to ensure the intermediate value never hits
- * memory. This allows lockless observations without ever seeing the negative
- * values.
- */
-#define add_positive(_ptr, _val) do {                           \
-       typeof(_ptr) ptr = (_ptr);                              \
-       typeof(_val) val = (_val);                              \
-       typeof(*ptr) res, var = READ_ONCE(*ptr);                \
-                                                               \
-       res = var + val;                                        \
-                                                               \
-       if (val < 0 && res > var)                               \
-               res = 0;                                        \
-                                                               \
-       WRITE_ONCE(*ptr, res);                                  \
-} while (0)
+               ___update_load_avg(&cfs_rq->avg, 1, 1);
+               return 1;
+       }
+
+       return 0;
+}
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 /**
@@ -3148,11 +3402,77 @@ void set_task_rq_fair(struct sched_entity *se,
        se->avg.last_update_time = n_last_update_time;
 }
 
-/* Take into account change of utilization of a child task group */
+
+/*
+ * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
+ * propagate its contribution. The key to this propagation is the invariant
+ * that for each group:
+ *
+ *   ge->avg == grq->avg                                               (1)
+ *
+ * _IFF_ we look at the pure running and runnable sums. Because they
+ * represent the very same entity, just at different points in the hierarchy.
+ *
+ *
+ * Per the above, update_tg_cfs_util() is trivial (and still 'wrong') and
+ * simply copies the running sum over.
+ *
+ * However, update_tg_cfs_runnable() is more complex. So we have:
+ *
+ *   ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg         (2)
+ *
+ * And since, like util, the runnable part should be directly transferable,
+ * the following would _appear_ to be the straightforward approach:
+ *
+ *   grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg      (3)
+ *
+ * And per (1) we have:
+ *
+ *   ge->avg.runnable_avg == grq->avg.runnable_avg
+ *
+ * Which gives:
+ *
+ *                      ge->load.weight * grq->avg.load_avg
+ *   ge->avg.load_avg = -----------------------------------            (4)
+ *                               grq->load.weight
+ *
+ * Except that is wrong!
+ *
+ * Because while for entities historical weight is not important and we
+ * really only care about our future and therefore can consider a pure
+ * runnable sum, runqueues can NOT do this.
+ *
+ * We specifically want runqueues to have a load_avg that includes
+ * historical weights. Those represent the blocked load, the load we expect
+ * to (shortly) return to us. This only works by keeping the weights as
+ * integral part of the sum. We therefore cannot decompose as per (3).
+ *
+ * OK, so what then?
+ *
+ *
+ * Another way to look at things is:
+ *
+ *   grq->avg.load_avg = \Sum se->avg.load_avg
+ *
+ * Therefore, per (2):
+ *
+ *   grq->avg.load_avg = \Sum se->load.weight * se->avg.runnable_avg
+ *
+ * And the very thing we're propagating is a change in that sum (someone
+ * joined/left). So we can easily know the runnable change, which would be, per
+ * (2) the already tracked se->load_avg divided by the corresponding
+ * se->weight.
+ *
+ * Basically (4) but in differential form:
+ *
+ *   d(runnable_avg) += se->avg.load_avg / se->load.weight
+ *                                                                (5)
+ *   ge->avg.load_avg += ge->load.weight * d(runnable_avg)
+ */
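
Equation (5) in isolation is the arithmetic that update_tg_cfs_runnable() below applies to the group entity's load_avg; a standalone sketch with hypothetical values (the clamp stands in for add_positive()):

    #include <stdio.h>

    #define LOAD_AVG_MAX 47742

    /*
     * Differential propagation, as in equation (5): convert a change in the
     * child runqueue's runnable sum into a change of the group entity's
     * load_avg, using the entity's own weight.
     */
    static void propagate(long *ge_load_avg, long ge_weight, long d_runnable_sum)
    {
            long d_load_sum = ge_weight * d_runnable_sum;
            long d_load_avg = d_load_sum / LOAD_AVG_MAX;

            *ge_load_avg += d_load_avg;
            if (*ge_load_avg < 0)
                    *ge_load_avg = 0;       /* clamp, like add_positive() */
    }

    int main(void)
    {
            long ge_load_avg = 300;                 /* hypothetical group entity load_avg */

            propagate(&ge_load_avg, 1024, 23871);   /* a task worth ~512 joined below */
            printf("after join:  %ld\n", ge_load_avg);   /* 812 */
            propagate(&ge_load_avg, 1024, -23871);  /* and left again */
            printf("after leave: %ld\n", ge_load_avg);   /* 300 */
            return 0;
    }
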
+
 static inline void
-update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
 {
-       struct cfs_rq *gcfs_rq = group_cfs_rq(se);
        long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
 
        /* Nothing to update */
@@ -3168,102 +3488,65 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
        cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
 }
 
-/* Take into account change of load of a child task group */
 static inline void
-update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
 {
-       struct cfs_rq *gcfs_rq = group_cfs_rq(se);
-       long delta, load = gcfs_rq->avg.load_avg;
-
-       /*
-        * If the load of group cfs_rq is null, the load of the
-        * sched_entity will also be null so we can skip the formula
-        */
-       if (load) {
-               long tg_load;
+       long runnable_sum = gcfs_rq->prop_runnable_sum;
+       long runnable_load_avg, load_avg;
+       s64 runnable_load_sum, load_sum;
 
-               /* Get tg's load and ensure tg_load > 0 */
-               tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
-
-               /* Ensure tg_load >= load and updated with current load*/
-               tg_load -= gcfs_rq->tg_load_avg_contrib;
-               tg_load += load;
+       if (!runnable_sum)
+               return;
 
-               /*
-                * We need to compute a correction term in the case that the
-                * task group is consuming more CPU than a task of equal
-                * weight. A task with a weight equals to tg->shares will have
-                * a load less or equal to scale_load_down(tg->shares).
-                * Similarly, the sched_entities that represent the task group
-                * at parent level, can't have a load higher than
-                * scale_load_down(tg->shares). And the Sum of sched_entities'
-                * load must be <= scale_load_down(tg->shares).
-                */
-               if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
-                       /* scale gcfs_rq's load into tg's shares*/
-                       load *= scale_load_down(gcfs_rq->tg->shares);
-                       load /= tg_load;
-               }
-       }
+       gcfs_rq->prop_runnable_sum = 0;
 
-       delta = load - se->avg.load_avg;
+       load_sum = (s64)se_weight(se) * runnable_sum;
+       load_avg = div_s64(load_sum, LOAD_AVG_MAX);
 
-       /* Nothing to update */
-       if (!delta)
-               return;
+       add_positive(&se->avg.load_sum, runnable_sum);
+       add_positive(&se->avg.load_avg, load_avg);
 
-       /* Set new sched_entity's load */
-       se->avg.load_avg = load;
-       se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
+       add_positive(&cfs_rq->avg.load_avg, load_avg);
+       add_positive(&cfs_rq->avg.load_sum, load_sum);
 
-       /* Update parent cfs_rq load */
-       add_positive(&cfs_rq->avg.load_avg, delta);
-       cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
+       runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
+       runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
+
+       add_positive(&se->avg.runnable_load_sum, runnable_sum);
+       add_positive(&se->avg.runnable_load_avg, runnable_load_avg);
 
-       /*
-        * If the sched_entity is already enqueued, we also have to update the
-        * runnable load avg.
-        */
        if (se->on_rq) {
-               /* Update parent cfs_rq runnable_load_avg */
-               add_positive(&cfs_rq->runnable_load_avg, delta);
-               cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
+               add_positive(&cfs_rq->avg.runnable_load_avg, runnable_load_avg);
+               add_positive(&cfs_rq->avg.runnable_load_sum, runnable_load_sum);
        }
 }
 
-static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
-{
-       cfs_rq->propagate_avg = 1;
-}
-
-static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
+static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
 {
-       struct cfs_rq *cfs_rq = group_cfs_rq(se);
-
-       if (!cfs_rq->propagate_avg)
-               return 0;
-
-       cfs_rq->propagate_avg = 0;
-       return 1;
+       cfs_rq->propagate = 1;
+       cfs_rq->prop_runnable_sum += runnable_sum;
 }
 
 /* Update task and its cfs_rq load average */
 static inline int propagate_entity_load_avg(struct sched_entity *se)
 {
-       struct cfs_rq *cfs_rq;
+       struct cfs_rq *cfs_rq, *gcfs_rq;
 
        if (entity_is_task(se))
                return 0;
 
-       if (!test_and_clear_tg_cfs_propagate(se))
+       gcfs_rq = group_cfs_rq(se);
+       if (!gcfs_rq->propagate)
                return 0;
 
+       gcfs_rq->propagate = 0;
+
        cfs_rq = cfs_rq_of(se);
 
-       set_tg_cfs_propagate(cfs_rq);
+       add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
 
-       update_tg_cfs_util(cfs_rq, se);
-       update_tg_cfs_load(cfs_rq, se);
+       update_tg_cfs_util(cfs_rq, se, gcfs_rq);
+       update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
 
        return 1;
 }
@@ -3287,7 +3570,7 @@ static inline bool skip_blocked_update(struct sched_entity *se)
         * If there is a pending propagation, we have to update the load and
         * the utilization of the sched_entity:
         */
-       if (gcfs_rq->propagate_avg)
+       if (gcfs_rq->propagate)
                return false;
 
        /*
@@ -3307,27 +3590,10 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)
        return 0;
 }
 
-static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
+static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
 
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
-/*
- * Unsigned subtract and clamp on underflow.
- *
- * Explicitly do a load-store to ensure the intermediate value never hits
- * memory. This allows lockless observations without ever seeing the negative
- * values.
- */
-#define sub_positive(_ptr, _val) do {                          \
-       typeof(_ptr) ptr = (_ptr);                              \
-       typeof(*ptr) val = (_val);                              \
-       typeof(*ptr) res, var = READ_ONCE(*ptr);                \
-       res = var - val;                                        \
-       if (res > var)                                          \
-               res = 0;                                        \
-       WRITE_ONCE(*ptr, res);                                  \
-} while (0)
-
 /**
  * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
  * @now: current time, as per cfs_rq_clock_task()
@@ -3347,65 +3613,45 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
 static inline int
 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 {
+       unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0;
        struct sched_avg *sa = &cfs_rq->avg;
-       int decayed, removed_load = 0, removed_util = 0;
+       int decayed = 0;
 
-       if (atomic_long_read(&cfs_rq->removed_load_avg)) {
-               s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
+       if (cfs_rq->removed.nr) {
+               unsigned long r;
+               u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
+
+               raw_spin_lock(&cfs_rq->removed.lock);
+               swap(cfs_rq->removed.util_avg, removed_util);
+               swap(cfs_rq->removed.load_avg, removed_load);
+               swap(cfs_rq->removed.runnable_sum, removed_runnable_sum);
+               cfs_rq->removed.nr = 0;
+               raw_spin_unlock(&cfs_rq->removed.lock);
+
+               r = removed_load;
                sub_positive(&sa->load_avg, r);
-               sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
-               removed_load = 1;
-               set_tg_cfs_propagate(cfs_rq);
-       }
+               sub_positive(&sa->load_sum, r * divider);
 
-       if (atomic_long_read(&cfs_rq->removed_util_avg)) {
-               long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
+               r = removed_util;
                sub_positive(&sa->util_avg, r);
-               sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
-               removed_util = 1;
-               set_tg_cfs_propagate(cfs_rq);
+               sub_positive(&sa->util_sum, r * divider);
+
+               add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum);
+
+               decayed = 1;
        }
 
-       decayed = __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
+       decayed |= __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
 
 #ifndef CONFIG_64BIT
        smp_wmb();
        cfs_rq->load_last_update_time_copy = sa->last_update_time;
 #endif
 
-       if (decayed || removed_util)
+       if (decayed)
                cfs_rq_util_change(cfs_rq);
 
-       return decayed || removed_load;
-}
-
-/*
- * Optional action to be done while updating the load average
- */
-#define UPDATE_TG      0x1
-#define SKIP_AGE_LOAD  0x2
-
-/* Update task and its cfs_rq load average */
-static inline void update_load_avg(struct sched_entity *se, int flags)
-{
-       struct cfs_rq *cfs_rq = cfs_rq_of(se);
-       u64 now = cfs_rq_clock_task(cfs_rq);
-       struct rq *rq = rq_of(cfs_rq);
-       int cpu = cpu_of(rq);
-       int decayed;
-
-       /*
-        * Track task load average for carrying it to new CPU after migrated, and
-        * track group sched_entity load average for task_h_load calc in migration
-        */
-       if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
-               __update_load_avg_se(now, cpu, cfs_rq, se);
-
-       decayed  = update_cfs_rq_load_avg(now, cfs_rq);
-       decayed |= propagate_entity_load_avg(se);
-
-       if (decayed && (flags & UPDATE_TG))
-               update_tg_load_avg(cfs_rq, 0);
+       return decayed;
 }
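
The new cfs_rq->removed machinery is an accumulate-then-swap pattern: departing entities add their contribution under the spinlock, and the consumer swaps the totals out under the same lock and applies them afterwards. A loose userspace model of just that pattern (a pthread mutex instead of the raw spinlock, plain counters instead of sched_avg; purely illustrative):

    #include <pthread.h>
    #include <stdio.h>

    /* Loose userspace model of the cfs_rq->removed accumulator. */
    struct removed {
            pthread_mutex_t lock;
            int             nr;
            unsigned long   load_avg;
            unsigned long   util_avg;
    };

    /* Producer: a departing entity records its contribution (cf. remove_entity_load_avg()). */
    static void removed_add(struct removed *r, unsigned long load, unsigned long util)
    {
            pthread_mutex_lock(&r->lock);
            r->nr++;
            r->load_avg += load;
            r->util_avg += util;
            pthread_mutex_unlock(&r->lock);
    }

    /* Consumer: swap the totals out under the lock, apply them afterwards
     * (cf. update_cfs_rq_load_avg()). */
    static void removed_drain(struct removed *r, unsigned long *avg_load, unsigned long *avg_util)
    {
            unsigned long load = 0, util = 0;

            pthread_mutex_lock(&r->lock);
            if (r->nr) {
                    load = r->load_avg;  r->load_avg = 0;
                    util = r->util_avg;  r->util_avg = 0;
                    r->nr = 0;
            }
            pthread_mutex_unlock(&r->lock);

            /* the subtraction happens outside the lock, clamped at zero like sub_positive() */
            if (load > *avg_load) load = *avg_load;
            if (util > *avg_util) util = *avg_util;
            *avg_load -= load;
            *avg_util -= util;
    }

    int main(void)
    {
            struct removed r = { .lock = PTHREAD_MUTEX_INITIALIZER };
            unsigned long load_avg = 2048, util_avg = 900;

            removed_add(&r, 512, 300);
            removed_add(&r, 256, 100);
            removed_drain(&r, &load_avg, &util_avg);
            printf("load_avg=%lu util_avg=%lu\n", load_avg, util_avg);  /* 1280 500 */
            return 0;
    }
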
 
 /**
@@ -3418,12 +3664,39 @@ static inline void update_load_avg(struct sched_entity *se, int flags)
  */
 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
+       u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
+
+       /*
+        * When we attach the @se to the @cfs_rq, we must align the decay
+        * window because without that, really weird and wonderful things can
+        * happen.
+        *
+        * XXX illustrate
+        */
        se->avg.last_update_time = cfs_rq->avg.last_update_time;
-       cfs_rq->avg.load_avg += se->avg.load_avg;
-       cfs_rq->avg.load_sum += se->avg.load_sum;
+       se->avg.period_contrib = cfs_rq->avg.period_contrib;
+
+       /*
+        * Hell(o) Nasty stuff.. we need to recompute _sum based on the new
+        * period_contrib. This isn't strictly correct, but since we're
+        * entirely outside of the PELT hierarchy, nobody cares if we truncate
+        * _sum a little.
+        */
+       se->avg.util_sum = se->avg.util_avg * divider;
+
+       se->avg.load_sum = divider;
+       if (se_weight(se)) {
+               se->avg.load_sum =
+                       div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
+       }
+
+       se->avg.runnable_load_sum = se->avg.load_sum;
+
+       enqueue_load_avg(cfs_rq, se);
        cfs_rq->avg.util_avg += se->avg.util_avg;
        cfs_rq->avg.util_sum += se->avg.util_sum;
-       set_tg_cfs_propagate(cfs_rq);
+
+       add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
 
        cfs_rq_util_change(cfs_rq);
 }
@@ -3438,39 +3711,47 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
  */
 static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-
-       sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
-       sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
+       dequeue_load_avg(cfs_rq, se);
        sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
        sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
-       set_tg_cfs_propagate(cfs_rq);
+
+       add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
 
        cfs_rq_util_change(cfs_rq);
 }
 
-/* Add the load generated by se into cfs_rq's load average */
-static inline void
-enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+/*
+ * Optional action to be done while updating the load average
+ */
+#define UPDATE_TG      0x1
+#define SKIP_AGE_LOAD  0x2
+#define DO_ATTACH      0x4
+
+/* Update task and its cfs_rq load average */
+static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
-       struct sched_avg *sa = &se->avg;
+       u64 now = cfs_rq_clock_task(cfs_rq);
+       struct rq *rq = rq_of(cfs_rq);
+       int cpu = cpu_of(rq);
+       int decayed;
+
+       /*
+        * Track the task load average for carrying it to a new CPU after it is
+        * migrated, and track the group sched_entity load average for the
+        * task_h_load calculation in migration
+        */
+       if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
+               __update_load_avg_se(now, cpu, cfs_rq, se);
 
-       cfs_rq->runnable_load_avg += sa->load_avg;
-       cfs_rq->runnable_load_sum += sa->load_sum;
+       decayed  = update_cfs_rq_load_avg(now, cfs_rq);
+       decayed |= propagate_entity_load_avg(se);
+
+       if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
 
-       if (!sa->last_update_time) {
                attach_entity_load_avg(cfs_rq, se);
                update_tg_load_avg(cfs_rq, 0);
-       }
-}
 
-/* Remove the runnable load generated by se from cfs_rq's runnable load average */
-static inline void
-dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-       cfs_rq->runnable_load_avg =
-               max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
-       cfs_rq->runnable_load_sum =
-               max_t(s64,  cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
+       } else if (decayed && (flags & UPDATE_TG))
+               update_tg_load_avg(cfs_rq, 0);
 }
 
 #ifndef CONFIG_64BIT
@@ -3514,6 +3795,7 @@ void sync_entity_load_avg(struct sched_entity *se)
 void remove_entity_load_avg(struct sched_entity *se)
 {
        struct cfs_rq *cfs_rq = cfs_rq_of(se);
+       unsigned long flags;
 
        /*
         * tasks cannot exit without having gone through wake_up_new_task() ->
@@ -3526,13 +3808,18 @@ void remove_entity_load_avg(struct sched_entity *se)
         */
 
        sync_entity_load_avg(se);
-       atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
-       atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
+
+       raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
+       ++cfs_rq->removed.nr;
+       cfs_rq->removed.util_avg        += se->avg.util_avg;
+       cfs_rq->removed.load_avg        += se->avg.load_avg;
+       cfs_rq->removed.runnable_sum    += se->avg.load_sum; /* == runnable_sum */
+       raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
 }
 
 static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
 {
-       return cfs_rq->runnable_load_avg;
+       return cfs_rq->avg.runnable_load_avg;
 }
 
 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
@@ -3552,16 +3839,13 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 
 #define UPDATE_TG      0x0
 #define SKIP_AGE_LOAD  0x0
+#define DO_ATTACH      0x0
 
-static inline void update_load_avg(struct sched_entity *se, int not_used1)
+static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
 {
-       cfs_rq_util_change(cfs_rq_of(se));
+       cfs_rq_util_change(cfs_rq);
 }
 
-static inline void
-enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
-static inline void
-dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
 static inline void remove_entity_load_avg(struct sched_entity *se) {}
 
 static inline void
@@ -3706,9 +3990,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
         *     its group cfs_rq
         *   - Add its new weight to cfs_rq->load.weight
         */
-       update_load_avg(se, UPDATE_TG);
-       enqueue_entity_load_avg(cfs_rq, se);
-       update_cfs_shares(se);
+       update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
+       update_cfs_group(se);
+       enqueue_runnable_load_avg(cfs_rq, se);
        account_entity_enqueue(cfs_rq, se);
 
        if (flags & ENQUEUE_WAKEUP)
@@ -3790,8 +4074,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
         *   - For group entity, update its weight to reflect the new share
         *     of its group cfs_rq.
         */
-       update_load_avg(se, UPDATE_TG);
-       dequeue_entity_load_avg(cfs_rq, se);
+       update_load_avg(cfs_rq, se, UPDATE_TG);
+       dequeue_runnable_load_avg(cfs_rq, se);
 
        update_stats_dequeue(cfs_rq, se, flags);
 
@@ -3814,7 +4098,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
        /* return excess runtime on last dequeue */
        return_cfs_rq_runtime(cfs_rq);
 
-       update_cfs_shares(se);
+       update_cfs_group(se);
 
        /*
         * Now advance min_vruntime if @se was the entity holding it back,
@@ -3878,7 +4162,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
                 */
                update_stats_wait_end(cfs_rq, se);
                __dequeue_entity(cfs_rq, se);
-               update_load_avg(se, UPDATE_TG);
+               update_load_avg(cfs_rq, se, UPDATE_TG);
        }
 
        update_stats_curr_start(cfs_rq, se);
@@ -3980,7 +4264,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
                /* Put 'current' back into the tree. */
                __enqueue_entity(cfs_rq, prev);
                /* in !on_rq case, update occurred at dequeue */
-               update_load_avg(prev, 0);
+               update_load_avg(cfs_rq, prev, 0);
        }
        cfs_rq->curr = NULL;
 }
@@ -3996,8 +4280,8 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
        /*
         * Ensure that runnable average is periodically updated.
         */
-       update_load_avg(curr, UPDATE_TG);
-       update_cfs_shares(curr);
+       update_load_avg(cfs_rq, curr, UPDATE_TG);
+       update_cfs_group(curr);
 
 #ifdef CONFIG_SCHED_HRTICK
        /*
@@ -4914,8 +5198,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                if (cfs_rq_throttled(cfs_rq))
                        break;
 
-               update_load_avg(se, UPDATE_TG);
-               update_cfs_shares(se);
+               update_load_avg(cfs_rq, se, UPDATE_TG);
+               update_cfs_group(se);
        }
 
        if (!se)
@@ -4973,8 +5257,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                if (cfs_rq_throttled(cfs_rq))
                        break;
 
-               update_load_avg(se, UPDATE_TG);
-               update_cfs_shares(se);
+               update_load_avg(cfs_rq, se, UPDATE_TG);
+               update_cfs_group(se);
        }
 
        if (!se)
@@ -5356,91 +5640,62 @@ static int wake_wide(struct task_struct *p)
        return 1;
 }
 
-struct llc_stats {
-       unsigned long   nr_running;
-       unsigned long   load;
-       unsigned long   capacity;
-       int             has_capacity;
-};
+/*
+ * The purpose of wake_affine() is to quickly determine on which CPU we can run
+ * soonest. For the purpose of speed we only consider the waking and previous
+ * CPU.
+ *
+ * wake_affine_idle() - only considers 'now', it checks if the waking CPU is (or
+ *                     will be) idle.
+ *
+ * wake_affine_weight() - considers the weight to reflect the average
+ *                       scheduling latency of the CPUs. This seems to work
+ *                       for the overloaded case.
+ */
 
-static bool get_llc_stats(struct llc_stats *stats, int cpu)
+static bool
+wake_affine_idle(struct sched_domain *sd, struct task_struct *p,
+                int this_cpu, int prev_cpu, int sync)
 {
-       struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
-
-       if (!sds)
-               return false;
+       if (idle_cpu(this_cpu))
+               return true;
 
-       stats->nr_running       = READ_ONCE(sds->nr_running);
-       stats->load             = READ_ONCE(sds->load);
-       stats->capacity         = READ_ONCE(sds->capacity);
-       stats->has_capacity     = stats->nr_running < per_cpu(sd_llc_size, cpu);
+       if (sync && cpu_rq(this_cpu)->nr_running == 1)
+               return true;
 
-       return true;
+       return false;
 }
 
-/*
- * Can a task be moved from prev_cpu to this_cpu without causing a load
- * imbalance that would trigger the load balancer?
- *
- * Since we're running on 'stale' values, we might in fact create an imbalance
- * but recomputing these values is expensive, as that'd mean iteration 2 cache
- * domains worth of CPUs.
- */
 static bool
-wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
-               int this_cpu, int prev_cpu, int sync)
+wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
+                  int this_cpu, int prev_cpu, int sync)
 {
-       struct llc_stats prev_stats, this_stats;
        s64 this_eff_load, prev_eff_load;
        unsigned long task_load;
 
-       if (!get_llc_stats(&prev_stats, prev_cpu) ||
-           !get_llc_stats(&this_stats, this_cpu))
-               return false;
+       this_eff_load = target_load(this_cpu, sd->wake_idx);
+       prev_eff_load = source_load(prev_cpu, sd->wake_idx);
 
-       /*
-        * If sync wakeup then subtract the (maximum possible)
-        * effect of the currently running task from the load
-        * of the current LLC.
-        */
        if (sync) {
                unsigned long current_load = task_h_load(current);
 
-               /* in this case load hits 0 and this LLC is considered 'idle' */
-               if (current_load > this_stats.load)
+               if (current_load > this_eff_load)
                        return true;
 
-               this_stats.load -= current_load;
+               this_eff_load -= current_load;
        }
 
-       /*
-        * The has_capacity stuff is not SMT aware, but by trying to balance
-        * the nr_running on both ends we try and fill the domain at equal
-        * rates, thereby first consuming cores before siblings.
-        */
-
-       /* if the old cache has capacity, stay there */
-       if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
-               return false;
-
-       /* if this cache has capacity, come here */
-       if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running)
-               return true;
-
-       /*
-        * Check to see if we can move the load without causing too much
-        * imbalance.
-        */
        task_load = task_h_load(p);
 
-       this_eff_load = 100;
-       this_eff_load *= prev_stats.capacity;
+       this_eff_load += task_load;
+       if (sched_feat(WA_BIAS))
+               this_eff_load *= 100;
+       this_eff_load *= capacity_of(prev_cpu);
 
-       prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
-       prev_eff_load *= this_stats.capacity;
-
-       this_eff_load *= this_stats.load + task_load;
-       prev_eff_load *= prev_stats.load - task_load;
+       prev_eff_load -= task_load;
+       if (sched_feat(WA_BIAS))
+               prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
+       prev_eff_load *= capacity_of(this_cpu);
 
        return this_eff_load <= prev_eff_load;
 }
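
Stripped of the scheduler types, the test is a cross-multiplication: each side's load is scaled by the other side's capacity so no division is needed, and the previous CPU gets the imbalance_pct bias. A sketch with made-up loads, capacities and an assumed imbalance_pct of 117 (target_load()/source_load() are replaced by plain numbers):

    #include <stdio.h>
    #include <stdbool.h>

    /*
     * Cross-multiplied "which side looks cheaper" check in the spirit of
     * wake_affine_weight(): each side's load is scaled by the *other* side's
     * capacity, and the previous CPU gets the imbalance_pct bias so that
     * marginal cases do not migrate.
     */
    static bool prefer_waking_cpu(long this_load, long this_capacity,
                                  long prev_load, long prev_capacity,
                                  long task_load, int imbalance_pct)
    {
            long this_eff = (this_load + task_load) * 100 * prev_capacity;
            long prev_eff = (prev_load - task_load) *
                            (100 + (imbalance_pct - 100) / 2) * this_capacity;

            return this_eff <= prev_eff;
    }

    int main(void)
    {
            /* waking CPU nearly idle, previous CPU busy: pull the task over */
            printf("%d\n", prefer_waking_cpu(100, 1024, 900, 1024, 200, 117));  /* 1 */

            /* both roughly equally busy: the bias keeps the task on prev_cpu */
            printf("%d\n", prefer_waking_cpu(800, 1024, 820, 1024, 200, 117));  /* 0 */
            return 0;
    }
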
@@ -5449,22 +5704,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
                       int prev_cpu, int sync)
 {
        int this_cpu = smp_processor_id();
-       bool affine;
+       bool affine = false;
 
-       /*
-        * Default to no affine wakeups; wake_affine() should not effect a task
-        * placement the load-balancer feels inclined to undo. The conservative
-        * option is therefore to not move tasks when they wake up.
-        */
-       affine = false;
+       if (sched_feat(WA_IDLE) && !affine)
+               affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync);
 
-       /*
-        * If the wakeup is across cache domains, try to evaluate if movement
-        * makes sense, otherwise rely on select_idle_siblings() to do
-        * placement inside the cache domain.
-        */
-       if (!cpus_share_cache(prev_cpu, this_cpu))
-               affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync);
+       if (sched_feat(WA_WEIGHT) && !affine)
+               affine = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
 
        schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
        if (affine) {
@@ -5486,6 +5732,8 @@ static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
 /*
  * find_idlest_group finds and returns the least busy CPU group within the
  * domain.
+ *
+ * Assumes p is allowed on at least one CPU in sd.
  */
 static struct sched_group *
 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
@@ -5493,8 +5741,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 {
        struct sched_group *idlest = NULL, *group = sd->groups;
        struct sched_group *most_spare_sg = NULL;
-       unsigned long min_runnable_load = ULONG_MAX, this_runnable_load = 0;
-       unsigned long min_avg_load = ULONG_MAX, this_avg_load = 0;
+       unsigned long min_runnable_load = ULONG_MAX;
+       unsigned long this_runnable_load = ULONG_MAX;
+       unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
        unsigned long most_spare = 0, this_spare = 0;
        int load_idx = sd->forkexec_idx;
        int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
@@ -5615,10 +5864,10 @@ skip_spare:
 }
 
 /*
- * find_idlest_cpu - find the idlest cpu among the cpus in group.
+ * find_idlest_group_cpu - find the idlest cpu among the cpus in group.
  */
 static int
-find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
+find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 {
        unsigned long load, min_load = ULONG_MAX;
        unsigned int min_exit_latency = UINT_MAX;
@@ -5667,6 +5916,53 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
        return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
 }
 
+static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
+                                 int cpu, int prev_cpu, int sd_flag)
+{
+       int new_cpu = cpu;
+
+       if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
+               return prev_cpu;
+
+       while (sd) {
+               struct sched_group *group;
+               struct sched_domain *tmp;
+               int weight;
+
+               if (!(sd->flags & sd_flag)) {
+                       sd = sd->child;
+                       continue;
+               }
+
+               group = find_idlest_group(sd, p, cpu, sd_flag);
+               if (!group) {
+                       sd = sd->child;
+                       continue;
+               }
+
+               new_cpu = find_idlest_group_cpu(group, p, cpu);
+               if (new_cpu == cpu) {
+                       /* Now try balancing at a lower domain level of cpu */
+                       sd = sd->child;
+                       continue;
+               }
+
+               /* Now try balancing at a lower domain level of new_cpu */
+               cpu = new_cpu;
+               weight = sd->span_weight;
+               sd = NULL;
+               for_each_domain(cpu, tmp) {
+                       if (weight <= tmp->span_weight)
+                               break;
+                       if (tmp->flags & sd_flag)
+                               sd = tmp;
+               }
+               /* while loop will break here if sd == NULL */
+       }
+
+       return new_cpu;
+}
+
 #ifdef CONFIG_SCHED_SMT
 
 static inline void set_idle_cores(int cpu, int val)
@@ -6019,50 +6315,30 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
                        new_cpu = cpu;
        }
 
+       if (sd && !(sd_flag & SD_BALANCE_FORK)) {
+               /*
+                * We're going to need the task's util for capacity_spare_wake
+                * in find_idlest_group. Sync it up to prev_cpu's
+                * last_update_time.
+                */
+               sync_entity_load_avg(&p->se);
+       }
+
        if (!sd) {
- pick_cpu:
+pick_cpu:
                if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
                        new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
 
-       } else while (sd) {
-               struct sched_group *group;
-               int weight;
-
-               if (!(sd->flags & sd_flag)) {
-                       sd = sd->child;
-                       continue;
-               }
-
-               group = find_idlest_group(sd, p, cpu, sd_flag);
-               if (!group) {
-                       sd = sd->child;
-                       continue;
-               }
-
-               new_cpu = find_idlest_cpu(group, p, cpu);
-               if (new_cpu == -1 || new_cpu == cpu) {
-                       /* Now try balancing at a lower domain level of cpu */
-                       sd = sd->child;
-                       continue;
-               }
-
-               /* Now try balancing at a lower domain level of new_cpu */
-               cpu = new_cpu;
-               weight = sd->span_weight;
-               sd = NULL;
-               for_each_domain(cpu, tmp) {
-                       if (weight <= tmp->span_weight)
-                               break;
-                       if (tmp->flags & sd_flag)
-                               sd = tmp;
-               }
-               /* while loop will break here if sd == NULL */
+       } else {
+               new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
        }
        rcu_read_unlock();
 
        return new_cpu;
 }
 
+static void detach_entity_cfs_rq(struct sched_entity *se);
+
 /*
  * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
  * cfs_rq_of(p) references at time of call are still valid and identify the
@@ -6096,14 +6372,25 @@ static void migrate_task_rq_fair(struct task_struct *p)
                se->vruntime -= min_vruntime;
        }
 
-       /*
-        * We are supposed to update the task to "current" time, then its up to date
-        * and ready to go to new CPU/cfs_rq. But we have difficulty in getting
-        * what current time is, so simply throw away the out-of-date time. This
-        * will result in the wakee task is less decayed, but giving the wakee more
-        * load sounds not bad.
-        */
-       remove_entity_load_avg(&p->se);
+       if (p->on_rq == TASK_ON_RQ_MIGRATING) {
+               /*
+                * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old'
+                * rq->lock and can modify state directly.
+                */
+               lockdep_assert_held(&task_rq(p)->lock);
+               detach_entity_cfs_rq(&p->se);
+
+       } else {
+               /*
+                * We are supposed to update the task to "current" time, then
+                * it is up to date and ready to go to the new CPU/cfs_rq. But
+                * we have difficulty in getting what the current time is, so
+                * simply throw away the out-of-date time. This will result in
+                * the wakee task being less decayed, but giving the wakee more
+                * load does not sound bad.
+                */
+               remove_entity_load_avg(&p->se);
+       }
 
        /* Tell new CPU we are migrated */
        p->se.avg.last_update_time = 0;
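The two branches above follow one rule: touch the old cfs_rq directly only when its lock is already held (the TASK_ON_RQ_MIGRATING case); otherwise park the contribution for the queue's owner to fold in later. The following single-threaded model is illustrative only; the names are invented and the real deferred path is, of course, serialized.

#include <stdio.h>
#include <stdbool.h>

struct queue_avg {
	long load_avg;		/* aggregate load of queued entities */
	long removed_load;	/* deferred removals, drained by the owner */
};

static void migrate_out(struct queue_avg *q, long task_load, bool holds_lock)
{
	if (holds_lock) {
		/* "migrating" path: state can be edited directly */
		q->load_avg -= task_load;
	} else {
		/* wakeup-migration path: we cannot touch q safely, so just
		 * record the removal; the stale decay is thrown away */
		q->removed_load += task_load;
	}
}

static void owner_update(struct queue_avg *q)
{
	/* the queue owner applies pending removals under its own lock */
	q->load_avg -= q->removed_load;
	q->removed_load = 0;
}

int main(void)
{
	struct queue_avg q = { .load_avg = 1000, .removed_load = 0 };

	migrate_out(&q, 300, true);	/* direct path   */
	migrate_out(&q, 200, false);	/* deferred path */
	printf("before drain: load=%ld pending=%ld\n", q.load_avg, q.removed_load);
	owner_update(&q);
	printf("after  drain: load=%ld pending=%ld\n", q.load_avg, q.removed_load);
	return 0;
}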
@@ -6371,10 +6658,7 @@ again:
                set_next_entity(cfs_rq, se);
        }
 
-       if (hrtick_enabled(rq))
-               hrtick_start_fair(rq, p);
-
-       return p;
+       goto done;
 simple:
 #endif
 
@@ -6388,6 +6672,16 @@ simple:
 
        p = task_of(se);
 
+done: __maybe_unused
+#ifdef CONFIG_SMP
+       /*
+        * Move the next running task to the front of
+        * the list, so our cfs_tasks list becomes an
+        * MRU one.
+        */
+       list_move(&p->se.group_node, &rq->cfs_tasks);
+#endif
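The cfs_tasks list is thus kept most-recently-used: whichever task is picked to run moves to the head, so the tail accumulates the tasks that have run least recently. A toy model of that ordering follows, with an array standing in for the linked list and purely illustrative names.

#include <stdio.h>

#define NTASKS 4
static int order[NTASKS] = { 0, 1, 2, 3 };	/* head = most recently run */

static void pick_to_run(int task)
{
	int i, j;

	for (i = 0; i < NTASKS && order[i] != task; i++)
		;
	for (j = i; j > 0; j--)		/* move the picked task to the head */
		order[j] = order[j - 1];
	order[0] = task;
}

int main(void)
{
	pick_to_run(2);
	pick_to_run(3);
	pick_to_run(2);
	/* head..tail now reads 2 3 0 1: tasks 0 and 1 are the coldest */
	for (int i = 0; i < NTASKS; i++)
		printf("%d ", order[i]);
	printf("\n");
	return 0;
}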
+
        if (hrtick_enabled(rq))
                hrtick_start_fair(rq, p);
 
@@ -6823,11 +7117,12 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
  */
 static struct task_struct *detach_one_task(struct lb_env *env)
 {
-       struct task_struct *p, *n;
+       struct task_struct *p;
 
        lockdep_assert_held(&env->src_rq->lock);
 
-       list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
+       list_for_each_entry_reverse(p,
+                       &env->src_rq->cfs_tasks, se.group_node) {
                if (!can_migrate_task(p, env))
                        continue;
 
@@ -6873,7 +7168,7 @@ static int detach_tasks(struct lb_env *env)
                if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
                        break;
 
-               p = list_first_entry(tasks, struct task_struct, se.group_node);
+               p = list_last_entry(tasks, struct task_struct, se.group_node);
 
                env->loop++;
                /* We've more or less seen every task there is, call it quits */
@@ -6923,7 +7218,7 @@ static int detach_tasks(struct lb_env *env)
 
                continue;
 next:
-               list_move_tail(&p->se.group_node, tasks);
+               list_move(&p->se.group_node, tasks);
        }
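The flip side of that MRU ordering: detach_one_task() and detach_tasks() now scan from the tail, so the tasks that have not run for the longest time (and are more likely cache-cold) are considered for migration first, while skipped tasks are rotated to the head. A toy continuation of the previous sketch, illustrative only:

#include <stdio.h>
#include <stdbool.h>

#define NTASKS 4
static int order[NTASKS] = { 2, 3, 0, 1 };	/* head = most recently run */

static bool can_migrate(int task)
{
	return task != 1;	/* pretend task 1 is pinned to this CPU */
}

int main(void)
{
	int n = NTASKS, loops = 0;

	/* bounded scan, in the spirit of env->loop in the real code */
	while (n > 0 && loops++ < 2 * NTASKS) {
		int victim = order[n - 1];	/* tail = longest since it last ran */

		if (can_migrate(victim)) {
			printf("migrate task %d\n", victim);
			n--;			/* detached from this runqueue */
			continue;
		}
		/* rotate the skipped task to the head so the next tail
		 * lookup sees a different candidate */
		for (int j = n - 1; j > 0; j--)
			order[j] = order[j - 1];
		order[0] = victim;
	}
	return 0;
}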
 
        /*
@@ -6999,7 +7294,7 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
        if (cfs_rq->avg.util_sum)
                return false;
 
-       if (cfs_rq->runnable_load_sum)
+       if (cfs_rq->avg.runnable_load_sum)
                return false;
 
        return true;
@@ -7031,7 +7326,7 @@ static void update_blocked_averages(int cpu)
                /* Propagate pending load changes to the parent, if any: */
                se = cfs_rq->tg->se[cpu];
                if (se && !skip_blocked_update(se))
-                       update_load_avg(se, 0);
+                       update_load_avg(cfs_rq_of(se), se, 0);
 
                /*
                 * There can be a lot of idle CPU cgroups.  Don't let fully
@@ -7600,7 +7895,6 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
  */
 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
 {
-       struct sched_domain_shared *shared = env->sd->shared;
        struct sched_domain *child = env->sd->child;
        struct sched_group *sg = env->sd->groups;
        struct sg_lb_stats *local = &sds->local_stat;
@@ -7672,22 +7966,6 @@ next_group:
                if (env->dst_rq->rd->overload != overload)
                        env->dst_rq->rd->overload = overload;
        }
-
-       if (!shared)
-               return;
-
-       /*
-        * Since these are sums over groups they can contain some CPUs
-        * multiple times for the NUMA domains.
-        *
-        * Currently only wake_affine_llc() and find_busiest_group()
-        * uses these numbers, only the last is affected by this problem.
-        *
-        * XXX fix that.
-        */
-       WRITE_ONCE(shared->nr_running,  sds->total_running);
-       WRITE_ONCE(shared->load,        sds->total_load);
-       WRITE_ONCE(shared->capacity,    sds->total_capacity);
 }
 
 /**
@@ -7929,8 +8207,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
        if (busiest->group_type == group_imbalanced)
                goto force_balance;
 
-       /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
-       if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
+       /*
+        * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
+        * capacities from resulting in underutilization due to avg_load.
+        */
+       if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
            busiest->group_no_capacity)
                goto force_balance;
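The widened check lets any idle destination, not only a newly idle one, force a balance when the local group still has spare capacity and the busiest group is over capacity, rather than trusting an avg_load comparison that SMP nice or asymmetric capacities can distort. A hedged sketch of that decision with invented stats and an invented imbalance margin:

#include <stdio.h>
#include <stdbool.h>

enum idle_state { NOT_IDLE, NEWLY_IDLE, IDLE };

struct group_stat {
	bool has_capacity;	/* group can take more work */
	bool no_capacity;	/* group is over its capacity */
	unsigned long avg_load;
};

static bool force_balance(enum idle_state dst_idle,
			  const struct group_stat *local,
			  const struct group_stat *busiest)
{
	/* old condition required dst_idle == NEWLY_IDLE here */
	if (dst_idle != NOT_IDLE && local->has_capacity && busiest->no_capacity)
		return true;
	/* otherwise fall back to comparing average loads */
	return busiest->avg_load > local->avg_load * 5 / 4;	/* made-up margin */
}

int main(void)
{
	struct group_stat local   = { .has_capacity = true,  .no_capacity = false, .avg_load = 900 };
	struct group_stat busiest = { .has_capacity = false, .no_capacity = true,  .avg_load = 800 };

	/* avg_load alone would say "balanced", but the idle destination
	 * with spare capacity still pulls from the overloaded group */
	printf("balance: %s\n", force_balance(IDLE, &local, &busiest) ? "yes" : "no");
	return 0;
}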
 
@@ -8098,6 +8379,13 @@ static int should_we_balance(struct lb_env *env)
        int cpu, balance_cpu = -1;
 
        /*
+        * Ensure the balancing environment is consistent; this can
+        * happen when the softirq triggers 'during' hotplug.
+        */
+       if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
+               return 0;
+
+       /*
         * In the newly idle case, we will allow all the cpu's
         * to do the newly idle load balance.
         */
@@ -8740,7 +9028,7 @@ void nohz_balance_enter_idle(int cpu)
                return;
 
        /* Spare idle load balancing on CPUs that don't want to be disturbed: */
-       if (!is_housekeeping_cpu(cpu))
+       if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
                return;
 
        if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
@@ -9205,7 +9493,7 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
                if (cfs_rq_throttled(cfs_rq))
                        break;
 
-               update_load_avg(se, UPDATE_TG);
+               update_load_avg(cfs_rq, se, UPDATE_TG);
        }
 }
 #else
@@ -9217,7 +9505,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se)
        struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
        /* Catch up with the cfs_rq and remove our load when we leave */
-       update_load_avg(se, 0);
+       update_load_avg(cfs_rq, se, 0);
        detach_entity_load_avg(cfs_rq, se);
        update_tg_load_avg(cfs_rq, false);
        propagate_entity_cfs_rq(se);
@@ -9236,7 +9524,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
 #endif
 
        /* Synchronize entity with its cfs_rq */
-       update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
+       update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
        attach_entity_load_avg(cfs_rq, se);
        update_tg_load_avg(cfs_rq, false);
        propagate_entity_cfs_rq(se);
@@ -9318,11 +9606,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
        cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
 #endif
 #ifdef CONFIG_SMP
-#ifdef CONFIG_FAIR_GROUP_SCHED
-       cfs_rq->propagate_avg = 0;
-#endif
-       atomic_long_set(&cfs_rq->removed_load_avg, 0);
-       atomic_long_set(&cfs_rq->removed_util_avg, 0);
+       raw_spin_lock_init(&cfs_rq->removed.lock);
 #endif
 }
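init_cfs_rq() now sets up a single removed.lock instead of the per-field removed_*_avg atomics: removals are queued and drained as one consistent unit under that lock. The sketch below shows the pattern only, with pthread mutexes standing in for the raw spinlock and illustrative field names.

#include <pthread.h>
#include <stdio.h>

struct removed_avg {
	pthread_mutex_t lock;
	long load;
	long util;
	int  nr;
};

static void queue_removal(struct removed_avg *r, long load, long util)
{
	pthread_mutex_lock(&r->lock);
	r->load += load;	/* all fields move together under the lock */
	r->util += util;
	r->nr++;
	pthread_mutex_unlock(&r->lock);
}

static void drain_removals(struct removed_avg *r, long *load, long *util, int *nr)
{
	pthread_mutex_lock(&r->lock);
	*load = r->load;  r->load = 0;
	*util = r->util;  r->util = 0;
	*nr   = r->nr;    r->nr   = 0;
	pthread_mutex_unlock(&r->lock);
}

int main(void)
{
	struct removed_avg r = { PTHREAD_MUTEX_INITIALIZER, 0, 0, 0 };
	long load, util;
	int nr;

	queue_removal(&r, 300, 40);
	queue_removal(&r, 150, 10);
	drain_removals(&r, &load, &util, &nr);
	printf("drained load=%ld util=%ld from %d entities\n", load, util, nr);
	return 0;
}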
 
@@ -9520,8 +9804,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
                rq_lock_irqsave(rq, &rf);
                update_rq_clock(rq);
                for_each_sched_entity(se) {
-                       update_load_avg(se, UPDATE_TG);
-                       update_cfs_shares(se);
+                       update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
+                       update_cfs_group(se);
                }
                rq_unlock_irqrestore(rq, &rf);
        }