Merge tag 'x86-cleanups-2023-08-28' of git://git.kernel.org/pub/scm/linux/kernel...

[platform/kernel/linux-starfive.git] / kernel / sched / fair.c
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index 695f8e5..911d006 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -47,6 +47,7 @@
  #include <linux/psi.h>
  #include <linux/ratelimit.h>
  #include <linux/task_work.h>
+#include <linux/rbtree_augmented.h>
  
  #include <asm/switch_to.h>
  
@@ -57,22 +58,6 @@
  #include "autogroup.h"
  
  /*
- * Targeted preemption latency for CPU-bound tasks:
- *
- * NOTE: this latency value is not the same as the concept of
- * 'timeslice length' - timeslices in CFS are of variable length
- * and have no persistent notion like in traditional, time-slice
- * based scheduling concepts.
- *
- * (to see the precise effective timeslice length of your workload,
- *  run vmstat and monitor the context-switches (cs) field)
- *
- * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
- */
-unsigned int sysctl_sched_latency                      = 6000000ULL;
-static unsigned int normalized_sysctl_sched_latency    = 6000000ULL;
-
-/*
   * The initial- and re-scaling of tunables is configurable
   *
   * Options are:
@@ -90,21 +75,8 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
   *
   * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
   */
-unsigned int sysctl_sched_min_granularity                      = 750000ULL;
-static unsigned int normalized_sysctl_sched_min_granularity    = 750000ULL;
-
-/*
- * Minimal preemption granularity for CPU-bound SCHED_IDLE tasks.
- * Applies only when SCHED_IDLE tasks compete with normal tasks.
- *
- * (default: 0.75 msec)
- */
-unsigned int sysctl_sched_idle_min_granularity                 = 750000ULL;
-
-/*
- * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
- */
-static unsigned int sched_nr_latency = 8;
+unsigned int sysctl_sched_base_slice                   = 750000ULL;
+static unsigned int normalized_sysctl_sched_base_slice = 750000ULL;
  
  /*
   * After fork, child runs first. If set to 0 (default) then
@@ -112,18 +84,6 @@ static unsigned int sched_nr_latency = 8;
   */
  unsigned int sysctl_sched_child_runs_first __read_mostly;
  
-/*
- * SCHED_OTHER wake-up granularity.
- *
- * This option delays the preemption effects of decoupled workloads
- * and reduces their over-scheduling. Synchronous workloads will still
- * have immediate wakeup/sleep latencies.
- *
- * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
- */
-unsigned int sysctl_sched_wakeup_granularity                   = 1000000UL;
-static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
-
  const_debug unsigned int sysctl_sched_migration_cost   = 500000UL;
  
  int sched_thermal_decay_shift;
@@ -277,9 +237,7 @@ static void update_sysctl(void)
  
  #define SET_SYSCTL(name) \
         (sysctl_##name = (factor) * normalized_sysctl_##name)
-       SET_SYSCTL(sched_min_granularity);
-       SET_SYSCTL(sched_latency);
-       SET_SYSCTL(sched_wakeup_granularity);
+       SET_SYSCTL(sched_base_slice);
  #undef SET_SYSCTL
  }
  
@@ -347,6 +305,16 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight
         return mul_u64_u32_shr(delta_exec, fact, shift);
  }
  
+/*
+ * delta /= w
+ */
+static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
+{
+       if (unlikely(se->load.weight != NICE_0_LOAD))
+               delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
+
+       return delta;
+}
  
  const struct sched_class fair_sched_class;
  
@@ -601,13 +569,198 @@ static inline bool entity_before(const struct sched_entity *a,
         return (s64)(a->vruntime - b->vruntime) < 0;
  }
  
+static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+       return (s64)(se->vruntime - cfs_rq->min_vruntime);
+}
+
  #define __node_2_se(node) \
         rb_entry((node), struct sched_entity, run_node)
  
+/*
+ * Compute virtual time from the per-task service numbers:
+ *
+ * Fair schedulers conserve lag:
+ *
+ *   \Sum lag_i = 0
+ *
+ * Where lag_i is given by:
+ *
+ *   lag_i = S - s_i = w_i * (V - v_i)
+ *
+ * Where S is the ideal service time and V is its virtual time counterpart.
+ * Therefore:
+ *
+ *   \Sum lag_i = 0
+ *   \Sum w_i * (V - v_i) = 0
+ *   \Sum w_i * V - w_i * v_i = 0
+ *
+ * From which we can solve an expression for V in v_i (which we have in
+ * se->vruntime):
+ *
+ *       \Sum v_i * w_i   \Sum v_i * w_i
+ *   V = -------------- = --------------
+ *          \Sum w_i            W
+ *
+ * Specifically, this is the weighted average of all entity virtual runtimes.
+ *
+ * [[ NOTE: this is only equal to the ideal scheduler under the condition
+ *          that join/leave operations happen at lag_i = 0, otherwise the
+ *          virtual time has non-contiguous motion equivalent to:
+ *
+ *           V +-= lag_i / W
+ *
+ *         Also see the comment in place_entity() that deals with this. ]]
+ *
+ * However, since v_i is u64, and the multiplication could easily overflow
+ * transform it into a relative form that uses smaller quantities:
+ *
+ * Substitute: v_i == (v_i - v0) + v0
+ *
+ *     \Sum ((v_i - v0) + v0) * w_i   \Sum (v_i - v0) * w_i
+ * V = ---------------------------- = --------------------- + v0
+ *                  W                            W
+ *
+ * Which we track using:
+ *
+ *                    v0 := cfs_rq->min_vruntime
+ * \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime
+ *              \Sum w_i := cfs_rq->avg_load
+ *
+ * Since min_vruntime is a monotonic increasing variable that closely tracks
+ * the per-task service, these deltas: (v_i - v0), will be in the order of the
+ * maximal (virtual) lag induced in the system due to quantisation.
+ *
+ * Also, we use scale_load_down() to reduce the size.
+ *
+ * As measured, the max (key * weight) value was ~44 bits for a kernel build.
+ */
+static void
+avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+       unsigned long weight = scale_load_down(se->load.weight);
+       s64 key = entity_key(cfs_rq, se);
+
+       cfs_rq->avg_vruntime += key * weight;
+       cfs_rq->avg_load += weight;
+}
+
+static void
+avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+       unsigned long weight = scale_load_down(se->load.weight);
+       s64 key = entity_key(cfs_rq, se);
+
+       cfs_rq->avg_vruntime -= key * weight;
+       cfs_rq->avg_load -= weight;
+}
+
+static inline
+void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
+{
+       /*
+        * v' = v + d ==> avg_vruntime' = avg_vruntime - d*avg_load
+        */
+       cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta;
+}
+
+u64 avg_vruntime(struct cfs_rq *cfs_rq)
+{
+       struct sched_entity *curr = cfs_rq->curr;
+       s64 avg = cfs_rq->avg_vruntime;
+       long load = cfs_rq->avg_load;
+
+       if (curr && curr->on_rq) {
+               unsigned long weight = scale_load_down(curr->load.weight);
+
+               avg += entity_key(cfs_rq, curr) * weight;
+               load += weight;
+       }
+
+       if (load)
+               avg = div_s64(avg, load);
+
+       return cfs_rq->min_vruntime + avg;
+}
+
+/*
+ * lag_i = S - s_i = w_i * (V - v_i)
+ *
+ * However, since V is approximated by the weighted average of all entities it
+ * is possible -- by addition/removal/reweight to the tree -- to move V around
+ * and end up with a larger lag than we started with.
+ *
+ * Limit this to either double the slice length with a minimum of TICK_NSEC
+ * since that is the timing granularity.
+ *
+ * EEVDF gives the following limit for a steady state system:
+ *
+ *   -r_max < lag < max(r_max, q)
+ *
+ * XXX could add max_slice to the augmented data to track this.
+ */
+void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+       s64 lag, limit;
+
+       SCHED_WARN_ON(!se->on_rq);
+       lag = avg_vruntime(cfs_rq) - se->vruntime;
+
+       limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
+       se->vlag = clamp(lag, -limit, limit);
+}
+
+/*
+ * Entity is eligible once it received less service than it ought to have,
+ * i.e. lag >= 0.
+ *
+ * lag_i = S - s_i = w_i*(V - v_i)
+ *
+ * lag_i >= 0 -> V >= v_i
+ *
+ *     \Sum (v_i - v)*w_i
+ * V = ------------------ + v
+ *          \Sum w_i
+ *
+ * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i)
+ *
+ * Note: using 'avg_vruntime() > se->vruntime' is inaccurate due
+ *       to the loss in precision caused by the division.
+ */
+int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+       struct sched_entity *curr = cfs_rq->curr;
+       s64 avg = cfs_rq->avg_vruntime;
+       long load = cfs_rq->avg_load;
+
+       if (curr && curr->on_rq) {
+               unsigned long weight = scale_load_down(curr->load.weight);
+
+               avg += entity_key(cfs_rq, curr) * weight;
+               load += weight;
+       }
+
+       return avg >= entity_key(cfs_rq, se) * load;
+}
+
+static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
+{
+       u64 min_vruntime = cfs_rq->min_vruntime;
+       /*
+        * open coded max_vruntime() to allow updating avg_vruntime
+        */
+       s64 delta = (s64)(vruntime - min_vruntime);
+       if (delta > 0) {
+               avg_vruntime_update(cfs_rq, delta);
+               min_vruntime = vruntime;
+       }
+       return min_vruntime;
+}
+
  static void update_min_vruntime(struct cfs_rq *cfs_rq)
  {
+       struct sched_entity *se = __pick_first_entity(cfs_rq);
         struct sched_entity *curr = cfs_rq->curr;
-       struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);
  
         u64 vruntime = cfs_rq->min_vruntime;
  
@@ -618,9 +771,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
                         curr = NULL;
         }
  
-       if (leftmost) { /* non-empty tree */
-               struct sched_entity *se = __node_2_se(leftmost);
-
+       if (se) {
                 if (!curr)
                         vruntime = se->vruntime;
                 else
@@ -629,7 +780,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
  
         /* ensure we never gain time by being placed backwards. */
         u64_u32_store(cfs_rq->min_vruntime,
-                     max_vruntime(cfs_rq->min_vruntime, vruntime));
+                     __update_min_vruntime(cfs_rq, vruntime));
  }
  
  static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
@@ -637,17 +788,51 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
         return entity_before(__node_2_se(a), __node_2_se(b));
  }
  
+#define deadline_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; })
+
+static inline void __update_min_deadline(struct sched_entity *se, struct rb_node *node)
+{
+       if (node) {
+               struct sched_entity *rse = __node_2_se(node);
+               if (deadline_gt(min_deadline, se, rse))
+                       se->min_deadline = rse->min_deadline;
+       }
+}
+
+/*
+ * se->min_deadline = min(se->deadline, left->min_deadline, right->min_deadline)
+ */
+static inline bool min_deadline_update(struct sched_entity *se, bool exit)
+{
+       u64 old_min_deadline = se->min_deadline;
+       struct rb_node *node = &se->run_node;
+
+       se->min_deadline = se->deadline;
+       __update_min_deadline(se, node->rb_right);
+       __update_min_deadline(se, node->rb_left);
+
+       return se->min_deadline == old_min_deadline;
+}
+
+RB_DECLARE_CALLBACKS(static, min_deadline_cb, struct sched_entity,
+                    run_node, min_deadline, min_deadline_update);
+
  /*
   * Enqueue an entity into the rb-tree:
   */
  static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
-       rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less);
+       avg_vruntime_add(cfs_rq, se);
+       se->min_deadline = se->deadline;
+       rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
+                               __entity_less, &min_deadline_cb);
  }
  
  static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
-       rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
+       rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
+                                 &min_deadline_cb);
+       avg_vruntime_sub(cfs_rq, se);
  }
  
  struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
@@ -660,14 +845,88 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
         return __node_2_se(left);
  }
  
-static struct sched_entity *__pick_next_entity(struct sched_entity *se)
+/*
+ * Earliest Eligible Virtual Deadline First
+ *
+ * In order to provide latency guarantees for different request sizes
+ * EEVDF selects the best runnable task from two criteria:
+ *
+ *  1) the task must be eligible (must be owed service)
+ *
+ *  2) from those tasks that meet 1), we select the one
+ *     with the earliest virtual deadline.
+ *
+ * We can do this in O(log n) time due to an augmented RB-tree. The
+ * tree keeps the entries sorted on service, but also functions as a
+ * heap based on the deadline by keeping:
+ *
+ *  se->min_deadline = min(se->deadline, se->{left,right}->min_deadline)
+ *
+ * Which allows an EDF like search on (sub)trees.
+ */
+static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
  {
-       struct rb_node *next = rb_next(&se->run_node);
+       struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
+       struct sched_entity *curr = cfs_rq->curr;
+       struct sched_entity *best = NULL;
  
-       if (!next)
-               return NULL;
+       if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
+               curr = NULL;
+
+       /*
+        * Once selected, run a task until it either becomes non-eligible or
+        * until it gets a new slice. See the HACK in set_next_entity().
+        */
+       if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline)
+               return curr;
+
+       while (node) {
+               struct sched_entity *se = __node_2_se(node);
  
-       return __node_2_se(next);
+               /*
+                * If this entity is not eligible, try the left subtree.
+                */
+               if (!entity_eligible(cfs_rq, se)) {
+                       node = node->rb_left;
+                       continue;
+               }
+
+               /*
+                * If this entity has an earlier deadline than the previous
+                * best, take this one. If it also has the earliest deadline
+                * of its subtree, we're done.
+                */
+               if (!best || deadline_gt(deadline, best, se)) {
+                       best = se;
+                       if (best->deadline == best->min_deadline)
+                               break;
+               }
+
+               /*
+                * If the earliest deadline in this subtree is in the fully
+                * eligible left half of our space, go there.
+                */
+               if (node->rb_left &&
+                   __node_2_se(node->rb_left)->min_deadline == se->min_deadline) {
+                       node = node->rb_left;
+                       continue;
+               }
+
+               node = node->rb_right;
+       }
+
+       if (!best || (curr && deadline_gt(deadline, best, curr)))
+               best = curr;
+
+       if (unlikely(!best)) {
+               struct sched_entity *left = __pick_first_entity(cfs_rq);
+               if (left) {
+                       pr_err("EEVDF scheduling fail, picking leftmost\n");
+                       return left;
+               }
+       }
+
+       return best;
  }
  
  #ifdef CONFIG_SCHED_DEBUG
@@ -689,14 +948,9 @@ int sched_update_scaling(void)
  {
         unsigned int factor = get_update_sysctl_factor();
  
-       sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
-                                       sysctl_sched_min_granularity);
-
  #define WRT_SYSCTL(name) \
         (normalized_sysctl_##name = sysctl_##name / (factor))
-       WRT_SYSCTL(sched_min_granularity);
-       WRT_SYSCTL(sched_latency);
-       WRT_SYSCTL(sched_wakeup_granularity);
+       WRT_SYSCTL(sched_base_slice);
  #undef WRT_SYSCTL
  
         return 0;
@@ -704,90 +958,36 @@ int sched_update_scaling(void)
  #endif
  #endif
  
-/*
- * delta /= w
- */
-static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
-{
-       if (unlikely(se->load.weight != NICE_0_LOAD))
-               delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
-
-       return delta;
-}
+static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
  
  /*
- * The idea is to set a period in which each task runs once.
- *
- * When there are too many tasks (sched_nr_latency) we have to stretch
- * this period because otherwise the slices get too small.
- *
- * p = (nr <= nl) ? l : l*nr/nl
+ * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i
+ * this is probably good enough.
   */
-static u64 __sched_period(unsigned long nr_running)
+static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
-       if (unlikely(nr_running > sched_nr_latency))
-               return nr_running * sysctl_sched_min_granularity;
-       else
-               return sysctl_sched_latency;
-}
-
-static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq);
-
-/*
- * We calculate the wall-time slice from the period by taking a part
- * proportional to the weight.
- *
- * s = p*P[w/rw]
- */
-static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-       unsigned int nr_running = cfs_rq->nr_running;
-       struct sched_entity *init_se = se;
-       unsigned int min_gran;
-       u64 slice;
-
-       if (sched_feat(ALT_PERIOD))
-               nr_running = rq_of(cfs_rq)->cfs.h_nr_running;
-
-       slice = __sched_period(nr_running + !se->on_rq);
-
-       for_each_sched_entity(se) {
-               struct load_weight *load;
-               struct load_weight lw;
-               struct cfs_rq *qcfs_rq;
-
-               qcfs_rq = cfs_rq_of(se);
-               load = &qcfs_rq->load;
-
-               if (unlikely(!se->on_rq)) {
-                       lw = qcfs_rq->load;
+       if ((s64)(se->vruntime - se->deadline) < 0)
+               return;
  
-                       update_load_add(&lw, se->load.weight);
-                       load = &lw;
-               }
-               slice = __calc_delta(slice, se->load.weight, load);
-       }
+       /*
+        * For EEVDF the virtual time slope is determined by w_i (iow.
+        * nice) while the request time r_i is determined by
+        * sysctl_sched_base_slice.
+        */
+       se->slice = sysctl_sched_base_slice;
  
-       if (sched_feat(BASE_SLICE)) {
-               if (se_is_idle(init_se) && !sched_idle_cfs_rq(cfs_rq))
-                       min_gran = sysctl_sched_idle_min_granularity;
-               else
-                       min_gran = sysctl_sched_min_granularity;
+       /*
+        * EEVDF: vd_i = ve_i + r_i / w_i
+        */
+       se->deadline = se->vruntime + calc_delta_fair(se->slice, se);
  
-               slice = max_t(u64, slice, min_gran);
+       /*
+        * The task has consumed its request, reschedule.
+        */
+       if (cfs_rq->nr_running > 1) {
+               resched_curr(rq_of(cfs_rq));
+               clear_buddies(cfs_rq, se);
         }
-
-       return slice;
-}
-
-/*
- * We calculate the vruntime slice of a to-be-inserted task.
- *
- * vs = s/w
- */
-static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-       return calc_delta_fair(sched_slice(cfs_rq, se), se);
  }
  
  #include "pelt.h"
@@ -922,6 +1122,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
         schedstat_add(cfs_rq->exec_clock, delta_exec);
  
         curr->vruntime += calc_delta_fair(delta_exec, curr);
+       update_deadline(cfs_rq, curr);
         update_min_vruntime(cfs_rq);
  
         if (entity_is_task(curr)) {
@@ -3376,16 +3577,36 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
  static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
                             unsigned long weight)
  {
+       unsigned long old_weight = se->load.weight;
+
         if (se->on_rq) {
                 /* commit outstanding execution time */
                 if (cfs_rq->curr == se)
                         update_curr(cfs_rq);
+               else
+                       avg_vruntime_sub(cfs_rq, se);
                 update_load_sub(&cfs_rq->load, se->load.weight);
         }
         dequeue_load_avg(cfs_rq, se);
  
         update_load_set(&se->load, weight);
  
+       if (!se->on_rq) {
+               /*
+                * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
+                * we need to scale se->vlag when w_i changes.
+                */
+               se->vlag = div_s64(se->vlag * old_weight, weight);
+       } else {
+               s64 deadline = se->deadline - se->vruntime;
+               /*
+                * When the weight changes, the virtual time slope changes and
+                * we should adjust the relative virtual deadline accordingly.
+                */
+               deadline = div_s64(deadline * old_weight, weight);
+               se->deadline = se->vruntime + deadline;
+       }
+
  #ifdef CONFIG_SMP
         do {
                 u32 divider = get_pelt_divider(&se->avg);
@@ -3395,9 +3616,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
  #endif
  
         enqueue_load_avg(cfs_rq, se);
-       if (se->on_rq)
+       if (se->on_rq) {
                 update_load_add(&cfs_rq->load, se->load.weight);
-
+               if (cfs_rq->curr != se)
+                       avg_vruntime_add(cfs_rq, se);
+       }
  }
  
  void reweight_task(struct task_struct *p, int prio)
@@ -4693,159 +4916,125 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
  
  #endif /* CONFIG_SMP */
  
-static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-#ifdef CONFIG_SCHED_DEBUG
-       s64 d = se->vruntime - cfs_rq->min_vruntime;
-
-       if (d < 0)
-               d = -d;
-
-       if (d > 3*sysctl_sched_latency)
-               schedstat_inc(cfs_rq->nr_spread_over);
-#endif
-}
-
-static inline bool entity_is_long_sleeper(struct sched_entity *se)
+static void
+place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
  {
-       struct cfs_rq *cfs_rq;
-       u64 sleep_time;
-
-       if (se->exec_start == 0)
-               return false;
+       u64 vslice = calc_delta_fair(se->slice, se);
+       u64 vruntime = avg_vruntime(cfs_rq);
+       s64 lag = 0;
  
-       cfs_rq = cfs_rq_of(se);
-
-       sleep_time = rq_clock_task(rq_of(cfs_rq));
+       /*
+        * Due to how V is constructed as the weighted average of entities,
+        * adding tasks with positive lag, or removing tasks with negative lag
+        * will move 'time' backwards, this can screw around with the lag of
+        * other tasks.
+        *
+        * EEVDF: placement strategy #1 / #2
+        */
+       if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) {
+               struct sched_entity *curr = cfs_rq->curr;
+               unsigned long load;
  
-       /* Happen while migrating because of clock task divergence */
-       if (sleep_time <= se->exec_start)
-               return false;
+               lag = se->vlag;
  
-       sleep_time -= se->exec_start;
-       if (sleep_time > ((1ULL << 63) / scale_load_down(NICE_0_LOAD)))
-               return true;
+               /*
+                * If we want to place a task and preserve lag, we have to
+                * consider the effect of the new entity on the weighted
+                * average and compensate for this, otherwise lag can quickly
+                * evaporate.
+                *
+                * Lag is defined as:
+                *
+                *   lag_i = S - s_i = w_i * (V - v_i)
+                *
+                * To avoid the 'w_i' term all over the place, we only track
+                * the virtual lag:
+                *
+                *   vl_i = V - v_i <=> v_i = V - vl_i
+                *
+                * And we take V to be the weighted average of all v:
+                *
+                *   V = (\Sum w_j*v_j) / W
+                *
+                * Where W is: \Sum w_j
+                *
+                * Then, the weighted average after adding an entity with lag
+                * vl_i is given by:
+                *
+                *   V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i)
+                *      = (W*V + w_i*(V - vl_i)) / (W + w_i)
+                *      = (W*V + w_i*V - w_i*vl_i) / (W + w_i)
+                *      = (V*(W + w_i) - w_i*vl_i) / (W + w_i)
+                *      = V - w_i*vl_i / (W + w_i)
+                *
+                * And the actual lag after adding an entity with vl_i is:
+                *
+                *   vl'_i = V' - v_i
+                *         = V - w_i*vl_i / (W + w_i) - (V - vl_i)
+                *         = vl_i - w_i*vl_i / (W + w_i)
+                *
+                * Which is strictly less than vl_i. So in order to preserve lag
+                * we should inflate the lag before placement such that the
+                * effective lag after placement comes out right.
+                *
+                * As such, invert the above relation for vl'_i to get the vl_i
+                * we need to use such that the lag after placement is the lag
+                * we computed before dequeue.
+                *
+                *   vl'_i = vl_i - w_i*vl_i / (W + w_i)
+                *         = ((W + w_i)*vl_i - w_i*vl_i) / (W + w_i)
+                *
+                *   (W + w_i)*vl'_i = (W + w_i)*vl_i - w_i*vl_i
+                *                   = W*vl_i
+                *
+                *   vl_i = (W + w_i)*vl'_i / W
+                */
+               load = cfs_rq->avg_load;
+               if (curr && curr->on_rq)
+                       load += scale_load_down(curr->load.weight);
  
-       return false;
-}
+               lag *= load + scale_load_down(se->load.weight);
+               if (WARN_ON_ONCE(!load))
+                       load = 1;
+               lag = div_s64(lag, load);
+       }
  
-static void
-place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
-{
-       u64 vruntime = cfs_rq->min_vruntime;
+       se->vruntime = vruntime - lag;
  
         /*
-        * The 'current' period is already promised to the current tasks,
-        * however the extra weight of the new task will slow them down a
-        * little, place the new task so that it fits in the slot that
-        * stays open at the end.
+        * When joining the competition; the existing tasks will be,
+        * on average, halfway through their slice, as such start tasks
+        * off with half a slice to ease into the competition.
          */
-       if (initial && sched_feat(START_DEBIT))
-               vruntime += sched_vslice(cfs_rq, se);
+       if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL))
+               vslice /= 2;
  
-       /* sleeps up to a single latency don't count. */
-       if (!initial) {
-               unsigned long thresh;
-
-               if (se_is_idle(se))
-                       thresh = sysctl_sched_min_granularity;
-               else
-                       thresh = sysctl_sched_latency;
-
-               /*
-                * Halve their sleep time's effect, to allow
-                * for a gentler effect of sleepers:
-                */
-               if (sched_feat(GENTLE_FAIR_SLEEPERS))
-                       thresh >>= 1;
-
-               vruntime -= thresh;
-       }
-
-       /*
-        * Pull vruntime of the entity being placed to the base level of
-        * cfs_rq, to prevent boosting it if placed backwards.
-        * However, min_vruntime can advance much faster than real time, with
-        * the extreme being when an entity with the minimal weight always runs
-        * on the cfs_rq. If the waking entity slept for a long time, its
-        * vruntime difference from min_vruntime may overflow s64 and their
-        * comparison may get inversed, so ignore the entity's original
-        * vruntime in that case.
-        * The maximal vruntime speedup is given by the ratio of normal to
-        * minimal weight: scale_load_down(NICE_0_LOAD) / MIN_SHARES.
-        * When placing a migrated waking entity, its exec_start has been set
-        * from a different rq. In order to take into account a possible
-        * divergence between new and prev rq's clocks task because of irq and
-        * stolen time, we take an additional margin.
-        * So, cutting off on the sleep time of
-        *     2^63 / scale_load_down(NICE_0_LOAD) ~ 104 days
-        * should be safe.
-        */
-       if (entity_is_long_sleeper(se))
-               se->vruntime = vruntime;
-       else
-               se->vruntime = max_vruntime(se->vruntime, vruntime);
+       /*
+        * EEVDF: vd_i = ve_i + r_i/w_i
+        */
+       se->deadline = se->vruntime + vslice;
  }
  
  static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
  
  static inline bool cfs_bandwidth_used(void);
  
-/*
- * MIGRATION
- *
- *     dequeue
- *       update_curr()
- *         update_min_vruntime()
- *       vruntime -= min_vruntime
- *
- *     enqueue
- *       update_curr()
- *         update_min_vruntime()
- *       vruntime += min_vruntime
- *
- * this way the vruntime transition between RQs is done when both
- * min_vruntime are up-to-date.
- *
- * WAKEUP (remote)
- *
- *     ->migrate_task_rq_fair() (p->state == TASK_WAKING)
- *       vruntime -= min_vruntime
- *
- *     enqueue
- *       update_curr()
- *         update_min_vruntime()
- *       vruntime += min_vruntime
- *
- * this way we don't have the most up-to-date min_vruntime on the originating
- * CPU and an up-to-date min_vruntime on the destination CPU.
- */
-
  static void
  enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
  {
-       bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
         bool curr = cfs_rq->curr == se;
  
         /*
          * If we're the current task, we must renormalise before calling
          * update_curr().
          */
-       if (renorm && curr)
-               se->vruntime += cfs_rq->min_vruntime;
+       if (curr)
+               place_entity(cfs_rq, se, flags);
  
         update_curr(cfs_rq);
  
         /*
-        * Otherwise, renormalise after, such that we're placed at the current
-        * moment in time, instead of some random moment in the past. Being
-        * placed in the past could significantly boost this task to the
-        * fairness detriment of existing tasks.
-        */
-       if (renorm && !curr)
-               se->vruntime += cfs_rq->min_vruntime;
-
-       /*
          * When enqueuing a sched_entity, we must:
          *   - Update loads to have both entity and cfs_rq synced with now.
          *   - For group_entity, update its runnable_weight to reflect the new
@@ -4856,37 +5045,46 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
          */
         update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
         se_update_runnable(se);
+       /*
+        * XXX update_load_avg() above will have attached us to the pelt sum;
+        * but update_cfs_group() here will re-adjust the weight and have to
+        * undo/redo all that. Seems wasteful.
+        */
         update_cfs_group(se);
+
+       /*
+        * XXX now that the entity has been re-weighted, and its lag adjusted,
+        * we can place the entity.
+        */
+       if (!curr)
+               place_entity(cfs_rq, se, flags);
+
         account_entity_enqueue(cfs_rq, se);
  
-       if (flags & ENQUEUE_WAKEUP)
-               place_entity(cfs_rq, se, 0);
         /* Entity has migrated, no longer consider this task hot */
         if (flags & ENQUEUE_MIGRATED)
                 se->exec_start = 0;
  
         check_schedstat_required();
         update_stats_enqueue_fair(cfs_rq, se, flags);
-       check_spread(cfs_rq, se);
         if (!curr)
                 __enqueue_entity(cfs_rq, se);
         se->on_rq = 1;
  
         if (cfs_rq->nr_running == 1) {
                 check_enqueue_throttle(cfs_rq);
-               if (!throttled_hierarchy(cfs_rq))
+               if (!throttled_hierarchy(cfs_rq)) {
                         list_add_leaf_cfs_rq(cfs_rq);
-       }
-}
-
-static void __clear_buddies_last(struct sched_entity *se)
-{
-       for_each_sched_entity(se) {
-               struct cfs_rq *cfs_rq = cfs_rq_of(se);
-               if (cfs_rq->last != se)
-                       break;
+               } else {
+#ifdef CONFIG_CFS_BANDWIDTH
+                       struct rq *rq = rq_of(cfs_rq);
  
-               cfs_rq->last = NULL;
+                       if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
+                               cfs_rq->throttled_clock = rq_clock(rq);
+                       if (!cfs_rq->throttled_clock_self)
+                               cfs_rq->throttled_clock_self = rq_clock(rq);
+#endif
+               }
         }
  }
  
@@ -4901,27 +5099,10 @@ static void __clear_buddies_next(struct sched_entity *se)
         }
  }
  
-static void __clear_buddies_skip(struct sched_entity *se)
-{
-       for_each_sched_entity(se) {
-               struct cfs_rq *cfs_rq = cfs_rq_of(se);
-               if (cfs_rq->skip != se)
-                       break;
-
-               cfs_rq->skip = NULL;
-       }
-}
-
  static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
-       if (cfs_rq->last == se)
-               __clear_buddies_last(se);
-
         if (cfs_rq->next == se)
                 __clear_buddies_next(se);
-
-       if (cfs_rq->skip == se)
-               __clear_buddies_skip(se);
  }
  
  static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
@@ -4955,82 +5136,28 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
  
         clear_buddies(cfs_rq, se);
  
+       update_entity_lag(cfs_rq, se);
         if (se != cfs_rq->curr)
                 __dequeue_entity(cfs_rq, se);
-       se->on_rq = 0;
-       account_entity_dequeue(cfs_rq, se);
-
-       /*
-        * Normalize after update_curr(); which will also have moved
-        * min_vruntime if @se is the one holding it back. But before doing
-        * update_min_vruntime() again, which will discount @se's position and
-        * can move min_vruntime forward still more.
-        */
-       if (!(flags & DEQUEUE_SLEEP))
-               se->vruntime -= cfs_rq->min_vruntime;
-
-       /* return excess runtime on last dequeue */
-       return_cfs_rq_runtime(cfs_rq);
-
-       update_cfs_group(se);
-
-       /*
-        * Now advance min_vruntime if @se was the entity holding it back,
-        * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
-        * put back on, and if we advance min_vruntime, we'll be placed back
-        * further than we started -- ie. we'll be penalized.
-        */
-       if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
-               update_min_vruntime(cfs_rq);
-
-       if (cfs_rq->nr_running == 0)
-               update_idle_cfs_rq_clock_pelt(cfs_rq);
-}
-
-/*
- * Preempt the current task with a newly woken task if needed:
- */
-static void
-check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
-{
-       unsigned long ideal_runtime, delta_exec;
-       struct sched_entity *se;
-       s64 delta;
+       se->on_rq = 0;
+       account_entity_dequeue(cfs_rq, se);
  
-       /*
-        * When many tasks blow up the sched_period; it is possible that
-        * sched_slice() reports unusually large results (when many tasks are
-        * very light for example). Therefore impose a maximum.
-        */
-       ideal_runtime = min_t(u64, sched_slice(cfs_rq, curr), sysctl_sched_latency);
+       /* return excess runtime on last dequeue */
+       return_cfs_rq_runtime(cfs_rq);
  
-       delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
-       if (delta_exec > ideal_runtime) {
-               resched_curr(rq_of(cfs_rq));
-               /*
-                * The current task ran long enough, ensure it doesn't get
-                * re-elected due to buddy favours.
-                */
-               clear_buddies(cfs_rq, curr);
-               return;
-       }
+       update_cfs_group(se);
  
         /*
-        * Ensure that a task that missed wakeup preemption by a
-        * narrow margin doesn't have to wait for a full slice.
-        * This also mitigates buddy induced latencies under load.
+        * Now advance min_vruntime if @se was the entity holding it back,
+        * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
+        * put back on, and if we advance min_vruntime, we'll be placed back
+        * further than we started -- ie. we'll be penalized.
          */
-       if (delta_exec < sysctl_sched_min_granularity)
-               return;
-
-       se = __pick_first_entity(cfs_rq);
-       delta = curr->vruntime - se->vruntime;
-
-       if (delta < 0)
-               return;
+       if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
+               update_min_vruntime(cfs_rq);
  
-       if (delta > ideal_runtime)
-               resched_curr(rq_of(cfs_rq));
+       if (cfs_rq->nr_running == 0)
+               update_idle_cfs_rq_clock_pelt(cfs_rq);
  }
  
  static void
@@ -5048,6 +5175,11 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
                 update_stats_wait_end_fair(cfs_rq, se);
                 __dequeue_entity(cfs_rq, se);
                 update_load_avg(cfs_rq, se, UPDATE_TG);
+               /*
+                * HACK, stash a copy of deadline at the point of pick in vlag,
+                * which isn't used until dequeue.
+                */
+               se->vlag = se->deadline;
         }
  
         update_stats_curr_start(cfs_rq, se);
@@ -5071,9 +5203,6 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
         se->prev_sum_exec_runtime = se->sum_exec_runtime;
  }
  
-static int
-wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
-
  /*
   * Pick the next process, keeping these things in mind, in this order:
   * 1) keep things fair between processes/task groups
@@ -5084,50 +5213,14 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
  static struct sched_entity *
  pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
  {
-       struct sched_entity *left = __pick_first_entity(cfs_rq);
-       struct sched_entity *se;
-
         /*
-        * If curr is set we have to see if its left of the leftmost entity
-        * still in the tree, provided there was anything in the tree at all.
+        * Enabling NEXT_BUDDY will affect latency but not fairness.
          */
-       if (!left || (curr && entity_before(curr, left)))
-               left = curr;
+       if (sched_feat(NEXT_BUDDY) &&
+           cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next))
+               return cfs_rq->next;
  
-       se = left; /* ideally we run the leftmost entity */
-
-       /*
-        * Avoid running the skip buddy, if running something else can
-        * be done without getting too unfair.
-        */
-       if (cfs_rq->skip && cfs_rq->skip == se) {
-               struct sched_entity *second;
-
-               if (se == curr) {
-                       second = __pick_first_entity(cfs_rq);
-               } else {
-                       second = __pick_next_entity(se);
-                       if (!second || (curr && entity_before(curr, second)))
-                               second = curr;
-               }
-
-               if (second && wakeup_preempt_entity(second, left) < 1)
-                       se = second;
-       }
-
-       if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
-               /*
-                * Someone really wants this to run. If it's not unfair, run it.
-                */
-               se = cfs_rq->next;
-       } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) {
-               /*
-                * Prefer last buddy, try to return the CPU to a preempted task.
-                */
-               se = cfs_rq->last;
-       }
-
-       return se;
+       return pick_eevdf(cfs_rq);
  }
  
  static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
@@ -5144,8 +5237,6 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
         /* throttle cfs_rqs exceeding runtime */
         check_cfs_rq_runtime(cfs_rq);
  
-       check_spread(cfs_rq, prev);
-
         if (prev->on_rq) {
                 update_stats_wait_start_fair(cfs_rq, prev);
                 /* Put 'current' back into the tree. */
@@ -5186,9 +5277,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
                         hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
                 return;
  #endif
-
-       if (cfs_rq->nr_running > 1)
-               check_preempt_tick(cfs_rq, curr);
  }
  
  
@@ -5378,6 +5466,17 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
                 /* Add cfs_rq with load or one or more already running entities to the list */
                 if (!cfs_rq_is_decayed(cfs_rq))
                         list_add_leaf_cfs_rq(cfs_rq);
+
+               if (cfs_rq->throttled_clock_self) {
+                       u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
+
+                       cfs_rq->throttled_clock_self = 0;
+
+                       if (SCHED_WARN_ON((s64)delta < 0))
+                               delta = 0;
+
+                       cfs_rq->throttled_clock_self_time += delta;
+               }
         }
  
         return 0;
@@ -5392,6 +5491,10 @@ static int tg_throttle_down(struct task_group *tg, void *data)
         if (!cfs_rq->throttle_count) {
                 cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
                 list_del_leaf_cfs_rq(cfs_rq);
+
+               SCHED_WARN_ON(cfs_rq->throttled_clock_self);
+               if (cfs_rq->nr_running)
+                       cfs_rq->throttled_clock_self = rq_clock(rq);
         }
         cfs_rq->throttle_count++;
  
@@ -5481,7 +5584,9 @@ done:
          * throttled-list.  rq->lock protects completion.
          */
         cfs_rq->throttled = 1;
-       cfs_rq->throttled_clock = rq_clock(rq);
+       SCHED_WARN_ON(cfs_rq->throttled_clock);
+       if (cfs_rq->nr_running)
+               cfs_rq->throttled_clock = rq_clock(rq);
         return true;
  }
  
@@ -5499,7 +5604,10 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
         update_rq_clock(rq);
  
         raw_spin_lock(&cfs_b->lock);
-       cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
+       if (cfs_rq->throttled_clock) {
+               cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
+               cfs_rq->throttled_clock = 0;
+       }
         list_del_rcu(&cfs_rq->throttled_list);
         raw_spin_unlock(&cfs_b->lock);
  
@@ -6015,13 +6123,14 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
         return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
  }
  
-void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent)
  {
         raw_spin_lock_init(&cfs_b->lock);
         cfs_b->runtime = 0;
         cfs_b->quota = RUNTIME_INF;
         cfs_b->period = ns_to_ktime(default_cfs_period());
         cfs_b->burst = 0;
+       cfs_b->hierarchical_quota = parent ? parent->hierarchical_quota : RUNTIME_INF;
  
         INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
         hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
@@ -6158,6 +6267,46 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
         rq_clock_stop_loop_update(rq);
  }
  
+bool cfs_task_bw_constrained(struct task_struct *p)
+{
+       struct cfs_rq *cfs_rq = task_cfs_rq(p);
+
+       if (!cfs_bandwidth_used())
+               return false;
+
+       if (cfs_rq->runtime_enabled ||
+           tg_cfs_bandwidth(cfs_rq->tg)->hierarchical_quota != RUNTIME_INF)
+               return true;
+
+       return false;
+}
+
+#ifdef CONFIG_NO_HZ_FULL
+/* called from pick_next_task_fair() */
+static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p)
+{
+       int cpu = cpu_of(rq);
+
+       if (!sched_feat(HZ_BW) || !cfs_bandwidth_used())
+               return;
+
+       if (!tick_nohz_full_cpu(cpu))
+               return;
+
+       if (rq->nr_running != 1)
+               return;
+
+       /*
+        *  We know there is only one task runnable and we've just picked it. The
+        *  normal enqueue path will have cleared TICK_DEP_BIT_SCHED if we would
+        *  otherwise be able to stop the tick. Just need to check if we are using
+        *  bandwidth control.
+        */
+       if (cfs_task_bw_constrained(p))
+               tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
+}
+#endif
+
  #else /* CONFIG_CFS_BANDWIDTH */
  
  static inline bool cfs_bandwidth_used(void)
@@ -6188,7 +6337,7 @@ static inline int throttled_lb_pair(struct task_group *tg,
  }
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
-void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
+void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent) {}
  static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
  #endif
  
@@ -6199,9 +6348,18 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
  static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
  static inline void update_runtime_enabled(struct rq *rq) {}
  static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
-
+#ifdef CONFIG_CGROUP_SCHED
+bool cfs_task_bw_constrained(struct task_struct *p)
+{
+       return false;
+}
+#endif
  #endif /* CONFIG_CFS_BANDWIDTH */
  
+#if !defined(CONFIG_CFS_BANDWIDTH) || !defined(CONFIG_NO_HZ_FULL)
+static inline void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) {}
+#endif
+
  /**************************************************
   * CFS operations on tasks:
   */
@@ -6210,13 +6368,12 @@ static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
  static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
  {
         struct sched_entity *se = &p->se;
-       struct cfs_rq *cfs_rq = cfs_rq_of(se);
  
         SCHED_WARN_ON(task_rq(p) != rq);
  
         if (rq->cfs.h_nr_running > 1) {
-               u64 slice = sched_slice(cfs_rq, se);
                 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+               u64 slice = se->slice;
                 s64 delta = slice - ran;
  
                 if (delta < 0) {
@@ -6240,8 +6397,7 @@ static void hrtick_update(struct rq *rq)
         if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class)
                 return;
  
-       if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
-               hrtick_start_fair(rq, curr);
+       hrtick_start_fair(rq, curr);
  }
  #else /* !CONFIG_SCHED_HRTICK */
  static inline void
@@ -6282,17 +6438,6 @@ static int sched_idle_rq(struct rq *rq)
                         rq->nr_running);
  }
  
-/*
- * Returns true if cfs_rq only has SCHED_IDLE entities enqueued. Note the use
- * of idle_nr_running, which does not consider idle descendants of normal
- * entities.
- */
-static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq)
-{
-       return cfs_rq->nr_running &&
-               cfs_rq->nr_running == cfs_rq->idle_nr_running;
-}
-
  #ifdef CONFIG_SMP
  static int sched_idle_cpu(int cpu)
  {
@@ -7065,7 +7210,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
         util_min = uclamp_eff_value(p, UCLAMP_MIN);
         util_max = uclamp_eff_value(p, UCLAMP_MAX);
  
-       for_each_cpu_wrap(cpu, cpus, target + 1) {
+       for_each_cpu_wrap(cpu, cpus, target) {
                 unsigned long cpu_cap = capacity_of(cpu);
  
                 if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
@@ -7289,9 +7434,6 @@ cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost)
  
                 util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
  
-               if (boost)
-                       util_est = max(util_est, runnable);
-
                 /*
                  * During wake-up @p isn't enqueued yet and doesn't contribute
                  * to any cpu_rq(cpu)->cfs.avg.util_est.enqueued.
@@ -7741,6 +7883,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
         if (wake_flags & WF_TTWU) {
                 record_wakee(p);
  
+               if ((wake_flags & WF_CURRENT_CPU) &&
+                   cpumask_test_cpu(cpu, p->cpus_ptr))
+                       return cpu;
+
                 if (sched_energy_enabled()) {
                         new_cpu = find_energy_efficient_cpu(p, prev_cpu);
                         if (new_cpu >= 0)
@@ -7798,18 +7944,6 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
  {
         struct sched_entity *se = &p->se;
  
-       /*
-        * As blocked tasks retain absolute vruntime the migration needs to
-        * deal with this by subtracting the old and adding the new
-        * min_vruntime -- the latter is done by enqueue_entity() when placing
-        * the task on the new runqueue.
-        */
-       if (READ_ONCE(p->__state) == TASK_WAKING) {
-               struct cfs_rq *cfs_rq = cfs_rq_of(se);
-
-               se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
-       }
-
         if (!task_on_rq_migrating(p)) {
                 remove_entity_load_avg(se);
  
@@ -7847,66 +7981,6 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
  }
  #endif /* CONFIG_SMP */
  
-static unsigned long wakeup_gran(struct sched_entity *se)
-{
-       unsigned long gran = sysctl_sched_wakeup_granularity;
-
-       /*
-        * Since its curr running now, convert the gran from real-time
-        * to virtual-time in his units.
-        *
-        * By using 'se' instead of 'curr' we penalize light tasks, so
-        * they get preempted easier. That is, if 'se' < 'curr' then
-        * the resulting gran will be larger, therefore penalizing the
-        * lighter, if otoh 'se' > 'curr' then the resulting gran will
-        * be smaller, again penalizing the lighter task.
-        *
-        * This is especially important for buddies when the leftmost
-        * task is higher priority than the buddy.
-        */
-       return calc_delta_fair(gran, se);
-}
-
-/*
- * Should 'se' preempt 'curr'.
- *
- *             |s1
- *        |s2
- *   |s3
- *         g
- *      |<--->|c
- *
- *  w(c, s1) = -1
- *  w(c, s2) =  0
- *  w(c, s3) =  1
- *
- */
-static int
-wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
-{
-       s64 gran, vdiff = curr->vruntime - se->vruntime;
-
-       if (vdiff <= 0)
-               return -1;
-
-       gran = wakeup_gran(se);
-       if (vdiff > gran)
-               return 1;
-
-       return 0;
-}
-
-static void set_last_buddy(struct sched_entity *se)
-{
-       for_each_sched_entity(se) {
-               if (SCHED_WARN_ON(!se->on_rq))
-                       return;
-               if (se_is_idle(se))
-                       return;
-               cfs_rq_of(se)->last = se;
-       }
-}
-
  static void set_next_buddy(struct sched_entity *se)
  {
         for_each_sched_entity(se) {
@@ -7918,12 +7992,6 @@ static void set_next_buddy(struct sched_entity *se)
         }
  }
  
-static void set_skip_buddy(struct sched_entity *se)
-{
-       for_each_sched_entity(se)
-               cfs_rq_of(se)->skip = se;
-}
-
  /*
   * Preempt the current task with a newly woken task if needed:
   */
@@ -7932,7 +8000,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
         struct task_struct *curr = rq->curr;
         struct sched_entity *se = &curr->se, *pse = &p->se;
         struct cfs_rq *cfs_rq = task_cfs_rq(curr);
-       int scale = cfs_rq->nr_running >= sched_nr_latency;
         int next_buddy_marked = 0;
         int cse_is_idle, pse_is_idle;
  
@@ -7948,7 +8015,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
         if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
                 return;
  
-       if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
+       if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK)) {
                 set_next_buddy(pse);
                 next_buddy_marked = 1;
         }
@@ -7993,35 +8060,19 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
         if (cse_is_idle != pse_is_idle)
                 return;
  
-       update_curr(cfs_rq_of(se));
-       if (wakeup_preempt_entity(se, pse) == 1) {
-               /*
-                * Bias pick_next to pick the sched entity that is
-                * triggering this preemption.
-                */
-               if (!next_buddy_marked)
-                       set_next_buddy(pse);
+       cfs_rq = cfs_rq_of(se);
+       update_curr(cfs_rq);
+
+       /*
+        * XXX pick_eevdf(cfs_rq) != se ?
+        */
+       if (pick_eevdf(cfs_rq) == pse)
                 goto preempt;
-       }
  
         return;
  
  preempt:
         resched_curr(rq);
-       /*
-        * Only set the backward buddy when the current task is still
-        * on the rq. This can happen when a wakeup gets interleaved
-        * with schedule on the ->pre_schedule() or idle_balance()
-        * point, either of which can * drop the rq lock.
-        *
-        * Also, during early boot the idle thread is in the fair class,
-        * for obvious reasons its a bad idea to schedule back to it.
-        */
-       if (unlikely(!se->on_rq || curr == rq->idle))
-               return;
-
-       if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
-               set_last_buddy(se);
  }
  
  #ifdef CONFIG_SMP
@@ -8172,6 +8223,7 @@ done: __maybe_unused;
                 hrtick_start_fair(rq, p);
  
         update_misfit_status(p, rq);
+       sched_fair_update_stop_tick(rq, p);
  
         return p;
  
@@ -8222,8 +8274,6 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
  
  /*
   * sched_yield() is very simple
- *
- * The magic of dealing with the ->skip buddy is in pick_next_entity.
   */
  static void yield_task_fair(struct rq *rq)
  {
@@ -8239,21 +8289,19 @@ static void yield_task_fair(struct rq *rq)
  
         clear_buddies(cfs_rq, se);
  
-       if (curr->policy != SCHED_BATCH) {
-               update_rq_clock(rq);
-               /*
-                * Update run-time statistics of the 'current'.
-                */
-               update_curr(cfs_rq);
-               /*
-                * Tell update_rq_clock() that we've just updated,
-                * so we don't do microscopic update in schedule()
-                * and double the fastpath cost.
-                */
-               rq_clock_skip_update(rq);
-       }
+       update_rq_clock(rq);
+       /*
+        * Update run-time statistics of the 'current'.
+        */
+       update_curr(cfs_rq);
+       /*
+        * Tell update_rq_clock() that we've just updated,
+        * so we don't do microscopic update in schedule()
+        * and double the fastpath cost.
+        */
+       rq_clock_skip_update(rq);
  
-       set_skip_buddy(se);
+       se->deadline += calc_delta_fair(se->slice, se);
  }
  
  static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
@@ -8416,6 +8464,11 @@ enum group_type {
          */
         group_misfit_task,
         /*
+        * Balance SMT group that's fully busy. Can benefit from migrating
+        * a task on SMT with busy sibling to another CPU on idle core.
+        */
+       group_smt_balance,
+       /*
          * SD_ASYM_PACKING only: One local CPU with higher capacity is available,
          * and the task should be migrated to it instead of running on the
          * current CPU.
@@ -8496,8 +8549,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
          * Buddy candidates are cache hot:
          */
         if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
-                       (&p->se == cfs_rq_of(&p->se)->next ||
-                        &p->se == cfs_rq_of(&p->se)->last))
+           (&p->se == cfs_rq_of(&p->se)->next))
                 return 1;
  
         if (sysctl_sched_migration_cost == -1)
@@ -9123,6 +9175,7 @@ struct sg_lb_stats {
         unsigned int group_weight;
         enum group_type group_type;
         unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
+       unsigned int group_smt_balance;  /* Task on busy SMT be moved */
         unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
  #ifdef CONFIG_NUMA_BALANCING
         unsigned int nr_numa_running;
@@ -9396,6 +9449,9 @@ group_type group_classify(unsigned int imbalance_pct,
         if (sgs->group_asym_packing)
                 return group_asym_packing;
  
+       if (sgs->group_smt_balance)
+               return group_smt_balance;
+
         if (sgs->group_misfit_task_load)
                 return group_misfit_task;
  
@@ -9465,6 +9521,71 @@ sched_asym(struct lb_env *env, struct sd_lb_stats *sds,  struct sg_lb_stats *sgs
         return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu);
  }
  
+/* One group has more than one SMT CPU while the other group does not */
+static inline bool smt_vs_nonsmt_groups(struct sched_group *sg1,
+                                   struct sched_group *sg2)
+{
+       if (!sg1 || !sg2)
+               return false;
+
+       return (sg1->flags & SD_SHARE_CPUCAPACITY) !=
+               (sg2->flags & SD_SHARE_CPUCAPACITY);
+}
+
+static inline bool smt_balance(struct lb_env *env, struct sg_lb_stats *sgs,
+                              struct sched_group *group)
+{
+       if (env->idle == CPU_NOT_IDLE)
+               return false;
+
+       /*
+        * For SMT source group, it is better to move a task
+        * to a CPU that doesn't have multiple tasks sharing its CPU capacity.
+        * Note that if a group has a single SMT, SD_SHARE_CPUCAPACITY
+        * will not be on.
+        */
+       if (group->flags & SD_SHARE_CPUCAPACITY &&
+           sgs->sum_h_nr_running > 1)
+               return true;
+
+       return false;
+}
+
+static inline long sibling_imbalance(struct lb_env *env,
+                                   struct sd_lb_stats *sds,
+                                   struct sg_lb_stats *busiest,
+                                   struct sg_lb_stats *local)
+{
+       int ncores_busiest, ncores_local;
+       long imbalance;
+
+       if (env->idle == CPU_NOT_IDLE || !busiest->sum_nr_running)
+               return 0;
+
+       ncores_busiest = sds->busiest->cores;
+       ncores_local = sds->local->cores;
+
+       if (ncores_busiest == ncores_local) {
+               imbalance = busiest->sum_nr_running;
+               lsub_positive(&imbalance, local->sum_nr_running);
+               return imbalance;
+       }
+
+       /* Balance so the nr_running/ncores ratio is the same on both groups */
+       imbalance = ncores_local * busiest->sum_nr_running;
+       lsub_positive(&imbalance, ncores_busiest * local->sum_nr_running);
+       /* Normalize imbalance and do rounding on normalization */
+       imbalance = 2 * imbalance + ncores_local + ncores_busiest;
+       imbalance /= ncores_local + ncores_busiest;
+
+       /* Take advantage of resource in an empty sched group */
+       if (imbalance == 0 && local->sum_nr_running == 0 &&
+           busiest->sum_nr_running > 1)
+               imbalance = 2;
+
+       return imbalance;
+}
+
  static inline bool
  sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
  {
@@ -9557,6 +9678,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
                 sgs->group_asym_packing = 1;
         }
  
+       /* Check for loaded SMT group to be balanced to dst CPU */
+       if (!local_group && smt_balance(env, sgs, group))
+               sgs->group_smt_balance = 1;
+
         sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
  
         /* Computing avg_load makes sense only when group is overloaded */
@@ -9641,6 +9766,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
                         return false;
                 break;
  
+       case group_smt_balance:
         case group_fully_busy:
                 /*
                  * Select the fully busy group with highest avg_load. In
@@ -9670,6 +9796,18 @@ static bool update_sd_pick_busiest(struct lb_env *env,
  
         case group_has_spare:
                 /*
+                * Do not pick sg with SMT CPUs over sg with pure CPUs,
+                * as we do not want to pull task off SMT core with one task
+                * and make the core idle.
+                */
+               if (smt_vs_nonsmt_groups(sds->busiest, sg)) {
+                       if (sg->flags & SD_SHARE_CPUCAPACITY && sgs->sum_h_nr_running <= 1)
+                               return false;
+                       else
+                               return true;
+               }
+
+               /*
                  * Select not overloaded group with lowest number of idle cpus
                  * and highest number of running tasks. We could also compare
                  * the spare capacity which is more stable but it can end up
@@ -9865,6 +10003,7 @@ static bool update_pick_idlest(struct sched_group *idlest,
  
         case group_imbalanced:
         case group_asym_packing:
+       case group_smt_balance:
                 /* Those types are not used in the slow wakeup path */
                 return false;
  
@@ -9996,6 +10135,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
  
         case group_imbalanced:
         case group_asym_packing:
+       case group_smt_balance:
                 /* Those type are not used in the slow wakeup path */
                 return NULL;
  
@@ -10250,6 +10390,13 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
                 return;
         }
  
+       if (busiest->group_type == group_smt_balance) {
+               /* Reduce number of tasks sharing CPU capacity */
+               env->migration_type = migrate_task;
+               env->imbalance = 1;
+               return;
+       }
+
         if (busiest->group_type == group_imbalanced) {
                 /*
                  * In the group_imb case we cannot rely on group-wide averages
@@ -10297,14 +10444,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
                 }
  
                 if (busiest->group_weight == 1 || sds->prefer_sibling) {
-                       unsigned int nr_diff = busiest->sum_nr_running;
                         /*
                          * When prefer sibling, evenly spread running tasks on
                          * groups.
                          */
                         env->migration_type = migrate_task;
-                       lsub_positive(&nr_diff, local->sum_nr_running);
-                       env->imbalance = nr_diff;
+                       env->imbalance = sibling_imbalance(env, sds, busiest, local);
                 } else {
  
                         /*
@@ -10501,20 +10646,27 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
          * group's child domain.
          */
         if (sds.prefer_sibling && local->group_type == group_has_spare &&
-           busiest->sum_nr_running > local->sum_nr_running + 1)
+           sibling_imbalance(env, &sds, busiest, local) > 1)
                 goto force_balance;
  
         if (busiest->group_type != group_overloaded) {
-               if (env->idle == CPU_NOT_IDLE)
+               if (env->idle == CPU_NOT_IDLE) {
                         /*
                          * If the busiest group is not overloaded (and as a
                          * result the local one too) but this CPU is already
                          * busy, let another idle CPU try to pull task.
                          */
                         goto out_balanced;
+               }
+
+               if (busiest->group_type == group_smt_balance &&
+                   smt_vs_nonsmt_groups(sds.local, sds.busiest)) {
+                       /* Let a non-SMT CPU pull from an SMT CPU sharing with a sibling */
+                       goto force_balance;
+               }
  
                 if (busiest->group_weight > 1 &&
-                   local->idle_cpus <= (busiest->idle_cpus + 1))
+                   local->idle_cpus <= (busiest->idle_cpus + 1)) {
                         /*
                          * If the busiest group is not overloaded
                          * and there is no imbalance between this and busiest
@@ -10525,12 +10677,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
                          * there is more than 1 CPU per group.
                          */
                         goto out_balanced;
+               }
  
-               if (busiest->sum_h_nr_running == 1)
+               if (busiest->sum_h_nr_running == 1) {
                         /*
                          * busiest doesn't have any tasks waiting to run
                          */
                         goto out_balanced;
+               }
         }
  
  force_balance:
@@ -10764,7 +10918,7 @@ static int active_load_balance_cpu_stop(void *data);
  static int should_we_balance(struct lb_env *env)
  {
         struct sched_group *sg = env->sd->groups;
-       int cpu;
+       int cpu, idle_smt = -1;
  
         /*
          * Ensure the balancing environment is consistent; can happen
@@ -10791,10 +10945,24 @@ static int should_we_balance(struct lb_env *env)
                 if (!idle_cpu(cpu))
                         continue;
  
+               /*
+                * Don't balance to an idle SMT CPU in a busy core right away
+                * when balancing cores, but remember the first idle SMT CPU
+                * for later consideration.  Find a CPU on an idle core first.
+                */
+               if (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && !is_core_idle(cpu)) {
+                       if (idle_smt == -1)
+                               idle_smt = cpu;
+                       continue;
+               }
+
                 /* Are we the first idle CPU? */
                 return cpu == env->dst_cpu;
         }
  
+       if (idle_smt == env->dst_cpu)
+               return true;
+
         /* Are we the first CPU of this group ? */
         return group_balance_cpu(sg) == env->dst_cpu;
  }
@@ -12007,8 +12175,8 @@ static void rq_offline_fair(struct rq *rq)
  static inline bool
  __entity_slice_used(struct sched_entity *se, int min_nr_tasks)
  {
-       u64 slice = sched_slice(cfs_rq_of(se), se);
         u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+       u64 slice = se->slice;
  
         return (rtime * min_nr_tasks > slice);
  }
@@ -12164,8 +12332,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
   */
  static void task_fork_fair(struct task_struct *p)
  {
-       struct cfs_rq *cfs_rq;
         struct sched_entity *se = &p->se, *curr;
+       struct cfs_rq *cfs_rq;
         struct rq *rq = this_rq();
         struct rq_flags rf;
  
@@ -12174,22 +12342,9 @@ static void task_fork_fair(struct task_struct *p)
  
         cfs_rq = task_cfs_rq(current);
         curr = cfs_rq->curr;
-       if (curr) {
+       if (curr)
                 update_curr(cfs_rq);
-               se->vruntime = curr->vruntime;
-       }
-       place_entity(cfs_rq, se, 1);
-
-       if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
-               /*
-                * Upon rescheduling, sched_class::put_prev_task() will place
-                * 'current' within the tree based on its new key value.
-                */
-               swap(curr->vruntime, se->vruntime);
-               resched_curr(rq);
-       }
-
-       se->vruntime -= cfs_rq->min_vruntime;
+       place_entity(cfs_rq, se, ENQUEUE_INITIAL);
         rq_unlock(rq, &rf);
  }
  
@@ -12218,34 +12373,6 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
                 check_preempt_curr(rq, p, 0);
  }
  
-static inline bool vruntime_normalized(struct task_struct *p)
-{
-       struct sched_entity *se = &p->se;
-
-       /*
-        * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
-        * the dequeue_entity(.flags=0) will already have normalized the
-        * vruntime.
-        */
-       if (p->on_rq)
-               return true;
-
-       /*
-        * When !on_rq, vruntime of the task has usually NOT been normalized.
-        * But there are some cases where it has already been normalized:
-        *
-        * - A forked child which is waiting for being woken up by
-        *   wake_up_new_task().
-        * - A task which has been woken up by try_to_wake_up() and
-        *   waiting for actually being woken up by sched_ttwu_pending().
-        */
-       if (!se->sum_exec_runtime ||
-           (READ_ONCE(p->__state) == TASK_WAKING && p->sched_remote_wakeup))
-               return true;
-
-       return false;
-}
-
  #ifdef CONFIG_FAIR_GROUP_SCHED
  /*
   * Propagate the changes of the sched_entity across the tg tree to make it
@@ -12316,16 +12443,6 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
  static void detach_task_cfs_rq(struct task_struct *p)
  {
         struct sched_entity *se = &p->se;
-       struct cfs_rq *cfs_rq = cfs_rq_of(se);
-
-       if (!vruntime_normalized(p)) {
-               /*
-                * Fix up our vruntime so that the current sleep doesn't
-                * cause 'unlimited' sleep bonus.
-                */
-               place_entity(cfs_rq, se, 0);
-               se->vruntime -= cfs_rq->min_vruntime;
-       }
  
         detach_entity_cfs_rq(se);
  }
@@ -12333,12 +12450,8 @@ static void detach_task_cfs_rq(struct task_struct *p)
  static void attach_task_cfs_rq(struct task_struct *p)
  {
         struct sched_entity *se = &p->se;
-       struct cfs_rq *cfs_rq = cfs_rq_of(se);
  
         attach_entity_cfs_rq(se);
-
-       if (!vruntime_normalized(p))
-               se->vruntime += cfs_rq->min_vruntime;
  }
  
  static void switched_from_fair(struct rq *rq, struct task_struct *p)
@@ -12450,7 +12563,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
  
         tg->shares = NICE_0_LOAD;
  
-       init_cfs_bandwidth(tg_cfs_bandwidth(tg));
+       init_cfs_bandwidth(tg_cfs_bandwidth(tg), tg_cfs_bandwidth(parent));
  
         for_each_possible_cpu(i) {
                 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
@@ -12703,7 +12816,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
          * idle runqueue:
          */
         if (rq->cfs.load.weight)
-               rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
+               rr_interval = NS_TO_JIFFIES(se->slice);
  
         return rr_interval;
  }