sched/fair: Snapshot the min_vruntime of CPUs on force idle

author Joel Fernandes (Google) <joel@joelfernandes.org>

Tue, 17 Nov 2020 23:19:39 +0000 (18:19 -0500)

committer Peter Zijlstra <peterz@infradead.org>

Wed, 12 May 2021 09:43:29 +0000 (11:43 +0200)
author Joel Fernandes (Google) <joel@joelfernandes.org>
Tue, 17 Nov 2020 23:19:39 +0000 (18:19 -0500)
committer Peter Zijlstra <peterz@infradead.org>
Wed, 12 May 2021 09:43:29 +0000 (11:43 +0200)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index e506d9de16fcce252c7deb0e917bc912dd20b13c..e45c1d21b371417f6f1cc7b7d045f208c86df165 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -111,7 +111,7 @@ static inline int __task_prio(struct task_struct *p)
   */
  
  /* real prio, less is less */
-static inline bool prio_less(struct task_struct *a, struct task_struct *b)
+static inline bool prio_less(struct task_struct *a, struct task_struct *b, bool in_fi)
  {
  
         int pa = __task_prio(a), pb = __task_prio(b);
@@ -125,19 +125,8 @@ static inline bool prio_less(struct task_struct *a, struct task_struct *b)
         if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
                 return !dl_time_before(a->dl.deadline, b->dl.deadline);
  
-       if (pa == MAX_RT_PRIO + MAX_NICE)  { /* fair */
-               u64 vruntime = b->se.vruntime;
-
-               /*
-                * Normalize the vruntime if tasks are in different cpus.
-                */
-               if (task_cpu(a) != task_cpu(b)) {
-                       vruntime -= task_cfs_rq(b)->min_vruntime;
-                       vruntime += task_cfs_rq(a)->min_vruntime;
-               }
-
-               return !((s64)(a->se.vruntime - vruntime) <= 0);
-       }
+       if (pa == MAX_RT_PRIO + MAX_NICE)       /* fair */
+               return cfs_prio_less(a, b, in_fi);
  
         return false;
  }
@@ -151,7 +140,7 @@ static inline bool __sched_core_less(struct task_struct *a, struct task_struct *
                 return false;
  
         /* flip prio, so high prio is leftmost */
-       if (prio_less(b, a))
+       if (prio_less(b, a, task_rq(a)->core->core_forceidle))
                 return true;
  
         return false;
@@ -5350,7 +5339,7 @@ static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
   * - Else returns idle_task.
   */
  static struct task_struct *
-pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *max)
+pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *max, bool in_fi)
  {
         struct task_struct *class_pick, *cookie_pick;
         unsigned long cookie = rq->core->core_cookie;
@@ -5365,7 +5354,7 @@ pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *ma
                  * higher priority than max.
                  */
                 if (max && class_pick->core_cookie &&
-                   prio_less(class_pick, max))
+                   prio_less(class_pick, max, in_fi))
                         return idle_sched_class.pick_task(rq);
  
                 return class_pick;
@@ -5384,19 +5373,22 @@ pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *ma
          * the core (so far) and it must be selected, otherwise we must go with
          * the cookie pick in order to satisfy the constraint.
          */
-       if (prio_less(cookie_pick, class_pick) &&
-           (!max || prio_less(max, class_pick)))
+       if (prio_less(cookie_pick, class_pick, in_fi) &&
+           (!max || prio_less(max, class_pick, in_fi)))
                 return class_pick;
  
         return cookie_pick;
  }
  
+extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi);
+
  static struct task_struct *
  pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
  {
         struct task_struct *next, *max = NULL;
         const struct sched_class *class;
         const struct cpumask *smt_mask;
+       bool fi_before = false;
         bool need_sync;
         int i, j, cpu;
  
@@ -5478,9 +5470,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
  
                 if (!next->core_cookie) {
                         rq->core_pick = NULL;
+                       /*
+                        * For robustness, update the min_vruntime_fi for
+                        * unconstrained picks as well.
+                        */
+                       WARN_ON_ONCE(fi_before);
+                       task_vruntime_update(rq, next, false);
                         goto done;
                 }
-               need_sync = true;
         }
  
         for_each_cpu(i, smt_mask) {
@@ -5511,11 +5508,16 @@ again:
                          * highest priority task already selected for this
                          * core.
                          */
-                       p = pick_task(rq_i, class, max);
+                       p = pick_task(rq_i, class, max, fi_before);
                         if (!p)
                                 continue;
  
                         rq_i->core_pick = p;
+                       if (rq_i->idle == p && rq_i->nr_running) {
+                               rq->core->core_forceidle = true;
+                               if (!fi_before)
+                                       rq->core->core_forceidle_seq++;
+                       }
  
                         /*
                          * If this new candidate is of higher priority than the
@@ -5534,6 +5536,7 @@ again:
                                 max = p;
  
                                 if (old_max) {
+                                       rq->core->core_forceidle = false;
                                         for_each_cpu(j, smt_mask) {
                                                 if (j == i)
                                                         continue;
@@ -5574,10 +5577,16 @@ again:
                 if (!rq_i->core_pick)
                         continue;
  
-               if (is_task_rq_idle(rq_i->core_pick) && rq_i->nr_running &&
-                   !rq_i->core->core_forceidle) {
-                       rq_i->core->core_forceidle = true;
-               }
+               /*
+                * Update for new !FI->FI transitions, or if continuing to be in !FI:
+                * fi_before     fi      update?
+                *  0            0       1
+                *  0            1       1
+                *  1            0       1
+                *  1            1       0
+                */
+               if (!(fi_before && rq->core->core_forceidle))
+                       task_vruntime_update(rq_i, rq_i->core_pick, rq->core->core_forceidle);
  
                 if (i == cpu) {
                         rq_i->core_pick = NULL;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index 4d1ecab41e8043bf94c95b94f5be3234c9b4c65d..5948dc17b9ccbf267efbb4bb8a5ed9f0df82441c 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10801,6 +10801,81 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
             __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
                 resched_curr(rq);
  }
+
+/*
+ * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed.
+ */
+static void se_fi_update(struct sched_entity *se, unsigned int fi_seq, bool forceidle)
+{
+       for_each_sched_entity(se) {
+               struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+               if (forceidle) {
+                       if (cfs_rq->forceidle_seq == fi_seq)
+                               break;
+                       cfs_rq->forceidle_seq = fi_seq;
+               }
+
+               cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime;
+       }
+}
+
+void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi)
+{
+       struct sched_entity *se = &p->se;
+
+       if (p->sched_class != &fair_sched_class)
+               return;
+
+       se_fi_update(se, rq->core->core_forceidle_seq, in_fi);
+}
+
+bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool in_fi)
+{
+       struct rq *rq = task_rq(a);
+       struct sched_entity *sea = &a->se;
+       struct sched_entity *seb = &b->se;
+       struct cfs_rq *cfs_rqa;
+       struct cfs_rq *cfs_rqb;
+       s64 delta;
+
+       SCHED_WARN_ON(task_rq(b)->core != rq->core);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+       /*
+        * Find an se in the hierarchy for tasks a and b, such that the se's
+        * are immediate siblings.
+        */
+       while (sea->cfs_rq->tg != seb->cfs_rq->tg) {
+               int sea_depth = sea->depth;
+               int seb_depth = seb->depth;
+
+               if (sea_depth >= seb_depth)
+                       sea = parent_entity(sea);
+               if (sea_depth <= seb_depth)
+                       seb = parent_entity(seb);
+       }
+
+       se_fi_update(sea, rq->core->core_forceidle_seq, in_fi);
+       se_fi_update(seb, rq->core->core_forceidle_seq, in_fi);
+
+       cfs_rqa = sea->cfs_rq;
+       cfs_rqb = seb->cfs_rq;
+#else
+       cfs_rqa = &task_rq(a)->cfs;
+       cfs_rqb = &task_rq(b)->cfs;
+#endif
+
+       /*
+        * Find delta after normalizing se's vruntime with its cfs_rq's
+        * min_vruntime_fi, which would have been updated in prior calls
+        * to se_fi_update().
+        */
+       delta = (s64)(sea->vruntime - seb->vruntime) +
+               (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
+
+       return delta > 0;
+}
  #else
  static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
  #endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h

index db555143380d3b71bfc34cb3e32d9295641048b4..4a898abc60ce243c958eb1ab23b23492e00419d9 100644 (file)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -526,6 +526,11 @@ struct cfs_rq {
  
         u64                     exec_clock;
         u64                     min_vruntime;
+#ifdef CONFIG_SCHED_CORE
+       unsigned int            forceidle_seq;
+       u64                     min_vruntime_fi;
+#endif
+
  #ifndef CONFIG_64BIT
         u64                     min_vruntime_copy;
  #endif
@@ -1089,6 +1094,7 @@ struct rq {
         unsigned int            core_pick_seq;
         unsigned long           core_cookie;
         unsigned char           core_forceidle;
+       unsigned int            core_forceidle_seq;
  #endif
  };
  
@@ -1162,6 +1168,8 @@ static inline raw_spinlock_t *__rq_lockp(struct rq *rq)
         return &rq->__lock;
  }
  
+bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool fi);
+
  #else /* !CONFIG_SCHED_CORE */
  
  static inline bool sched_core_enabled(struct rq *rq)
author	Joel Fernandes (Google) <joel@joelfernandes.org>
	Tue, 17 Nov 2020 23:19:39 +0000 (18:19 -0500)
committer	Peter Zijlstra <peterz@infradead.org>
	Wed, 12 May 2021 09:43:29 +0000 (11:43 +0200)
kernel/sched/core.c		patch | blob | history
kernel/sched/fair.c		patch | blob | history
kernel/sched/sched.h		patch | blob | history