sched/fair: Eliminate bandwidth race between throttling and distribution

author Paul Turner <pjt@google.com>

Fri, 10 Apr 2020 22:52:07 +0000 (15:52 -0700)

committer Peter Zijlstra <peterz@infradead.org>

Thu, 30 Apr 2020 18:14:38 +0000 (20:14 +0200)
author Paul Turner <pjt@google.com>
Fri, 10 Apr 2020 22:52:07 +0000 (15:52 -0700)
committer Peter Zijlstra <peterz@infradead.org>
Thu, 30 Apr 2020 18:14:38 +0000 (20:14 +0200)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index 02f323b..0c13a41 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4588,16 +4588,16 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
  }
  
  /* returns 0 on failure to allocate runtime */
-static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b,
+                                  struct cfs_rq *cfs_rq, u64 target_runtime)
  {
-       struct task_group *tg = cfs_rq->tg;
-       struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
-       u64 amount = 0, min_amount;
+       u64 min_amount, amount = 0;
+
+       lockdep_assert_held(&cfs_b->lock);
  
         /* note: this is a positive sum as runtime_remaining <= 0 */
-       min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
+       min_amount = target_runtime - cfs_rq->runtime_remaining;
  
-       raw_spin_lock(&cfs_b->lock);
         if (cfs_b->quota == RUNTIME_INF)
                 amount = min_amount;
         else {
@@ -4609,13 +4609,25 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
                         cfs_b->idle = 0;
                 }
         }
-       raw_spin_unlock(&cfs_b->lock);
  
         cfs_rq->runtime_remaining += amount;
  
         return cfs_rq->runtime_remaining > 0;
  }
  
+/* Takes cfs_b->lock, targets one slice; returns 0 on failure to allocate runtime */
+static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+       struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+       int ret;
+
+       raw_spin_lock(&cfs_b->lock);
+       ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice());
+       raw_spin_unlock(&cfs_b->lock);
+
+       return ret;
+}
+
  static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
  {
         /* dock delta_exec before expiring quota (as it could span periods) */
@@ -4704,13 +4716,33 @@ static int tg_throttle_down(struct task_group *tg, void *data)
         return 0;
  }
  
-static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
+static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
  {
         struct rq *rq = rq_of(cfs_rq);
         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
         struct sched_entity *se;
         long task_delta, idle_task_delta, dequeue = 1;
-       bool empty;
+
+       raw_spin_lock(&cfs_b->lock);
+       /* This will start the period timer if necessary */
+       if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) {
+               /*
+                * We have raced with bandwidth becoming available, and if we
+                * actually throttled the timer might not unthrottle us for an
+                * entire period. We additionally needed to make sure that any
+                * subsequent check_cfs_rq_runtime calls agree not to throttle
+                * us, as we may commit to do cfs put_prev+pick_next, so we ask
+                * for 1ns of runtime rather than just check cfs_b.
+                */
+               dequeue = 0;
+       } else {
+               list_add_tail_rcu(&cfs_rq->throttled_list,
+                                 &cfs_b->throttled_cfs_rq);
+       }
+       raw_spin_unlock(&cfs_b->lock);
+
+       if (!dequeue)
+               return false;  /* Throttle no longer required. */
  
         se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
  
@@ -4744,29 +4776,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
         if (!se)
                 sub_nr_running(rq, task_delta);
  
-       cfs_rq->throttled = 1;
-       cfs_rq->throttled_clock = rq_clock(rq);
-       raw_spin_lock(&cfs_b->lock);
-       empty = list_empty(&cfs_b->throttled_cfs_rq);
-
-       /*
-        * Add to the _head_ of the list, so that an already-started
-        * distribute_cfs_runtime will not see us. If disribute_cfs_runtime is
-        * not running add to the tail so that later runqueues don't get starved.
-        */
-       if (cfs_b->distribute_running)
-               list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
-       else
-               list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
-
         /*
-        * If we're the first throttled task, make sure the bandwidth
-        * timer is running.
+        * Note: distribution will already see us throttled via the
+        * throttled-list.  rq->lock protects completion.
          */
-       if (empty)
-               start_cfs_bandwidth(cfs_b);
-
-       raw_spin_unlock(&cfs_b->lock);
+       cfs_rq->throttled = 1;
+       cfs_rq->throttled_clock = rq_clock(rq);
+       return true;
  }
  
  void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
@@ -5121,8 +5137,7 @@ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
         if (cfs_rq_throttled(cfs_rq))
                 return true;
  
-       throttle_cfs_rq(cfs_rq);
-       return true;
+       return throttle_cfs_rq(cfs_rq);
  }
  
  static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
author	Paul Turner <pjt@google.com>
	Fri, 10 Apr 2020 22:52:07 +0000 (15:52 -0700)
committer	Peter Zijlstra <peterz@infradead.org>
	Thu, 30 Apr 2020 18:14:38 +0000 (20:14 +0200)