sched/fair: Move rebalance_domains()

author Peter Zijlstra <peterz@infradead.org>

Tue, 20 Feb 2018 09:58:39 +0000 (10:58 +0100)

committer Ingo Molnar <mingo@kernel.org>

Fri, 9 Mar 2018 06:59:23 +0000 (07:59 +0100)
author Peter Zijlstra <peterz@infradead.org>
Tue, 20 Feb 2018 09:58:39 +0000 (10:58 +0100)
committer Ingo Molnar <mingo@kernel.org>
Fri, 9 Mar 2018 06:59:23 +0000 (07:59 +0100)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index 5c35756..0da79d8 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9121,6 +9121,124 @@ out_unlock:
         return 0;
  }
  
+static DEFINE_SPINLOCK(balancing);
+
+/*
+ * Scale the max load_balance interval with the number of CPUs in the system.
+ * This trades load-balance latency on larger machines for less cross talk.
+ */
+void update_max_interval(void)
+{
+       max_load_balance_interval = HZ*num_online_cpus()/10;
+}
+
+/*
+ * It checks each scheduling domain to see if it is due to be balanced,
+ * and initiates a balancing operation if so.
+ *
+ * Balancing parameters are set up in init_sched_domains.
+ */
+static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
+{
+       int continue_balancing = 1;
+       int cpu = rq->cpu;
+       unsigned long interval;
+       struct sched_domain *sd;
+       /* Earliest time when we have to do rebalance again */
+       unsigned long next_balance = jiffies + 60*HZ;
+       int update_next_balance = 0;
+       int need_serialize, need_decay = 0;
+       u64 max_cost = 0;
+
+       rcu_read_lock();
+       for_each_domain(cpu, sd) {
+               /*
+                * Decay the newidle max times here because this is a regular
+                * visit to all the domains. Decay ~1% per second.
+                */
+               if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
+                       sd->max_newidle_lb_cost =
+                               (sd->max_newidle_lb_cost * 253) / 256;
+                       sd->next_decay_max_lb_cost = jiffies + HZ;
+                       need_decay = 1;
+               }
+               max_cost += sd->max_newidle_lb_cost;
+
+               if (!(sd->flags & SD_LOAD_BALANCE))
+                       continue;
+
+               /*
+                * Stop the load balance at this level. There is another
+                * CPU in our sched group which is doing load balancing more
+                * actively.
+                */
+               if (!continue_balancing) {
+                       if (need_decay)
+                               continue;
+                       break;
+               }
+
+               interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
+
+               need_serialize = sd->flags & SD_SERIALIZE;
+               if (need_serialize) {
+                       if (!spin_trylock(&balancing))
+                               goto out;
+               }
+
+               if (time_after_eq(jiffies, sd->last_balance + interval)) {
+                       if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
+                               /*
+                                * The LBF_DST_PINNED logic could have changed
+                                * env->dst_cpu, so we can't know our idle
+                                * state even if we migrated tasks. Update it.
+                                */
+                               idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
+                       }
+                       sd->last_balance = jiffies;
+                       interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
+               }
+               if (need_serialize)
+                       spin_unlock(&balancing);
+out:
+               if (time_after(next_balance, sd->last_balance + interval)) {
+                       next_balance = sd->last_balance + interval;
+                       update_next_balance = 1;
+               }
+       }
+       if (need_decay) {
+               /*
+                * Ensure the rq-wide value also decays but keep it at a
+                * reasonable floor to avoid funnies with rq->avg_idle.
+                */
+               rq->max_idle_balance_cost =
+                       max((u64)sysctl_sched_migration_cost, max_cost);
+       }
+       rcu_read_unlock();
+
+       /*
+        * next_balance will be updated only when there is a need.
+        * When the CPU is attached to null domain for ex, it will not be
+        * updated.
+        */
+       if (likely(update_next_balance)) {
+               rq->next_balance = next_balance;
+
+#ifdef CONFIG_NO_HZ_COMMON
+               /*
+                * If this CPU has been elected to perform the nohz idle
+                * balance. Other idle CPUs have already rebalanced with
+                * nohz_idle_balance() and nohz.next_balance has been
+                * updated accordingly. This CPU is now running the idle load
+                * balance for itself and we need to update the
+                * nohz.next_balance accordingly.
+                */
+               if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
+                       nohz.next_balance = rq->next_balance;
+#endif
+       }
+}
+
  static inline int on_null_domain(struct rq *rq)
  {
         return unlikely(!rcu_dereference_sched(rq->sd));
@@ -9373,124 +9491,6 @@ out:
  static inline void nohz_balancer_kick(struct rq *rq) { }
  #endif
  
-static DEFINE_SPINLOCK(balancing);
-
-/*
- * Scale the max load_balance interval with the number of CPUs in the system.
- * This trades load-balance latency on larger machines for less cross talk.
- */
-void update_max_interval(void)
-{
-       max_load_balance_interval = HZ*num_online_cpus()/10;
-}
-
-/*
- * It checks each scheduling domain to see if it is due to be balanced,
- * and initiates a balancing operation if so.
- *
- * Balancing parameters are set up in init_sched_domains.
- */
-static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
-{
-       int continue_balancing = 1;
-       int cpu = rq->cpu;
-       unsigned long interval;
-       struct sched_domain *sd;
-       /* Earliest time when we have to do rebalance again */
-       unsigned long next_balance = jiffies + 60*HZ;
-       int update_next_balance = 0;
-       int need_serialize, need_decay = 0;
-       u64 max_cost = 0;
-
-       rcu_read_lock();
-       for_each_domain(cpu, sd) {
-               /*
-                * Decay the newidle max times here because this is a regular
-                * visit to all the domains. Decay ~1% per second.
-                */
-               if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
-                       sd->max_newidle_lb_cost =
-                               (sd->max_newidle_lb_cost * 253) / 256;
-                       sd->next_decay_max_lb_cost = jiffies + HZ;
-                       need_decay = 1;
-               }
-               max_cost += sd->max_newidle_lb_cost;
-
-               if (!(sd->flags & SD_LOAD_BALANCE))
-                       continue;
-
-               /*
-                * Stop the load balance at this level. There is another
-                * CPU in our sched group which is doing load balancing more
-                * actively.
-                */
-               if (!continue_balancing) {
-                       if (need_decay)
-                               continue;
-                       break;
-               }
-
-               interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
-
-               need_serialize = sd->flags & SD_SERIALIZE;
-               if (need_serialize) {
-                       if (!spin_trylock(&balancing))
-                               goto out;
-               }
-
-               if (time_after_eq(jiffies, sd->last_balance + interval)) {
-                       if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
-                               /*
-                                * The LBF_DST_PINNED logic could have changed
-                                * env->dst_cpu, so we can't know our idle
-                                * state even if we migrated tasks. Update it.
-                                */
-                               idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
-                       }
-                       sd->last_balance = jiffies;
-                       interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
-               }
-               if (need_serialize)
-                       spin_unlock(&balancing);
-out:
-               if (time_after(next_balance, sd->last_balance + interval)) {
-                       next_balance = sd->last_balance + interval;
-                       update_next_balance = 1;
-               }
-       }
-       if (need_decay) {
-               /*
-                * Ensure the rq-wide value also decays but keep it at a
-                * reasonable floor to avoid funnies with rq->avg_idle.
-                */
-               rq->max_idle_balance_cost =
-                       max((u64)sysctl_sched_migration_cost, max_cost);
-       }
-       rcu_read_unlock();
-
-       /*
-        * next_balance will be updated only when there is a need.
-        * When the CPU is attached to null domain for ex, it will not be
-        * updated.
-        */
-       if (likely(update_next_balance)) {
-               rq->next_balance = next_balance;
-
-#ifdef CONFIG_NO_HZ_COMMON
-               /*
-                * If this CPU has been elected to perform the nohz idle
-                * balance. Other idle CPUs have already rebalanced with
-                * nohz_idle_balance() and nohz.next_balance has been
-                * updated accordingly. This CPU is now running the idle load
-                * balance for itself and we need to update the
-                * nohz.next_balance accordingly.
-                */
-               if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
-                       nohz.next_balance = rq->next_balance;
-#endif
-       }
-}
-
  #ifdef CONFIG_NO_HZ_COMMON
  /*
   * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
author	Peter Zijlstra <peterz@infradead.org>
	Tue, 20 Feb 2018 09:58:39 +0000 (10:58 +0100)
committer	Ingo Molnar <mingo@kernel.org>
	Fri, 9 Mar 2018 06:59:23 +0000 (07:59 +0100)