sched/numa: Initialise numa_next_scan properly
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0784ab6..0966f0c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -900,22 +900,11 @@ void task_numa_work(struct callback_head *work)
        if (p->flags & PF_EXITING)
                return;
 
-       /*
-        * We do not care about task placement until a task runs on a node
-        * other than the first one used by the address space. This is
-        * largely because migrations are driven by what CPU the task
-        * is running on. If it's never scheduled on another node, it'll
-        * not migrate so why bother trapping the fault.
-        */
-       if (mm->first_nid == NUMA_PTE_SCAN_INIT)
-               mm->first_nid = numa_node_id();
-       if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
-               /* Are we running on a new node yet? */
-               if (numa_node_id() == mm->first_nid &&
-                   !sched_feat_numa(NUMA_FORCE))
-                       return;
-
-               mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
+       if (!mm->numa_next_reset || !mm->numa_next_scan) {
+               mm->numa_next_scan = now +
+                       msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
+               mm->numa_next_reset = now +
+                       msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
        }
 
        /*
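
For orientation, a rough userspace sketch of what the new branch does: on the first pass through task_numa_work() for an address space both clocks are still zero, so the first scan is pushed out by the scan-delay sysctl and the periodic reset by its own sysctl. HZ, msecs_to_jiffies() and the default values below are stand-ins for illustration, not the kernel's definitions.

#include <stdio.h>

#define HZ 1000                                   /* assumed tick rate */
static unsigned long msecs_to_jiffies(unsigned int ms)
{
	return (unsigned long)ms * HZ / 1000;     /* simplified stand-in */
}

struct toy_mm {
	unsigned long numa_next_scan;
	unsigned long numa_next_reset;
};

int main(void)
{
	unsigned long now = 100000;               /* current jiffies, made up */
	unsigned int scan_delay_ms = 1000;        /* assumed sysctl default */
	unsigned int reset_ms = 60000;            /* assumed sysctl default */
	struct toy_mm mm = { 0, 0 };

	/* the first caller for this mm finds both clocks unset and arms them */
	if (!mm.numa_next_reset || !mm.numa_next_scan) {
		mm.numa_next_scan  = now + msecs_to_jiffies(scan_delay_ms);
		mm.numa_next_reset = now + msecs_to_jiffies(reset_ms);
	}
	printf("numa_next_scan=%lu numa_next_reset=%lu\n",
	       mm.numa_next_scan, mm.numa_next_reset);
	return 0;
}
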
@@ -946,12 +935,10 @@ void task_numa_work(struct callback_head *work)
                return;
 
        /*
-        * Do not set pte_numa if the current running node is rate-limited.
-        * This loses statistics on the fault but if we are unwilling to
-        * migrate to this node, it is less likely we can do useful work
+        * Delay this task enough that another task of this mm will likely win
+        * the next time around.
         */
-       if (migrate_ratelimited(numa_node_id()))
-               return;
+       p->node_stamp += 2 * TICK_NSEC;
 
        start = mm->numa_scan_offset;
        pages = sysctl_numa_balancing_scan_size;
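
The replacement for the rate-limit check is only a small bias: given the check in task_tick_numa() further down (now - node_stamp > period), pushing node_stamp two ticks ahead makes this thread reach its next scan window slightly after any sibling thread of the same mm. A toy with made-up units, not the kernel's TICK_NSEC or scan period:

#include <stdio.h>

int main(void)
{
	unsigned long long tick_nsec = 1000000ULL;      /* 1 ms tick, assumed */
	unsigned long long period = 100 * tick_nsec;    /* assumed scan period */
	unsigned long long stamp_a = 0, stamp_b = 0;    /* node_stamp of two threads */

	stamp_a += 2 * tick_nsec;   /* thread A just ran task_numa_work() */

	/* Each thread queues the next scan once "now - stamp > period",
	 * so B crosses the threshold two ticks earlier than A. */
	printf("A eligible after %llu ns, B after %llu ns\n",
	       stamp_a + period, stamp_b + period);
	return 0;
}
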
@@ -988,10 +975,10 @@ void task_numa_work(struct callback_head *work)
 
 out:
        /*
-        * It is possible to reach the end of the VMA list but the last few VMAs are
-        * not guaranteed to the vma_migratable. If they are not, we would find the
-        * !migratable VMA on the next scan but not reset the scanner to the start
-        * so check it now.
+        * It is possible to reach the end of the VMA list but the last few
+        * VMAs are not guaranteed to be vma_migratable. If they are not, we
+        * would find the !migratable VMA on the next scan but not reset the
+        * scanner to the start so check it now.
         */
        if (vma)
                mm->numa_scan_offset = start;
@@ -1026,7 +1013,7 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
        if (now - curr->node_stamp > period) {
                if (!curr->node_stamp)
                        curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
-               curr->node_stamp = now;
+               curr->node_stamp += period;
 
                if (!time_before(jiffies, curr->mm->numa_next_scan)) {
                        init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
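
Why "node_stamp += period" rather than "node_stamp = now": the tick only notices an expiry some time after it happened, and resetting the stamp to "now" folds that lateness into every subsequent period, so the scan cadence drifts. Adding the period keeps the deadlines locked to the original phase. A toy comparison with arbitrary units:

#include <stdio.h>

int main(void)
{
	const long period = 100, lateness = 3;   /* made-up numbers */
	long stamp_reset = 0, stamp_add = 0;
	int round;

	for (round = 1; round <= 3; round++) {
		/* the tick observes each expiry `lateness` units after the fact */
		stamp_reset = stamp_reset + period + lateness;  /* old: stamp = now */
		stamp_add += period;                            /* new: stamp += period */
		printf("round %d: stamp=now gives %ld, stamp+=period gives %ld\n",
		       round, stamp_reset, stamp_add);
	}
	return 0;
}
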
@@ -5396,6 +5383,7 @@ void idle_balance(int this_cpu, struct rq *this_rq)
        struct sched_domain *sd;
        int pulled_task = 0;
        unsigned long next_balance = jiffies + HZ;
+       u64 curr_cost = 0;
 
        this_rq->idle_stamp = rq_clock(this_rq);
 
@@ -5412,15 +5400,27 @@ void idle_balance(int this_cpu, struct rq *this_rq)
        for_each_domain(this_cpu, sd) {
                unsigned long interval;
                int continue_balancing = 1;
+               u64 t0, domain_cost;
 
                if (!(sd->flags & SD_LOAD_BALANCE))
                        continue;
 
+               if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
+                       break;
+
                if (sd->flags & SD_BALANCE_NEWIDLE) {
+                       t0 = sched_clock_cpu(this_cpu);
+
                        /* If we've pulled tasks over stop searching: */
                        pulled_task = load_balance(this_cpu, this_rq,
                                                   sd, CPU_NEWLY_IDLE,
                                                   &continue_balancing);
+
+                       domain_cost = sched_clock_cpu(this_cpu) - t0;
+                       if (domain_cost > sd->max_newidle_lb_cost)
+                               sd->max_newidle_lb_cost = domain_cost;
+
+                       curr_cost += domain_cost;
                }
 
                interval = msecs_to_jiffies(sd->balance_interval);
@@ -5442,6 +5442,9 @@ void idle_balance(int this_cpu, struct rq *this_rq)
                 */
                this_rq->next_balance = next_balance;
        }
+
+       if (curr_cost > this_rq->max_idle_balance_cost)
+               this_rq->max_idle_balance_cost = curr_cost;
 }
 
 /*
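
Putting the idle_balance() pieces together: each newidle attempt is timed, the worst case per domain is remembered in max_newidle_lb_cost, and the walk stops once the expected remaining cost would exceed the CPU's average idle time. A toy walk over three domains with invented numbers:

#include <stdio.h>

int main(void)
{
	unsigned long long avg_idle = 300000;    /* ns the CPU expects to stay idle, made up */
	unsigned long long max_cost[] = { 80000, 150000, 400000 };  /* per-domain worst cases */
	unsigned long long curr_cost = 0;
	int d;

	for (d = 0; d < 3; d++) {
		if (avg_idle < curr_cost + max_cost[d]) {
			printf("domain %d: skipped, balancing would outlast the idle window\n", d);
			break;
		}
		/* pretend this attempt costs exactly its recorded worst case */
		curr_cost += max_cost[d];
		printf("domain %d: balanced, curr_cost now %llu ns\n", d, curr_cost);
	}
	return 0;
}
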
@@ -5665,15 +5668,39 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
        /* Earliest time when we have to do rebalance again */
        unsigned long next_balance = jiffies + 60*HZ;
        int update_next_balance = 0;
-       int need_serialize;
+       int need_serialize, need_decay = 0;
+       u64 max_cost = 0;
 
        update_blocked_averages(cpu);
 
        rcu_read_lock();
        for_each_domain(cpu, sd) {
+               /*
+                * Decay the newidle max times here because this is a regular
+                * visit to all the domains. Decay ~1% per second.
+                */
+               if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
+                       sd->max_newidle_lb_cost =
+                               (sd->max_newidle_lb_cost * 253) / 256;
+                       sd->next_decay_max_lb_cost = jiffies + HZ;
+                       need_decay = 1;
+               }
+               max_cost += sd->max_newidle_lb_cost;
+
+
                if (!(sd->flags & SD_LOAD_BALANCE))
                        continue;
 
+               /*
+                * Stop the load balance at this level. There is another
+                * CPU in our sched group which is doing load balancing more
+                * actively.
+                */
+               if (!continue_balancing) {
+                       if (need_decay)
+                               continue;
+                       break;
+               }
+
                interval = sd->balance_interval;
                if (idle != CPU_IDLE)
                        interval *= sd->busy_factor;
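
The decay factor 253/256 is about 0.988, i.e. roughly 1.2% shaved off per step, with one step per second (jiffies + HZ), so a recorded maximum halves in about a minute. A quick check of that arithmetic:

#include <stdio.h>

int main(void)
{
	unsigned long long cost = 1000000;   /* a 1 ms recorded maximum, made up */
	int sec;

	for (sec = 1; sec <= 60; sec++) {
		cost = cost * 253 / 256;     /* one decay step per second */
		if (sec % 20 == 0)
			printf("after %2d s: %llu ns\n", sec, cost);
	}
	return 0;
}
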
@@ -5707,14 +5734,14 @@ out:
                        next_balance = sd->last_balance + interval;
                        update_next_balance = 1;
                }
-
+       }
+       if (need_decay) {
                /*
-                * Stop the load balance at this level. There is another
-                * CPU in our sched group which is doing load balancing more
-                * actively.
+                * Ensure the rq-wide value also decays but keep it at a
+                * reasonable floor to avoid funnies with rq->avg_idle.
                 */
-               if (!continue_balancing)
-                       break;
+               rq->max_idle_balance_cost =
+                       max((u64)sysctl_sched_migration_cost, max_cost);
        }
        rcu_read_unlock();
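
With the decay above, max_cost keeps shrinking on a mostly idle system, so without a floor rq->max_idle_balance_cost would dwindle toward zero; the max() keeps it no lower than sysctl_sched_migration_cost (0.5 ms on common configurations, treated as an assumption here). In isolation the clamp is just:

#include <stdio.h>

int main(void)
{
	unsigned long long migration_cost = 500000;  /* assumed sysctl_sched_migration_cost */
	unsigned long long max_cost = 180000;        /* decayed per-domain sum, made up */
	unsigned long long rq_max = max_cost > migration_cost ? max_cost : migration_cost;

	printf("rq->max_idle_balance_cost = %llu ns\n", rq_max);
	return 0;
}
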