sched/core: move IO scheduling accounting from io_schedule_timeout() into scheduler

author Tejun Heo <tj@kernel.org>

Wed, 7 Dec 2016 20:48:41 +0000 (15:48 -0500)

committer Ingo Molnar <mingo@kernel.org>

Sat, 14 Jan 2017 10:30:03 +0000 (11:30 +0100)
author Tejun Heo <tj@kernel.org>
Wed, 7 Dec 2016 20:48:41 +0000 (15:48 -0500)
committer Ingo Molnar <mingo@kernel.org>
Sat, 14 Jan 2017 10:30:03 +0000 (11:30 +0100)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index 96a4267..9fd3716 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2089,11 +2089,24 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
         p->sched_contributes_to_load = !!task_contributes_to_load(p);
         p->state = TASK_WAKING;
  
+       if (p->in_iowait) {
+               delayacct_blkio_end();
+               atomic_dec(&task_rq(p)->nr_iowait);
+       }
+
         cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
         if (task_cpu(p) != cpu) {
                 wake_flags |= WF_MIGRATED;
                 set_task_cpu(p, cpu);
         }
+
+#else /* CONFIG_SMP */
+
+       if (p->in_iowait) {
+               delayacct_blkio_end();
+               atomic_dec(&task_rq(p)->nr_iowait);
+       }
+
  #endif /* CONFIG_SMP */
  
         ttwu_queue(p, cpu, wake_flags);
@@ -2143,8 +2156,13 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
  
         trace_sched_waking(p);
  
-       if (!task_on_rq_queued(p))
+       if (!task_on_rq_queued(p)) {
+               if (p->in_iowait) {
+                       delayacct_blkio_end();
+                       atomic_dec(&rq->nr_iowait);
+               }
                 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+       }
  
         ttwu_do_wakeup(rq, p, 0, rf);
         ttwu_stat(p, smp_processor_id(), 0);
@@ -2956,6 +2974,36 @@ unsigned long long nr_context_switches(void)
         return sum;
  }
  
+/*
+ * IO-wait accounting, and how it's mostly bollocks (on SMP).
+ *
+ * The idea behind IO-wait accounting is to account the idle time that we could
+ * have spent running if it were not for IO. That is, if we were to improve the
+ * storage performance, we'd have a proportional reduction in IO-wait time.
+ *
+ * This all works nicely on UP, where, when a task blocks on IO, we account
+ * idle time as IO-wait, because if the storage were faster, it could've been
+ * running and we'd not be idle.
+ *
+ * This has been extended to SMP, by doing the same for each CPU. This however
+ * is broken.
+ *
+ * Imagine for instance the case where two tasks block on one CPU, only the one
+ * CPU will have IO-wait accounted, while the other has regular idle. Even
+ * though, if the storage were faster, both could've run at the same time,
+ * utilising both CPUs.
+ *
+ * This means, that when looking globally, the current IO-wait accounting on
+ * SMP is a lower bound, by reason of under accounting.
+ *
+ * Worse, since the numbers are provided per CPU, they are sometimes
+ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly
+ * associated with any one particular CPU, it can wake to another CPU than it
+ * blocked on. This means the per CPU IO-wait number is meaningless.
+ *
+ * Task CPU affinities can make all that even more 'interesting'.
+ */
+
  unsigned long nr_iowait(void)
  {
         unsigned long i, sum = 0;
@@ -2966,6 +3014,13 @@ unsigned long nr_iowait(void)
         return sum;
  }
  
+/*
+ * Consumers of these two interfaces, like for example the cpufreq menu
+ * governor, are using nonsensical data: they boost the frequency of a CPU that
+ * has IO-wait, even though that CPU might not end up running the task when it
+ * does become runnable.
+ */
+
  unsigned long nr_iowait_cpu(int cpu)
  {
         struct rq *this = cpu_rq(cpu);
@@ -3377,6 +3432,11 @@ static void __sched notrace __schedule(bool preempt)
                         deactivate_task(rq, prev, DEQUEUE_SLEEP);
                         prev->on_rq = 0;
  
+                       if (prev->in_iowait) {
+                               atomic_inc(&rq->nr_iowait);
+                               delayacct_blkio_start();
+                       }
+
                         /*
                          * If a worker went to sleep, notify and ask workqueue
                          * whether it wants to wake up a task to maintain
@@ -5075,19 +5135,13 @@ EXPORT_SYMBOL_GPL(yield_to);
  long __sched io_schedule_timeout(long timeout)
  {
         int old_iowait = current->in_iowait;
-       struct rq *rq;
         long ret;
  
         current->in_iowait = 1;
         blk_schedule_flush_plug(current);
  
-       delayacct_blkio_start();
-       rq = raw_rq();
-       atomic_inc(&rq->nr_iowait);
         ret = schedule_timeout(timeout);
         current->in_iowait = old_iowait;
-       atomic_dec(&rq->nr_iowait);
-       delayacct_blkio_end();
  
         return ret;
  }
author	Tejun Heo <tj@kernel.org>
	Wed, 7 Dec 2016 20:48:41 +0000 (15:48 -0500)
committer	Ingo Molnar <mingo@kernel.org>
	Sat, 14 Jan 2017 10:30:03 +0000 (11:30 +0100)