tracing/boot: Fix a hist trigger dependency for boot time tracing

[platform/kernel/linux-rpi.git] / kernel / cpu.c
diff --git a/kernel/cpu.c b/kernel/cpu.c

index 6ff2578..67c2294 100644 (file)
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -32,6 +32,7 @@
  #include <linux/relay.h>
  #include <linux/slab.h>
  #include <linux/percpu-rwsem.h>
+#include <linux/cpuset.h>
  
  #include <trace/events/power.h>
  #define CREATE_TRACE_POINTS
@@ -814,7 +815,57 @@ void __init cpuhp_threads_init(void)
         kthread_unpark(this_cpu_read(cpuhp_state.thread));
  }
  
+/*
+ *
+ * Serialize hotplug trainwrecks outside of the cpu_hotplug_lock
+ * protected region.
+ *
+ * The operation is still serialized against concurrent CPU hotplug via
+ * cpu_add_remove_lock, i.e. CPU map protection.  But it is _not_
+ * serialized against other hotplug related activity like adding or
+ * removing of state callbacks and state instances, which invoke either the
+ * startup or the teardown callback of the affected state.
+ *
+ * This is required for subsystems which are unfixable vs. CPU hotplug and
+ * evade lock inversion problems by scheduling work which has to be
+ * completed _before_ cpu_up()/_cpu_down() returns.
+ *
+ * Don't even think about adding anything to this for any new code or even
+ * drivers. Its only purpose is to keep existing lock order trainwrecks
+ * working.
+ *
+ * For cpu_down() there might be valid reasons to finish cleanups which are
+ * not required to be done under cpu_hotplug_lock, but that's a different
+ * story and would not be invoked via this.
+ */
+static void cpu_up_down_serialize_trainwrecks(bool tasks_frozen)
+{
+       /*
+        * cpusets delegate hotplug operations to a worker to "solve" the
+        * lock order problems. Wait for the worker, but only if tasks are
+        * _not_ frozen (suspend, hibernate) as that would wait forever.
+        *
+        * The wait is required because otherwise the hotplug operation
+        * returns with inconsistent state, which could even be observed in
+        * user space when a new CPU is brought up. The CPU plug uevent
+        * would be delivered and user space reacting to it would fail to
+        * move tasks to the newly plugged CPU up to the point where the
+        * work has finished because up to that point the newly plugged CPU
+        * is not assignable in cpusets/cgroups. On unplug that's not
+        * necessarily a visible issue, but it is still inconsistent state,
+        * which is the real problem which needs to be "fixed". This can't
+        * prevent the transient state between scheduling the work and
+        * returning from waiting for it.
+        */
+       if (!tasks_frozen)
+               cpuset_wait_for_hotplug();
+}
+
  #ifdef CONFIG_HOTPLUG_CPU
+#ifndef arch_clear_mm_cpumask_cpu /* fallback, used only if not defined earlier */
+#define arch_clear_mm_cpumask_cpu(cpu, mm) cpumask_clear_cpu(cpu, mm_cpumask(mm))
+#endif
+
  /**
   * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU
   * @cpu: a CPU id
@@ -850,7 +901,7 @@ void clear_tasks_mm_cpumask(int cpu)
                 t = find_lock_task_mm(p);
                 if (!t)
                         continue;
-               cpumask_clear_cpu(cpu, mm_cpumask(t->mm));
+               arch_clear_mm_cpumask_cpu(cpu, t->mm);
                 task_unlock(t);
         }
         rcu_read_unlock();
@@ -1047,6 +1098,7 @@ out:
          */
         lockup_detector_cleanup();
         arch_smt_update();
+       cpu_up_down_serialize_trainwrecks(tasks_frozen);
         return ret;
  }
  
@@ -1243,6 +1295,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
  out:
         cpus_write_unlock();
         arch_smt_update();
+       cpu_up_down_serialize_trainwrecks(tasks_frozen);
         return ret;
  }