Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
author	Linus Torvalds <torvalds@linux-foundation.org>
Wed, 23 May 2012 01:27:32 +0000 (18:27 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
Wed, 23 May 2012 01:27:32 +0000 (18:27 -0700)
Pull scheduler changes from Ingo Molnar:
 "The biggest change is the cleanup/simplification of the load-balancer:
  instead of the current practice of architectures twiddling scheduler
  internal data structures and providing the scheduler domains in
  colorfully inconsistent ways, we now have generic scheduler code in
  kernel/sched/core.c:sched_init_numa() that looks at the architecture's
  node_distance() parameters and (while not fully trusting them) deduces a
  NUMA topology from them.

  This inevitably changes balancing behavior - hopefully for the better.

  There are various smaller optimizations, cleanups and fixlets as well"
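
The sketch below is a minimal userspace C illustration of that deduction
step (it is not the kernel code itself): given a node_distance() table, it
collects the unique distances seen from node 0 and turns each one into a
NUMA scheduling level, mirroring the deduplicating selection that
sched_init_numa() performs in the kernel/sched/core.c hunk further down in
this diff. The 4-node distance matrix is made up for the example.

/*
 * Userspace sketch of sched_init_numa()'s level deduction -- assumes,
 * as the kernel does, that row 0 of the distance table contains every
 * distance that occurs anywhere in the table.  The matrix is hypothetical.
 */
#include <stdio.h>

#define NR_NODES 4

static const int node_distance[NR_NODES][NR_NODES] = {
	{ 10, 20, 30, 30 },
	{ 20, 10, 30, 30 },
	{ 30, 30, 10, 20 },
	{ 30, 30, 20, 10 },
};

int main(void)
{
	int levels[NR_NODES];
	int nr_levels = 0;
	int curr = node_distance[0][0];		/* identity (local) distance */
	int i, j;

	for (;;) {
		/* find the smallest distance strictly greater than curr */
		int next = curr;

		for (j = 0; j < NR_NODES; j++) {
			int d = node_distance[0][j];

			if (d > curr && (next == curr || d < next))
				next = d;
		}
		if (next == curr)
			break;			/* no larger distance left */
		levels[nr_levels++] = next;
		curr = next;
	}

	for (i = 0; i < nr_levels; i++)
		printf("NUMA level %d: distance %d\n", i, levels[i]);

	return 0;
}

For the example matrix this prints two levels (distances 20 and 30); the
kernel then builds one cpumask per node and per level and appends a
sched_domain_topology_level for each, as the sched_init_numa() hunk shows.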

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched: Taint kernel with TAINT_WARN after sleep-in-atomic bug
  sched: Remove stale power aware scheduling remnants and dysfunctional knobs
  sched/debug: Fix printing large integers on 32-bit platforms
  sched/fair: Improve the ->group_imb logic
  sched/nohz: Fix rq->cpu_load[] calculations
  sched/numa: Don't scale the imbalance
  sched/fair: Revert sched-domain iteration breakage
  sched/x86: Rewrite set_cpu_sibling_map()
  sched/numa: Fix the new NUMA topology bits
  sched/numa: Rewrite the CONFIG_NUMA sched domain support
  sched/fair: Propagate 'struct lb_env' usage into find_busiest_group
  sched/fair: Add some serialization to the sched_domain load-balance walk
  sched/fair: Let minimally loaded cpu balance the group
  sched: Change rq->nr_running to unsigned int
  x86/numa: Check for nonsensical topologies on real hw as well
  x86/numa: Hard partition cpu topology masks on node boundaries
  x86/numa: Allow specifying node_distance() for numa=fake
  x86/sched: Make mwait_usable() heed to "idle=" kernel parameters properly
  sched: Update documentation and comments
  sched_rt: Avoid unnecessary dequeue and enqueue of pushable tasks in set_cpus_allowed_rt()

arch/x86/kernel/process.c
arch/x86/kernel/smpboot.c
include/linux/sched.h
include/linux/topology.h
kernel/sched/core.c

diff --combined arch/x86/kernel/process.c

  #include <asm/debugreg.h>
  #include <asm/nmi.h>
  
 +/*
 + * per-CPU TSS segments. Threads are completely 'soft' on Linux,
 + * no more per-task TSS's. The TSS size is kept cacheline-aligned
 + * so they are allowed to end up in the .data..cacheline_aligned
 + * section. Since TSS's are completely CPU-local, we want them
 + * on exact cacheline boundaries, to eliminate cacheline ping-pong.
 + */
 +DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
 +
  #ifdef CONFIG_X86_64
  static DEFINE_PER_CPU(unsigned char, is_idle);
  static ATOMIC_NOTIFIER_HEAD(idle_notifier);
@@@ -76,9 -67,10 +76,9 @@@ void free_thread_xstate(struct task_str
        fpu_free(&tsk->thread.fpu);
  }
  
 -void free_thread_info(struct thread_info *ti)
 +void arch_release_task_struct(struct task_struct *tsk)
  {
 -      free_thread_xstate(ti->task);
 -      free_pages((unsigned long)ti, THREAD_ORDER);
 +      free_thread_xstate(tsk);
  }
  
  void arch_task_cache_init(void)
@@@ -385,7 -377,7 +385,7 @@@ static inline void play_dead(void
  #ifdef CONFIG_X86_64
  void enter_idle(void)
  {
 -      percpu_write(is_idle, 1);
 +      this_cpu_write(is_idle, 1);
        atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
  }
  
@@@ -524,6 -516,26 +524,6 @@@ void stop_this_cpu(void *dummy
        }
  }
  
 -static void do_nothing(void *unused)
 -{
 -}
 -
 -/*
 - * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
 - * pm_idle and update to new pm_idle value. Required while changing pm_idle
 - * handler on SMP systems.
 - *
 - * Caller must have changed pm_idle to the new value before the call. Old
 - * pm_idle value will not be used by any CPU after the return of this function.
 - */
 -void cpu_idle_wait(void)
 -{
 -      smp_mb();
 -      /* kick all the CPUs so that they exit out of pm_idle */
 -      smp_call_function(do_nothing, NULL, 1);
 -}
 -EXPORT_SYMBOL_GPL(cpu_idle_wait);
 -
  /* Default MONITOR/MWAIT with no hints, used for default C1 state */
  static void mwait_idle(void)
  {
@@@ -582,9 -594,17 +582,17 @@@ int mwait_usable(const struct cpuinfo_x
  {
        u32 eax, ebx, ecx, edx;
  
+       /* Use mwait if idle=mwait boot option is given */
        if (boot_option_idle_override == IDLE_FORCE_MWAIT)
                return 1;
  
+       /*
+        * Any idle= boot option other than idle=mwait means that we must not
+        * use mwait. Eg: idle=halt or idle=poll or idle=nomwait
+        */
+       if (boot_option_idle_override != IDLE_NO_OVERRIDE)
+               return 0;
        if (c->cpuid_level < MWAIT_INFO)
                return 0;
  
diff --combined arch/x86/kernel/smpboot.c

  /* State of each CPU */
  DEFINE_PER_CPU(int, cpu_state) = { 0 };
  
 -/* Store all idle threads, this can be reused instead of creating
 -* a new thread. Also avoids complicated thread destroy functionality
 -* for idle threads.
 -*/
  #ifdef CONFIG_HOTPLUG_CPU
  /*
 - * Needed only for CONFIG_HOTPLUG_CPU because __cpuinitdata is
 - * removed after init for !CONFIG_HOTPLUG_CPU.
 - */
 -static DEFINE_PER_CPU(struct task_struct *, idle_thread_array);
 -#define get_idle_for_cpu(x)      (per_cpu(idle_thread_array, x))
 -#define set_idle_for_cpu(x, p)   (per_cpu(idle_thread_array, x) = (p))
 -
 -/*
   * We need this for trampoline_base protection from concurrent accesses when
   * off- and onlining cores wildly.
   */
@@@ -85,16 -97,20 +85,16 @@@ static DEFINE_MUTEX(x86_cpu_hotplug_dri
  
  void cpu_hotplug_driver_lock(void)
  {
 -        mutex_lock(&x86_cpu_hotplug_driver_mutex);
 +      mutex_lock(&x86_cpu_hotplug_driver_mutex);
  }
  
  void cpu_hotplug_driver_unlock(void)
  {
 -        mutex_unlock(&x86_cpu_hotplug_driver_mutex);
 +      mutex_unlock(&x86_cpu_hotplug_driver_mutex);
  }
  
  ssize_t arch_cpu_probe(const char *buf, size_t count) { return -1; }
  ssize_t arch_cpu_release(const char *buf, size_t count) { return -1; }
 -#else
 -static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
 -#define get_idle_for_cpu(x)      (idle_thread_array[(x)])
 -#define set_idle_for_cpu(x, p)   (idle_thread_array[(x)] = (p))
  #endif
  
  /* Number of siblings per CPU package */
@@@ -299,59 -315,90 +299,90 @@@ void __cpuinit smp_store_cpu_info(int i
                identify_secondary_cpu(c);
  }
  
- static void __cpuinit link_thread_siblings(int cpu1, int cpu2)
+ static bool __cpuinit
+ topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name)
  {
-       cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2));
-       cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1));
-       cpumask_set_cpu(cpu1, cpu_core_mask(cpu2));
-       cpumask_set_cpu(cpu2, cpu_core_mask(cpu1));
-       cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2));
-       cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1));
+       int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+       return !WARN_ONCE(cpu_to_node(cpu1) != cpu_to_node(cpu2),
+               "sched: CPU #%d's %s-sibling CPU #%d is not on the same node! "
+               "[node: %d != %d]. Ignoring dependency.\n",
+               cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2));
+ }
+ #define link_mask(_m, c1, c2)                                         \
+ do {                                                                  \
+       cpumask_set_cpu((c1), cpu_##_m##_mask(c2));                     \
+       cpumask_set_cpu((c2), cpu_##_m##_mask(c1));                     \
+ } while (0)
+ static bool __cpuinit match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+ {
+       if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
+               int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+               if (c->phys_proc_id == o->phys_proc_id &&
+                   per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2) &&
+                   c->compute_unit_id == o->compute_unit_id)
+                       return topology_sane(c, o, "smt");
+       } else if (c->phys_proc_id == o->phys_proc_id &&
+                  c->cpu_core_id == o->cpu_core_id) {
+               return topology_sane(c, o, "smt");
+       }
+       return false;
+ }
+ static bool __cpuinit match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+ {
+       int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+       if (per_cpu(cpu_llc_id, cpu1) != BAD_APICID &&
+           per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2))
+               return topology_sane(c, o, "llc");
+       return false;
  }
  
+ static bool __cpuinit match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+ {
+       if (c->phys_proc_id == o->phys_proc_id)
+               return topology_sane(c, o, "mc");
+       return false;
+ }
  
  void __cpuinit set_cpu_sibling_map(int cpu)
  {
-       int i;
+       bool has_mc = boot_cpu_data.x86_max_cores > 1;
+       bool has_smt = smp_num_siblings > 1;
        struct cpuinfo_x86 *c = &cpu_data(cpu);
+       struct cpuinfo_x86 *o;
+       int i;
  
        cpumask_set_cpu(cpu, cpu_sibling_setup_mask);
  
-       if (smp_num_siblings > 1) {
-               for_each_cpu(i, cpu_sibling_setup_mask) {
-                       struct cpuinfo_x86 *o = &cpu_data(i);
-                       if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
-                               if (c->phys_proc_id == o->phys_proc_id &&
-                                   per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i) &&
-                                   c->compute_unit_id == o->compute_unit_id)
-                                       link_thread_siblings(cpu, i);
-                       } else if (c->phys_proc_id == o->phys_proc_id &&
-                                  c->cpu_core_id == o->cpu_core_id) {
-                               link_thread_siblings(cpu, i);
-                       }
-               }
-       } else {
+       if (!has_smt && !has_mc) {
                cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
-       }
-       cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
-       if (__this_cpu_read(cpu_info.x86_max_cores) == 1) {
-               cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu));
+               cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
+               cpumask_set_cpu(cpu, cpu_core_mask(cpu));
                c->booted_cores = 1;
                return;
        }
  
        for_each_cpu(i, cpu_sibling_setup_mask) {
-               if (per_cpu(cpu_llc_id, cpu) != BAD_APICID &&
-                   per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
-                       cpumask_set_cpu(i, cpu_llc_shared_mask(cpu));
-                       cpumask_set_cpu(cpu, cpu_llc_shared_mask(i));
-               }
-               if (c->phys_proc_id == cpu_data(i).phys_proc_id) {
-                       cpumask_set_cpu(i, cpu_core_mask(cpu));
-                       cpumask_set_cpu(cpu, cpu_core_mask(i));
+               o = &cpu_data(i);
+               if ((i == cpu) || (has_smt && match_smt(c, o)))
+                       link_mask(sibling, cpu, i);
+               if ((i == cpu) || (has_mc && match_llc(c, o)))
+                       link_mask(llc_shared, cpu, i);
+               if ((i == cpu) || (has_mc && match_mc(c, o))) {
+                       link_mask(core, cpu, i);
                        /*
                         *  Does this new cpu bringup a new core?
                         */
@@@ -382,8 -429,7 +413,7 @@@ const struct cpumask *cpu_coregroup_mas
         * For perf, we return last level cache shared map.
         * And for power savings, we return cpu_core_map
         */
-       if ((sched_mc_power_savings || sched_smt_power_savings) &&
-           !(cpu_has(c, X86_FEATURE_AMD_DCM)))
+       if (!(cpu_has(c, X86_FEATURE_AMD_DCM)))
                return cpu_core_mask(cpu);
        else
                return cpu_llc_shared_mask(cpu);
@@@ -602,6 -648,22 +632,6 @@@ wakeup_secondary_cpu_via_init(int phys_
        return (send_status | accept_status);
  }
  
 -struct create_idle {
 -      struct work_struct work;
 -      struct task_struct *idle;
 -      struct completion done;
 -      int cpu;
 -};
 -
 -static void __cpuinit do_fork_idle(struct work_struct *work)
 -{
 -      struct create_idle *c_idle =
 -              container_of(work, struct create_idle, work);
 -
 -      c_idle->idle = fork_idle(c_idle->cpu);
 -      complete(&c_idle->done);
 -}
 -
  /* reduce the number of lines printed when booting a large cpu count system */
  static void __cpuinit announce_cpu(int cpu, int apicid)
  {
   * Returns zero if CPU booted OK, else error code from
   * ->wakeup_secondary_cpu.
   */
 -static int __cpuinit do_boot_cpu(int apicid, int cpu)
 +static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
  {
        unsigned long boot_error = 0;
        unsigned long start_ip;
        int timeout;
 -      struct create_idle c_idle = {
 -              .cpu    = cpu,
 -              .done   = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
 -      };
 -
 -      INIT_WORK_ONSTACK(&c_idle.work, do_fork_idle);
  
        alternatives_smp_switch(1);
  
 -      c_idle.idle = get_idle_for_cpu(cpu);
 -
 -      /*
 -       * We can't use kernel_thread since we must avoid to
 -       * reschedule the child.
 -       */
 -      if (c_idle.idle) {
 -              c_idle.idle->thread.sp = (unsigned long) (((struct pt_regs *)
 -                      (THREAD_SIZE +  task_stack_page(c_idle.idle))) - 1);
 -              init_idle(c_idle.idle, cpu);
 -              goto do_rest;
 -      }
 +      idle->thread.sp = (unsigned long) (((struct pt_regs *)
 +                        (THREAD_SIZE +  task_stack_page(idle))) - 1);
 +      per_cpu(current_task, cpu) = idle;
  
 -      schedule_work(&c_idle.work);
 -      wait_for_completion(&c_idle.done);
 -
 -      if (IS_ERR(c_idle.idle)) {
 -              printk("failed fork for CPU %d\n", cpu);
 -              destroy_work_on_stack(&c_idle.work);
 -              return PTR_ERR(c_idle.idle);
 -      }
 -
 -      set_idle_for_cpu(cpu, c_idle.idle);
 -do_rest:
 -      per_cpu(current_task, cpu) = c_idle.idle;
  #ifdef CONFIG_X86_32
        /* Stack for startup_32 can be just as for start_secondary onwards */
        irq_ctx_init(cpu);
  #else
 -      clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
 +      clear_tsk_thread_flag(idle, TIF_FORK);
        initial_gs = per_cpu_offset(cpu);
        per_cpu(kernel_stack, cpu) =
 -              (unsigned long)task_stack_page(c_idle.idle) -
 +              (unsigned long)task_stack_page(idle) -
                KERNEL_STACK_OFFSET + THREAD_SIZE;
  #endif
        early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
        initial_code = (unsigned long)start_secondary;
 -      stack_start  = c_idle.idle->thread.sp;
 +      stack_start  = idle->thread.sp;
  
        /* start_ip had better be page-aligned! */
        start_ip = trampoline_address();
                 */
                smpboot_restore_warm_reset_vector();
        }
 -
 -      destroy_work_on_stack(&c_idle.work);
        return boot_error;
  }
  
 -int __cpuinit native_cpu_up(unsigned int cpu)
 +int __cpuinit native_cpu_up(unsigned int cpu, struct task_struct *tidle)
  {
        int apicid = apic->cpu_present_to_apicid(cpu);
        unsigned long flags;
  
        per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
  
 -      err = do_boot_cpu(apicid, cpu);
 +      err = do_boot_cpu(apicid, cpu, tidle);
        if (err) {
                pr_debug("do_boot_cpu failed %d\n", err);
                return -EIO;
diff --combined include/linux/sched.h
@@@ -855,61 -855,14 +855,14 @@@ enum cpu_idle_type 
  #define SD_WAKE_AFFINE                0x0020  /* Wake task to waking CPU */
  #define SD_PREFER_LOCAL               0x0040  /* Prefer to keep tasks local to this domain */
  #define SD_SHARE_CPUPOWER     0x0080  /* Domain members share cpu power */
- #define SD_POWERSAVINGS_BALANCE       0x0100  /* Balance for power savings */
  #define SD_SHARE_PKG_RESOURCES        0x0200  /* Domain members share cpu pkg resources */
  #define SD_SERIALIZE          0x0400  /* Only a single load balancing instance */
  #define SD_ASYM_PACKING               0x0800  /* Place busy groups earlier in the domain */
  #define SD_PREFER_SIBLING     0x1000  /* Prefer to place tasks in a sibling domain */
  #define SD_OVERLAP            0x2000  /* sched_domains of this level overlap */
  
- enum powersavings_balance_level {
-       POWERSAVINGS_BALANCE_NONE = 0,  /* No power saving load balance */
-       POWERSAVINGS_BALANCE_BASIC,     /* Fill one thread/core/package
-                                        * first for long running threads
-                                        */
-       POWERSAVINGS_BALANCE_WAKEUP,    /* Also bias task wakeups to semi-idle
-                                        * cpu package for power savings
-                                        */
-       MAX_POWERSAVINGS_BALANCE_LEVELS
- };
- extern int sched_mc_power_savings, sched_smt_power_savings;
- static inline int sd_balance_for_mc_power(void)
- {
-       if (sched_smt_power_savings)
-               return SD_POWERSAVINGS_BALANCE;
-       if (!sched_mc_power_savings)
-               return SD_PREFER_SIBLING;
-       return 0;
- }
- static inline int sd_balance_for_package_power(void)
- {
-       if (sched_mc_power_savings | sched_smt_power_savings)
-               return SD_POWERSAVINGS_BALANCE;
-       return SD_PREFER_SIBLING;
- }
  extern int __weak arch_sd_sibiling_asym_packing(void);
  
- /*
-  * Optimise SD flags for power savings:
-  * SD_BALANCE_NEWIDLE helps aggressive task consolidation and power savings.
-  * Keep default SD flags if sched_{smt,mc}_power_saving=0
-  */
- static inline int sd_power_saving_flags(void)
- {
-       if (sched_mc_power_savings | sched_smt_power_savings)
-               return SD_BALANCE_NEWIDLE;
-       return 0;
- }
  struct sched_group_power {
        atomic_t ref;
        /*
@@@ -1341,8 -1294,6 +1294,8 @@@ struct task_struct 
                                 * execve */
        unsigned in_iowait:1;
  
 +      /* task may not gain privileges */
 +      unsigned no_new_privs:1;
  
        /* Revert to default priority/policy when forking */
        unsigned sched_reset_on_fork:1;
        uid_t loginuid;
        unsigned int sessionid;
  #endif
 -      seccomp_t seccomp;
 +      struct seccomp seccomp;
  
  /* Thread group tracking */
        u32 parent_exec_id;
@@@ -1907,22 -1858,12 +1860,22 @@@ static inline void rcu_copy_process(str
        INIT_LIST_HEAD(&p->rcu_node_entry);
  }
  
 +static inline void rcu_switch_from(struct task_struct *prev)
 +{
 +      if (prev->rcu_read_lock_nesting != 0)
 +              rcu_preempt_note_context_switch();
 +}
 +
  #else
  
  static inline void rcu_copy_process(struct task_struct *p)
  {
  }
  
 +static inline void rcu_switch_from(struct task_struct *prev)
 +{
 +}
 +
  #endif
  
  #ifdef CONFIG_SMP
@@@ -1962,7 -1903,7 +1915,7 @@@ static inline int set_cpus_allowed(stru
   */
  extern unsigned long long notrace sched_clock(void);
  /*
-  * See the comment in kernel/sched_clock.c
+  * See the comment in kernel/sched/clock.c
   */
  extern u64 cpu_clock(int cpu);
  extern u64 local_clock(void);
diff --combined include/linux/topology.h
@@@ -70,7 -70,6 +70,6 @@@ int arch_update_cpu_topology(void)
   * Below are the 3 major initializers used in building sched_domains:
   * SD_SIBLING_INIT, for SMT domains
   * SD_CPU_INIT, for SMP domains
-  * SD_NODE_INIT, for NUMA domains
   *
   * Any architecture that cares to do any tuning to these values should do so
   * by defining their own arch-specific initializer in include/asm/topology.h.
@@@ -99,7 -98,6 +98,6 @@@
                                | 0*SD_BALANCE_WAKE                     \
                                | 1*SD_WAKE_AFFINE                      \
                                | 1*SD_SHARE_CPUPOWER                   \
-                               | 0*SD_POWERSAVINGS_BALANCE             \
                                | 1*SD_SHARE_PKG_RESOURCES              \
                                | 0*SD_SERIALIZE                        \
                                | 0*SD_PREFER_SIBLING                   \
                                | 0*SD_SHARE_CPUPOWER                   \
                                | 1*SD_SHARE_PKG_RESOURCES              \
                                | 0*SD_SERIALIZE                        \
-                               | sd_balance_for_mc_power()             \
-                               | sd_power_saving_flags()               \
                                ,                                       \
        .last_balance           = jiffies,                              \
        .balance_interval       = 1,                                    \
                                | 0*SD_SHARE_CPUPOWER                   \
                                | 0*SD_SHARE_PKG_RESOURCES              \
                                | 0*SD_SERIALIZE                        \
-                               | sd_balance_for_package_power()        \
-                               | sd_power_saving_flags()               \
                                ,                                       \
        .last_balance           = jiffies,                              \
        .balance_interval       = 1,                                    \
  }
  #endif
  
- /* sched_domains SD_ALLNODES_INIT for NUMA machines */
- #define SD_ALLNODES_INIT (struct sched_domain) {                      \
-       .min_interval           = 64,                                   \
-       .max_interval           = 64*num_online_cpus(),                 \
-       .busy_factor            = 128,                                  \
-       .imbalance_pct          = 133,                                  \
-       .cache_nice_tries       = 1,                                    \
-       .busy_idx               = 3,                                    \
-       .idle_idx               = 3,                                    \
-       .flags                  = 1*SD_LOAD_BALANCE                     \
-                               | 1*SD_BALANCE_NEWIDLE                  \
-                               | 0*SD_BALANCE_EXEC                     \
-                               | 0*SD_BALANCE_FORK                     \
-                               | 0*SD_BALANCE_WAKE                     \
-                               | 0*SD_WAKE_AFFINE                      \
-                               | 0*SD_SHARE_CPUPOWER                   \
-                               | 0*SD_POWERSAVINGS_BALANCE             \
-                               | 0*SD_SHARE_PKG_RESOURCES              \
-                               | 1*SD_SERIALIZE                        \
-                               | 0*SD_PREFER_SIBLING                   \
-                               ,                                       \
-       .last_balance           = jiffies,                              \
-       .balance_interval       = 64,                                   \
- }
- #ifndef SD_NODES_PER_DOMAIN
- #define SD_NODES_PER_DOMAIN 16
- #endif
  #ifdef CONFIG_SCHED_BOOK
  #ifndef SD_BOOK_INIT
  #error Please define an appropriate SD_BOOK_INIT in include/asm/topology.h!!!
  #endif
  #endif /* CONFIG_SCHED_BOOK */
  
- #ifdef CONFIG_NUMA
- #ifndef SD_NODE_INIT
- #error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!!
- #endif
- #endif /* CONFIG_NUMA */
  #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
  DECLARE_PER_CPU(int, numa_node);
  
@@@ -239,7 -197,7 +197,7 @@@ static inline int cpu_to_node(int cpu
  #ifndef set_numa_node
  static inline void set_numa_node(int node)
  {
 -      percpu_write(numa_node, node);
 +      this_cpu_write(numa_node, node);
  }
  #endif
  
@@@ -274,7 -232,7 +232,7 @@@ DECLARE_PER_CPU(int, _numa_mem_)
  #ifndef set_numa_mem
  static inline void set_numa_mem(int node)
  {
 -      percpu_write(_numa_mem_, node);
 +      this_cpu_write(_numa_mem_, node);
  }
  #endif
  
diff --combined kernel/sched/core.c
@@@ -83,7 -83,6 +83,7 @@@
  
  #include "sched.h"
  #include "../workqueue_sched.h"
 +#include "../smpboot.h"
  
  #define CREATE_TRACE_POINTS
  #include <trace/events/sched.h>
@@@ -693,8 -692,6 +693,6 @@@ int tg_nop(struct task_group *tg, void 
  }
  #endif
  
- void update_cpu_load(struct rq *this_rq);
  static void set_load_weight(struct task_struct *p)
  {
        int prio = p->static_prio - MAX_RT_PRIO;
@@@ -1914,7 -1911,7 +1912,7 @@@ prepare_task_switch(struct rq *rq, stru
                    struct task_struct *next)
  {
        sched_info_switch(prev, next);
 -      perf_event_task_sched_out(prev, next);
 +      perf_event_task_sched(prev, next);
        fire_sched_out_preempt_notifiers(prev, next);
        prepare_lock_switch(rq, next);
        prepare_arch_switch(next);
@@@ -1957,6 -1954,13 +1955,6 @@@ static void finish_task_switch(struct r
         */
        prev_state = prev->state;
        finish_arch_switch(prev);
 -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
 -      local_irq_disable();
 -#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
 -      perf_event_task_sched_in(prev, current);
 -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
 -      local_irq_enable();
 -#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
        finish_lock_switch(rq, prev);
        finish_arch_post_lock_switch();
  
@@@ -2077,7 -2081,6 +2075,7 @@@ context_switch(struct rq *rq, struct ta
  #endif
  
        /* Here we just switch the register state and the stack. */
 +      rcu_switch_from(prev);
        switch_to(prev, next, prev);
  
        barrier();
@@@ -2481,22 -2484,13 +2479,13 @@@ decay_load_missed(unsigned long load, u
   * scheduler tick (TICK_NSEC). With tickless idle this will not be called
   * every tick. We fix it up based on jiffies.
   */
- void update_cpu_load(struct rq *this_rq)
+ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
+                             unsigned long pending_updates)
  {
-       unsigned long this_load = this_rq->load.weight;
-       unsigned long curr_jiffies = jiffies;
-       unsigned long pending_updates;
        int i, scale;
  
        this_rq->nr_load_updates++;
  
-       /* Avoid repeated calls on same jiffy, when moving in and out of idle */
-       if (curr_jiffies == this_rq->last_load_update_tick)
-               return;
-       pending_updates = curr_jiffies - this_rq->last_load_update_tick;
-       this_rq->last_load_update_tick = curr_jiffies;
        /* Update our load: */
        this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
        for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
        sched_avg_update(this_rq);
  }
  
+ /*
+  * Called from nohz_idle_balance() to update the load ratings before doing the
+  * idle balance.
+  */
+ void update_idle_cpu_load(struct rq *this_rq)
+ {
+       unsigned long curr_jiffies = jiffies;
+       unsigned long load = this_rq->load.weight;
+       unsigned long pending_updates;
+       /*
+        * Bloody broken means of dealing with nohz, but better than nothing..
+        * jiffies is updated by one cpu, another cpu can drift wrt the jiffy
+        * update and see 0 difference the one time and 2 the next, even though
+        * we ticked at roughly the same rate.
+        *
+        * Hence we only use this from nohz_idle_balance() and skip this
+        * nonsense when called from the scheduler_tick() since that's
+        * guaranteed a stable rate.
+        */
+       if (load || curr_jiffies == this_rq->last_load_update_tick)
+               return;
+       pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+       this_rq->last_load_update_tick = curr_jiffies;
+       __update_cpu_load(this_rq, load, pending_updates);
+ }
+ /*
+  * Called from scheduler_tick()
+  */
  static void update_cpu_load_active(struct rq *this_rq)
  {
-       update_cpu_load(this_rq);
+       /*
+        * See the mess in update_idle_cpu_load().
+        */
+       this_rq->last_load_update_tick = jiffies;
+       __update_cpu_load(this_rq, this_rq->load.weight, 1);
  
        calc_load_account_active(this_rq);
  }
@@@ -3108,6 -3138,7 +3133,7 @@@ static noinline void __schedule_bug(str
        if (irqs_disabled())
                print_irqtrace_events(prev);
        dump_stack();
+       add_taint(TAINT_WARN);
  }
  
  /*
@@@ -5555,7 -5586,8 +5581,8 @@@ static int sched_domain_debug_one(struc
                        break;
                }
  
-               if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
+               if (!(sd->flags & SD_OVERLAP) &&
+                   cpumask_intersects(groupmask, sched_group_cpus(group))) {
                        printk(KERN_CONT "\n");
                        printk(KERN_ERR "ERROR: repeated CPUs\n");
                        break;
@@@ -5893,99 -5925,11 +5920,11 @@@ static int __init isolated_cpu_setup(ch
  
  __setup("isolcpus=", isolated_cpu_setup);
  
- #ifdef CONFIG_NUMA
- /**
-  * find_next_best_node - find the next node to include in a sched_domain
-  * @node: node whose sched_domain we're building
-  * @used_nodes: nodes already in the sched_domain
-  *
-  * Find the next node to include in a given scheduling domain. Simply
-  * finds the closest node not already in the @used_nodes map.
-  *
-  * Should use nodemask_t.
-  */
- static int find_next_best_node(int node, nodemask_t *used_nodes)
- {
-       int i, n, val, min_val, best_node = -1;
-       min_val = INT_MAX;
-       for (i = 0; i < nr_node_ids; i++) {
-               /* Start at @node */
-               n = (node + i) % nr_node_ids;
-               if (!nr_cpus_node(n))
-                       continue;
-               /* Skip already used nodes */
-               if (node_isset(n, *used_nodes))
-                       continue;
-               /* Simple min distance search */
-               val = node_distance(node, n);
-               if (val < min_val) {
-                       min_val = val;
-                       best_node = n;
-               }
-       }
-       if (best_node != -1)
-               node_set(best_node, *used_nodes);
-       return best_node;
- }
- /**
-  * sched_domain_node_span - get a cpumask for a node's sched_domain
-  * @node: node whose cpumask we're constructing
-  * @span: resulting cpumask
-  *
-  * Given a node, construct a good cpumask for its sched_domain to span. It
-  * should be one that prevents unnecessary balancing, but also spreads tasks
-  * out optimally.
-  */
- static void sched_domain_node_span(int node, struct cpumask *span)
- {
-       nodemask_t used_nodes;
-       int i;
-       cpumask_clear(span);
-       nodes_clear(used_nodes);
-       cpumask_or(span, span, cpumask_of_node(node));
-       node_set(node, used_nodes);
-       for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
-               int next_node = find_next_best_node(node, &used_nodes);
-               if (next_node < 0)
-                       break;
-               cpumask_or(span, span, cpumask_of_node(next_node));
-       }
- }
- static const struct cpumask *cpu_node_mask(int cpu)
- {
-       lockdep_assert_held(&sched_domains_mutex);
-       sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
-       return sched_domains_tmpmask;
- }
- static const struct cpumask *cpu_allnodes_mask(int cpu)
- {
-       return cpu_possible_mask;
- }
- #endif /* CONFIG_NUMA */
  static const struct cpumask *cpu_cpu_mask(int cpu)
  {
        return cpumask_of_node(cpu_to_node(cpu));
  }
  
- int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
  struct sd_data {
        struct sched_domain **__percpu sd;
        struct sched_group **__percpu sg;
@@@ -6015,6 -5959,7 +5954,7 @@@ struct sched_domain_topology_level 
        sched_domain_init_f init;
        sched_domain_mask_f mask;
        int                 flags;
+       int                 numa_level;
        struct sd_data      data;
  };
  
@@@ -6206,10 -6151,6 +6146,6 @@@ sd_init_##type(struct sched_domain_topo
  }
  
  SD_INIT_FUNC(CPU)
- #ifdef CONFIG_NUMA
-  SD_INIT_FUNC(ALLNODES)
-  SD_INIT_FUNC(NODE)
- #endif
  #ifdef CONFIG_SCHED_SMT
   SD_INIT_FUNC(SIBLING)
  #endif
@@@ -6331,15 -6272,184 +6267,184 @@@ static struct sched_domain_topology_lev
        { sd_init_BOOK, cpu_book_mask, },
  #endif
        { sd_init_CPU, cpu_cpu_mask, },
- #ifdef CONFIG_NUMA
-       { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
-       { sd_init_ALLNODES, cpu_allnodes_mask, },
- #endif
        { NULL, },
  };
  
  static struct sched_domain_topology_level *sched_domain_topology = default_topology;
  
+ #ifdef CONFIG_NUMA
+ static int sched_domains_numa_levels;
+ static int sched_domains_numa_scale;
+ static int *sched_domains_numa_distance;
+ static struct cpumask ***sched_domains_numa_masks;
+ static int sched_domains_curr_level;
+ static inline int sd_local_flags(int level)
+ {
+       if (sched_domains_numa_distance[level] > REMOTE_DISTANCE)
+               return 0;
+       return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
+ }
+ static struct sched_domain *
+ sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
+ {
+       struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
+       int level = tl->numa_level;
+       int sd_weight = cpumask_weight(
+                       sched_domains_numa_masks[level][cpu_to_node(cpu)]);
+       *sd = (struct sched_domain){
+               .min_interval           = sd_weight,
+               .max_interval           = 2*sd_weight,
+               .busy_factor            = 32,
+               .imbalance_pct          = 125,
+               .cache_nice_tries       = 2,
+               .busy_idx               = 3,
+               .idle_idx               = 2,
+               .newidle_idx            = 0,
+               .wake_idx               = 0,
+               .forkexec_idx           = 0,
+               .flags                  = 1*SD_LOAD_BALANCE
+                                       | 1*SD_BALANCE_NEWIDLE
+                                       | 0*SD_BALANCE_EXEC
+                                       | 0*SD_BALANCE_FORK
+                                       | 0*SD_BALANCE_WAKE
+                                       | 0*SD_WAKE_AFFINE
+                                       | 0*SD_PREFER_LOCAL
+                                       | 0*SD_SHARE_CPUPOWER
+                                       | 0*SD_SHARE_PKG_RESOURCES
+                                       | 1*SD_SERIALIZE
+                                       | 0*SD_PREFER_SIBLING
+                                       | sd_local_flags(level)
+                                       ,
+               .last_balance           = jiffies,
+               .balance_interval       = sd_weight,
+       };
+       SD_INIT_NAME(sd, NUMA);
+       sd->private = &tl->data;
+       /*
+        * Ugly hack to pass state to sd_numa_mask()...
+        */
+       sched_domains_curr_level = tl->numa_level;
+       return sd;
+ }
+ static const struct cpumask *sd_numa_mask(int cpu)
+ {
+       return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
+ }
+ static void sched_init_numa(void)
+ {
+       int next_distance, curr_distance = node_distance(0, 0);
+       struct sched_domain_topology_level *tl;
+       int level = 0;
+       int i, j, k;
+       sched_domains_numa_scale = curr_distance;
+       sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
+       if (!sched_domains_numa_distance)
+               return;
+       /*
+        * O(nr_nodes^2) deduplicating selection sort -- in order to find the
+        * unique distances in the node_distance() table.
+        *
+        * Assumes node_distance(0,j) includes all distances in
+        * node_distance(i,j) in order to avoid cubic time.
+        *
+        * XXX: could be optimized to O(n log n) by using sort()
+        */
+       next_distance = curr_distance;
+       for (i = 0; i < nr_node_ids; i++) {
+               for (j = 0; j < nr_node_ids; j++) {
+                       int distance = node_distance(0, j);
+                       if (distance > curr_distance &&
+                                       (distance < next_distance ||
+                                        next_distance == curr_distance))
+                               next_distance = distance;
+               }
+               if (next_distance != curr_distance) {
+                       sched_domains_numa_distance[level++] = next_distance;
+                       sched_domains_numa_levels = level;
+                       curr_distance = next_distance;
+               } else break;
+       }
+       /*
+        * 'level' contains the number of unique distances, excluding the
+        * identity distance node_distance(i,i).
+        *
+       * The sched_domains_numa_distance[] array includes the actual distance
+        * numbers.
+        */
+       sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
+       if (!sched_domains_numa_masks)
+               return;
+       /*
+        * Now for each level, construct a mask per node which contains all
+        * cpus of nodes that are that many hops away from us.
+        */
+       for (i = 0; i < level; i++) {
+               sched_domains_numa_masks[i] =
+                       kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
+               if (!sched_domains_numa_masks[i])
+                       return;
+               for (j = 0; j < nr_node_ids; j++) {
+                       struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j);
+                       if (!mask)
+                               return;
+                       sched_domains_numa_masks[i][j] = mask;
+                       for (k = 0; k < nr_node_ids; k++) {
+                               if (node_distance(j, k) > sched_domains_numa_distance[i])
+                                       continue;
+                               cpumask_or(mask, mask, cpumask_of_node(k));
+                       }
+               }
+       }
+       tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
+                       sizeof(struct sched_domain_topology_level), GFP_KERNEL);
+       if (!tl)
+               return;
+       /*
+        * Copy the default topology bits..
+        */
+       for (i = 0; default_topology[i].init; i++)
+               tl[i] = default_topology[i];
+       /*
+        * .. and append 'j' levels of NUMA goodness.
+        */
+       for (j = 0; j < level; i++, j++) {
+               tl[i] = (struct sched_domain_topology_level){
+                       .init = sd_numa_init,
+                       .mask = sd_numa_mask,
+                       .flags = SDTL_OVERLAP,
+                       .numa_level = j,
+               };
+       }
+       sched_domain_topology = tl;
+ }
+ #else
+ static inline void sched_init_numa(void)
+ {
+ }
+ #endif /* CONFIG_NUMA */
  static int __sdt_alloc(const struct cpumask *cpu_map)
  {
        struct sched_domain_topology_level *tl;
@@@ -6707,97 -6817,6 +6812,6 @@@ match2
        mutex_unlock(&sched_domains_mutex);
  }
  
- #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
- static void reinit_sched_domains(void)
- {
-       get_online_cpus();
-       /* Destroy domains first to force the rebuild */
-       partition_sched_domains(0, NULL, NULL);
-       rebuild_sched_domains();
-       put_online_cpus();
- }
- static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
- {
-       unsigned int level = 0;
-       if (sscanf(buf, "%u", &level) != 1)
-               return -EINVAL;
-       /*
-        * level is always be positive so don't check for
-        * level < POWERSAVINGS_BALANCE_NONE which is 0
-        * What happens on 0 or 1 byte write,
-        * need to check for count as well?
-        */
-       if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
-               return -EINVAL;
-       if (smt)
-               sched_smt_power_savings = level;
-       else
-               sched_mc_power_savings = level;
-       reinit_sched_domains();
-       return count;
- }
- #ifdef CONFIG_SCHED_MC
- static ssize_t sched_mc_power_savings_show(struct device *dev,
-                                          struct device_attribute *attr,
-                                          char *buf)
- {
-       return sprintf(buf, "%u\n", sched_mc_power_savings);
- }
- static ssize_t sched_mc_power_savings_store(struct device *dev,
-                                           struct device_attribute *attr,
-                                           const char *buf, size_t count)
- {
-       return sched_power_savings_store(buf, count, 0);
- }
- static DEVICE_ATTR(sched_mc_power_savings, 0644,
-                  sched_mc_power_savings_show,
-                  sched_mc_power_savings_store);
- #endif
- #ifdef CONFIG_SCHED_SMT
- static ssize_t sched_smt_power_savings_show(struct device *dev,
-                                           struct device_attribute *attr,
-                                           char *buf)
- {
-       return sprintf(buf, "%u\n", sched_smt_power_savings);
- }
- static ssize_t sched_smt_power_savings_store(struct device *dev,
-                                           struct device_attribute *attr,
-                                            const char *buf, size_t count)
- {
-       return sched_power_savings_store(buf, count, 1);
- }
- static DEVICE_ATTR(sched_smt_power_savings, 0644,
-                  sched_smt_power_savings_show,
-                  sched_smt_power_savings_store);
- #endif
- int __init sched_create_sysfs_power_savings_entries(struct device *dev)
- {
-       int err = 0;
- #ifdef CONFIG_SCHED_SMT
-       if (smt_capable())
-               err = device_create_file(dev, &dev_attr_sched_smt_power_savings);
- #endif
- #ifdef CONFIG_SCHED_MC
-       if (!err && mc_capable())
-               err = device_create_file(dev, &dev_attr_sched_mc_power_savings);
- #endif
-       return err;
- }
- #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
  /*
   * Update cpusets according to cpu_active mask.  If cpusets are
   * disabled, cpuset_update_active_cpus() becomes a simple wrapper
@@@ -6835,6 -6854,8 +6849,8 @@@ void __init sched_init_smp(void
        alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
        alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
  
+       sched_init_numa();
        get_online_cpus();
        mutex_lock(&sched_domains_mutex);
        init_sched_domains(cpu_active_mask);
@@@ -7056,7 -7077,6 +7072,7 @@@ void __init sched_init(void
        /* May be allocated at isolcpus cmdline parse time */
        if (cpu_isolated_map == NULL)
                zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
 +      idle_thread_set_boot_cpu();
  #endif
        init_sched_fair_class();
  
@@@ -7978,9 -7998,13 +7994,9 @@@ static struct cftype cpu_files[] = 
                .write_u64 = cpu_rt_period_write_uint,
        },
  #endif
 +      { }     /* terminate */
  };
  
 -static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
 -{
 -      return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
 -}
 -
  struct cgroup_subsys cpu_cgroup_subsys = {
        .name           = "cpu",
        .create         = cpu_cgroup_create,
        .can_attach     = cpu_cgroup_can_attach,
        .attach         = cpu_cgroup_attach,
        .exit           = cpu_cgroup_exit,
 -      .populate       = cpu_cgroup_populate,
        .subsys_id      = cpu_cgroup_subsys_id,
 +      .base_cftypes   = cpu_files,
        .early_init     = 1,
  };
  
@@@ -8174,9 -8198,13 +8190,9 @@@ static struct cftype files[] = 
                .name = "stat",
                .read_map = cpuacct_stats_show,
        },
 +      { }     /* terminate */
  };
  
 -static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
 -{
 -      return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
 -}
 -
  /*
   * charge this task's execution time to its accounting group.
   *
@@@ -8208,7 -8236,7 +8224,7 @@@ struct cgroup_subsys cpuacct_subsys = 
        .name = "cpuacct",
        .create = cpuacct_create,
        .destroy = cpuacct_destroy,
 -      .populate = cpuacct_populate,
        .subsys_id = cpuacct_subsys_id,
 +      .base_cftypes = files,
  };
  #endif        /* CONFIG_CGROUP_CPUACCT */