Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
author	Linus Torvalds <torvalds@linux-foundation.org>
Wed, 23 May 2012 01:27:32 +0000 (18:27 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
Wed, 23 May 2012 01:27:32 +0000 (18:27 -0700)
Pull scheduler changes from Ingo Molnar:
 "The biggest change is the cleanup/simplification of the load-balancer:
  instead of the current practice of architectures twiddling scheduler
  internal data structures and providing the scheduler domains in
  colorfully inconsistent ways, we now have generic scheduler code in
  kernel/sched/core.c:sched_init_numa() that looks at the architecture's
  node_distance() parameters and (while not fully trusting them) deduces a
  NUMA topology from them.

  This inevitably changes balancing behavior - hopefully for the better.

  There are various smaller optimizations, cleanups and fixlets as well"
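
The sketch below is a minimal userspace C illustration of that deduction
step (it is not the kernel code itself): given a node_distance() table, it
collects the unique distances seen from node 0 and turns each one into a
NUMA scheduling level, mirroring the deduplicating selection that
sched_init_numa() performs in the kernel/sched/core.c hunk further down in
this diff. The 4-node distance matrix is made up for the example.

/*
 * Userspace sketch of sched_init_numa()'s level deduction -- assumes,
 * as the kernel does, that row 0 of the distance table contains every
 * distance that occurs anywhere in the table.  The matrix is hypothetical.
 */
#include <stdio.h>

#define NR_NODES 4

static const int node_distance[NR_NODES][NR_NODES] = {
	{ 10, 20, 30, 30 },
	{ 20, 10, 30, 30 },
	{ 30, 30, 10, 20 },
	{ 30, 30, 20, 10 },
};

int main(void)
{
	int levels[NR_NODES];
	int nr_levels = 0;
	int curr = node_distance[0][0];		/* identity (local) distance */
	int i, j;

	for (;;) {
		/* find the smallest distance strictly greater than curr */
		int next = curr;

		for (j = 0; j < NR_NODES; j++) {
			int d = node_distance[0][j];

			if (d > curr && (next == curr || d < next))
				next = d;
		}
		if (next == curr)
			break;			/* no larger distance left */
		levels[nr_levels++] = next;
		curr = next;
	}

	for (i = 0; i < nr_levels; i++)
		printf("NUMA level %d: distance %d\n", i, levels[i]);

	return 0;
}

For the example matrix this prints two levels (distances 20 and 30); the
kernel then builds one cpumask per node and per level and appends a
sched_domain_topology_level for each, as the sched_init_numa() hunk shows.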

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched: Taint kernel with TAINT_WARN after sleep-in-atomic bug
  sched: Remove stale power aware scheduling remnants and dysfunctional knobs
  sched/debug: Fix printing large integers on 32-bit platforms
  sched/fair: Improve the ->group_imb logic
  sched/nohz: Fix rq->cpu_load[] calculations
  sched/numa: Don't scale the imbalance
  sched/fair: Revert sched-domain iteration breakage
  sched/x86: Rewrite set_cpu_sibling_map()
  sched/numa: Fix the new NUMA topology bits
  sched/numa: Rewrite the CONFIG_NUMA sched domain support
  sched/fair: Propagate 'struct lb_env' usage into find_busiest_group
  sched/fair: Add some serialization to the sched_domain load-balance walk
  sched/fair: Let minimally loaded cpu balance the group
  sched: Change rq->nr_running to unsigned int
  x86/numa: Check for nonsensical topologies on real hw as well
  x86/numa: Hard partition cpu topology masks on node boundaries
  x86/numa: Allow specifying node_distance() for numa=fake
  x86/sched: Make mwait_usable() heed to "idle=" kernel parameters properly
  sched: Update documentation and comments
  sched_rt: Avoid unnecessary dequeue and enqueue of pushable tasks in set_cpus_allowed_rt()

arch/x86/kernel/process.c
arch/x86/kernel/smpboot.c
include/linux/sched.h
include/linux/topology.h
kernel/sched/core.c

diff --combined arch/x86/kernel/process.c

  #include <asm/debugreg.h>
  #include <asm/nmi.h>
  
 +/*
 + * per-CPU TSS segments. Threads are completely 'soft' on Linux,
 + * no more per-task TSS's. The TSS size is kept cacheline-aligned
 + * so they are allowed to end up in the .data..cacheline_aligned
 + * section. Since TSS's are completely CPU-local, we want them
 + * on exact cacheline boundaries, to eliminate cacheline ping-pong.
 + */
 +DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
 +
  #ifdef CONFIG_X86_64
  static DEFINE_PER_CPU(unsigned char, is_idle);
  static ATOMIC_NOTIFIER_HEAD(idle_notifier);
@@@ -76,9 -67,10 +76,9 @@@ void free_thread_xstate(struct task_str
        fpu_free(&tsk->thread.fpu);
  }
  
 -void free_thread_info(struct thread_info *ti)
 +void arch_release_task_struct(struct task_struct *tsk)
  {
 -      free_thread_xstate(ti->task);
 -      free_pages((unsigned long)ti, THREAD_ORDER);
 +      free_thread_xstate(tsk);
  }
  
  void arch_task_cache_init(void)
@@@ -385,7 -377,7 +385,7 @@@ static inline void play_dead(void
  #ifdef CONFIG_X86_64
  void enter_idle(void)
  {
 -      percpu_write(is_idle, 1);
 +      this_cpu_write(is_idle, 1);
        atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
  }
  
@@@ -524,6 -516,26 +524,6 @@@ void stop_this_cpu(void *dummy
        }
  }
  
 -static void do_nothing(void *unused)
 -{
 -}
 -
 -/*
 - * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
 - * pm_idle and update to new pm_idle value. Required while changing pm_idle
 - * handler on SMP systems.
 - *
 - * Caller must have changed pm_idle to the new value before the call. Old
 - * pm_idle value will not be used by any CPU after the return of this function.
 - */
 -void cpu_idle_wait(void)
 -{
 -      smp_mb();
 -      /* kick all the CPUs so that they exit out of pm_idle */
 -      smp_call_function(do_nothing, NULL, 1);
 -}
 -EXPORT_SYMBOL_GPL(cpu_idle_wait);
 -
  /* Default MONITOR/MWAIT with no hints, used for default C1 state */
  static void mwait_idle(void)
  {
@@@ -582,9 -594,17 +582,17 @@@ int mwait_usable(const struct cpuinfo_x
  {
        u32 eax, ebx, ecx, edx;
  
+       /* Use mwait if idle=mwait boot option is given */
        if (boot_option_idle_override == IDLE_FORCE_MWAIT)
                return 1;
  
+       /*
+        * Any idle= boot option other than idle=mwait means that we must not
+        * use mwait. Eg: idle=halt or idle=poll or idle=nomwait
+        */
+       if (boot_option_idle_override != IDLE_NO_OVERRIDE)
+               return 0;
        if (c->cpuid_level < MWAIT_INFO)
                return 0;
  
diff --combined arch/x86/kernel/smpboot.c

  /* State of each CPU */
  DEFINE_PER_CPU(int, cpu_state) = { 0 };
  
 -/* Store all idle threads, this can be reused instead of creating
 -* a new thread. Also avoids complicated thread destroy functionality
 -* for idle threads.
 -*/
  #ifdef CONFIG_HOTPLUG_CPU
  /*
 - * Needed only for CONFIG_HOTPLUG_CPU because __cpuinitdata is
 - * removed after init for !CONFIG_HOTPLUG_CPU.
 - */
 -static DEFINE_PER_CPU(struct task_struct *, idle_thread_array);
 -#define get_idle_for_cpu(x)      (per_cpu(idle_thread_array, x))
 -#define set_idle_for_cpu(x, p)   (per_cpu(idle_thread_array, x) = (p))
 -
 -/*
   * We need this for trampoline_base protection from concurrent accesses when
   * off- and onlining cores wildly.
   */
@@@ -85,16 -97,20 +85,16 @@@ static DEFINE_MUTEX(x86_cpu_hotplug_dri
  
  void cpu_hotplug_driver_lock(void)
  {
 -        mutex_lock(&x86_cpu_hotplug_driver_mutex);
 +      mutex_lock(&x86_cpu_hotplug_driver_mutex);
  }
  
  void cpu_hotplug_driver_unlock(void)
  {
 -        mutex_unlock(&x86_cpu_hotplug_driver_mutex);
 +      mutex_unlock(&x86_cpu_hotplug_driver_mutex);
  }
  
  ssize_t arch_cpu_probe(const char *buf, size_t count) { return -1; }
  ssize_t arch_cpu_release(const char *buf, size_t count) { return -1; }
 -#else
 -static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
 -#define get_idle_for_cpu(x)      (idle_thread_array[(x)])
 -#define set_idle_for_cpu(x, p)   (idle_thread_array[(x)] = (p))
  #endif
  
  /* Number of siblings per CPU package */
@@@ -299,59 -315,90 +299,90 @@@ void __cpuinit smp_store_cpu_info(int i
                identify_secondary_cpu(c);
  }
  
- static void __cpuinit link_thread_siblings(int cpu1, int cpu2)
+ static bool __cpuinit
+ topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name)
  {
-       cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2));
-       cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1));
-       cpumask_set_cpu(cpu1, cpu_core_mask(cpu2));
-       cpumask_set_cpu(cpu2, cpu_core_mask(cpu1));
-       cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2));
-       cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1));
+       int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+       return !WARN_ONCE(cpu_to_node(cpu1) != cpu_to_node(cpu2),
+               "sched: CPU #%d's %s-sibling CPU #%d is not on the same node! "
+               "[node: %d != %d]. Ignoring dependency.\n",
+               cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2));
+ }
+ #define link_mask(_m, c1, c2)                                         \
+ do {                                                                  \
+       cpumask_set_cpu((c1), cpu_##_m##_mask(c2));                     \
+       cpumask_set_cpu((c2), cpu_##_m##_mask(c1));                     \
+ } while (0)
+ static bool __cpuinit match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+ {
+       if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
+               int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+               if (c->phys_proc_id == o->phys_proc_id &&
+                   per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2) &&
+                   c->compute_unit_id == o->compute_unit_id)
+                       return topology_sane(c, o, "smt");
+       } else if (c->phys_proc_id == o->phys_proc_id &&
+                  c->cpu_core_id == o->cpu_core_id) {
+               return topology_sane(c, o, "smt");
+       }
+       return false;
+ }
+ static bool __cpuinit match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+ {
+       int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+       if (per_cpu(cpu_llc_id, cpu1) != BAD_APICID &&
+           per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2))
+               return topology_sane(c, o, "llc");
+       return false;
  }
  
+ static bool __cpuinit match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+ {
+       if (c->phys_proc_id == o->phys_proc_id)
+               return topology_sane(c, o, "mc");
+       return false;
+ }
  
  void __cpuinit set_cpu_sibling_map(int cpu)
  {
-       int i;
+       bool has_mc = boot_cpu_data.x86_max_cores > 1;
+       bool has_smt = smp_num_siblings > 1;
        struct cpuinfo_x86 *c = &cpu_data(cpu);
+       struct cpuinfo_x86 *o;
+       int i;
  
        cpumask_set_cpu(cpu, cpu_sibling_setup_mask);
  
-       if (smp_num_siblings > 1) {
-               for_each_cpu(i, cpu_sibling_setup_mask) {
-                       struct cpuinfo_x86 *o = &cpu_data(i);
-                       if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
-                               if (c->phys_proc_id == o->phys_proc_id &&
-                                   per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i) &&
-                                   c->compute_unit_id == o->compute_unit_id)
-                                       link_thread_siblings(cpu, i);
-                       } else if (c->phys_proc_id == o->phys_proc_id &&
-                                  c->cpu_core_id == o->cpu_core_id) {
-                               link_thread_siblings(cpu, i);
-                       }
-               }
-       } else {
+       if (!has_smt && !has_mc) {
                cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
-       }
-       cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
-       if (__this_cpu_read(cpu_info.x86_max_cores) == 1) {
-               cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu));
+               cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
+               cpumask_set_cpu(cpu, cpu_core_mask(cpu));
                c->booted_cores = 1;
                return;
        }
  
        for_each_cpu(i, cpu_sibling_setup_mask) {
-               if (per_cpu(cpu_llc_id, cpu) != BAD_APICID &&
-                   per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
-                       cpumask_set_cpu(i, cpu_llc_shared_mask(cpu));
-                       cpumask_set_cpu(cpu, cpu_llc_shared_mask(i));
-               }
-               if (c->phys_proc_id == cpu_data(i).phys_proc_id) {
-                       cpumask_set_cpu(i, cpu_core_mask(cpu));
-                       cpumask_set_cpu(cpu, cpu_core_mask(i));
+               o = &cpu_data(i);
+               if ((i == cpu) || (has_smt && match_smt(c, o)))
+                       link_mask(sibling, cpu, i);
+               if ((i == cpu) || (has_mc && match_llc(c, o)))
+                       link_mask(llc_shared, cpu, i);
+               if ((i == cpu) || (has_mc && match_mc(c, o))) {
+                       link_mask(core, cpu, i);
                        /*
                         *  Does this new cpu bringup a new core?
                         */
@@@ -382,8 -429,7 +413,7 @@@ const struct cpumask *cpu_coregroup_mas
         * For perf, we return last level cache shared map.
         * And for power savings, we return cpu_core_map
         */
-       if ((sched_mc_power_savings || sched_smt_power_savings) &&
-           !(cpu_has(c, X86_FEATURE_AMD_DCM)))
+       if (!(cpu_has(c, X86_FEATURE_AMD_DCM)))
                return cpu_core_mask(cpu);
        else
                return cpu_llc_shared_mask(cpu);
@@@ -602,6 -648,22 +632,6 @@@ wakeup_secondary_cpu_via_init(int phys_
        return (send_status | accept_status);
  }
  
 -struct create_idle {
 -      struct work_struct work;
 -      struct task_struct *idle;
 -      struct completion done;
 -      int cpu;
 -};
 -
 -static void __cpuinit do_fork_idle(struct work_struct *work)
 -{
 -      struct create_idle *c_idle =
 -              container_of(work, struct create_idle, work);
 -
 -      c_idle->idle = fork_idle(c_idle->cpu);
 -      complete(&c_idle->done);
 -}
 -
  /* reduce the number of lines printed when booting a large cpu count system */
  static void __cpuinit announce_cpu(int cpu, int apicid)
  {
   * Returns zero if CPU booted OK, else error code from
   * ->wakeup_secondary_cpu.
   */
 -static int __cpuinit do_boot_cpu(int apicid, int cpu)
 +static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
  {
        unsigned long boot_error = 0;
        unsigned long start_ip;
        int timeout;
 -      struct create_idle c_idle = {
 -              .cpu    = cpu,
 -              .done   = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
 -      };
 -
 -      INIT_WORK_ONSTACK(&c_idle.work, do_fork_idle);
  
        alternatives_smp_switch(1);
  
 -      c_idle.idle = get_idle_for_cpu(cpu);
 -
 -      /*
 -       * We can't use kernel_thread since we must avoid to
 -       * reschedule the child.
 -       */
 -      if (c_idle.idle) {
 -              c_idle.idle->thread.sp = (unsigned long) (((struct pt_regs *)
 -                      (THREAD_SIZE +  task_stack_page(c_idle.idle))) - 1);
 -              init_idle(c_idle.idle, cpu);
 -              goto do_rest;
 -      }
 +      idle->thread.sp = (unsigned long) (((struct pt_regs *)
 +                        (THREAD_SIZE +  task_stack_page(idle))) - 1);
 +      per_cpu(current_task, cpu) = idle;
  
 -      schedule_work(&c_idle.work);
 -      wait_for_completion(&c_idle.done);
 -
 -      if (IS_ERR(c_idle.idle)) {
 -              printk("failed fork for CPU %d\n", cpu);
 -              destroy_work_on_stack(&c_idle.work);
 -              return PTR_ERR(c_idle.idle);
 -      }
 -
 -      set_idle_for_cpu(cpu, c_idle.idle);
 -do_rest:
 -      per_cpu(current_task, cpu) = c_idle.idle;
  #ifdef CONFIG_X86_32
        /* Stack for startup_32 can be just as for start_secondary onwards */
        irq_ctx_init(cpu);
  #else
 -      clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
 +      clear_tsk_thread_flag(idle, TIF_FORK);
        initial_gs = per_cpu_offset(cpu);
        per_cpu(kernel_stack, cpu) =
 -              (unsigned long)task_stack_page(c_idle.idle) -
 +              (unsigned long)task_stack_page(idle) -
                KERNEL_STACK_OFFSET + THREAD_SIZE;
  #endif
        early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
        initial_code = (unsigned long)start_secondary;
 -      stack_start  = c_idle.idle->thread.sp;
 +      stack_start  = idle->thread.sp;
  
        /* start_ip had better be page-aligned! */
        start_ip = trampoline_address();
                 */
                smpboot_restore_warm_reset_vector();
        }
 -
 -      destroy_work_on_stack(&c_idle.work);
        return boot_error;
  }
  
 -int __cpuinit native_cpu_up(unsigned int cpu)
 +int __cpuinit native_cpu_up(unsigned int cpu, struct task_struct *tidle)
  {
        int apicid = apic->cpu_present_to_apicid(cpu);
        unsigned long flags;
  
        per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
  
 -      err = do_boot_cpu(apicid, cpu);
 +      err = do_boot_cpu(apicid, cpu, tidle);
        if (err) {
                pr_debug("do_boot_cpu failed %d\n", err);
                return -EIO;
diff --combined include/linux/sched.h
@@@ -855,61 -855,14 +855,14 @@@ enum cpu_idle_type 
  #define SD_WAKE_AFFINE                0x0020  /* Wake task to waking CPU */
  #define SD_PREFER_LOCAL               0x0040  /* Prefer to keep tasks local to this domain */
  #define SD_SHARE_CPUPOWER     0x0080  /* Domain members share cpu power */
- #define SD_POWERSAVINGS_BALANCE       0x0100  /* Balance for power savings */
  #define SD_SHARE_PKG_RESOURCES        0x0200  /* Domain members share cpu pkg resources */
  #define SD_SERIALIZE          0x0400  /* Only a single load balancing instance */
  #define SD_ASYM_PACKING               0x0800  /* Place busy groups earlier in the domain */
  #define SD_PREFER_SIBLING     0x1000  /* Prefer to place tasks in a sibling domain */
  #define SD_OVERLAP            0x2000  /* sched_domains of this level overlap */
  
- enum powersavings_balance_level {
-       POWERSAVINGS_BALANCE_NONE = 0,  /* No power saving load balance */
-       POWERSAVINGS_BALANCE_BASIC,     /* Fill one thread/core/package
-                                        * first for long running threads
-                                        */
-       POWERSAVINGS_BALANCE_WAKEUP,    /* Also bias task wakeups to semi-idle
-                                        * cpu package for power savings
-                                        */
-       MAX_POWERSAVINGS_BALANCE_LEVELS
- };
- extern int sched_mc_power_savings, sched_smt_power_savings;
- static inline int sd_balance_for_mc_power(void)
- {
-       if (sched_smt_power_savings)
-               return SD_POWERSAVINGS_BALANCE;
-       if (!sched_mc_power_savings)
-               return SD_PREFER_SIBLING;
-       return 0;
- }
- static inline int sd_balance_for_package_power(void)
- {
-       if (sched_mc_power_savings | sched_smt_power_savings)
-               return SD_POWERSAVINGS_BALANCE;
-       return SD_PREFER_SIBLING;
- }
  extern int __weak arch_sd_sibiling_asym_packing(void);
  
- /*
-  * Optimise SD flags for power savings:
-  * SD_BALANCE_NEWIDLE helps aggressive task consolidation and power savings.
-  * Keep default SD flags if sched_{smt,mc}_power_saving=0
-  */
- static inline int sd_power_saving_flags(void)
- {
-       if (sched_mc_power_savings | sched_smt_power_savings)
-               return SD_BALANCE_NEWIDLE;
-       return 0;
- }
  struct sched_group_power {
        atomic_t ref;
        /*
@@@ -1341,8 -1294,6 +1294,8 @@@ struct task_struct 
                                 * execve */
        unsigned in_iowait:1;
  
 +      /* task may not gain privileges */
 +      unsigned no_new_privs:1;
  
        /* Revert to default priority/policy when forking */
        unsigned sched_reset_on_fork:1;
        uid_t loginuid;
        unsigned int sessionid;
  #endif
 -      seccomp_t seccomp;
 +      struct seccomp seccomp;
  
  /* Thread group tracking */
        u32 parent_exec_id;
@@@ -1907,22 -1858,12 +1860,22 @@@ static inline void rcu_copy_process(str
        INIT_LIST_HEAD(&p->rcu_node_entry);
  }
  
 +static inline void rcu_switch_from(struct task_struct *prev)
 +{
 +      if (prev->rcu_read_lock_nesting != 0)
 +              rcu_preempt_note_context_switch();
 +}
 +
  #else
  
  static inline void rcu_copy_process(struct task_struct *p)
  {
  }
  
 +static inline void rcu_switch_from(struct task_struct *prev)
 +{
 +}
 +
  #endif
  
  #ifdef CONFIG_SMP
@@@ -1962,7 -1903,7 +1915,7 @@@ static inline int set_cpus_allowed(stru
   */
  extern unsigned long long notrace sched_clock(void);
  /*
-  * See the comment in kernel/sched_clock.c
+  * See the comment in kernel/sched/clock.c
   */
  extern u64 cpu_clock(int cpu);
  extern u64 local_clock(void);
diff --combined include/linux/topology.h
@@@ -70,7 -70,6 +70,6 @@@ int arch_update_cpu_topology(void)
   * Below are the 3 major initializers used in building sched_domains:
   * SD_SIBLING_INIT, for SMT domains
   * SD_CPU_INIT, for SMP domains
-  * SD_NODE_INIT, for NUMA domains
   *
   * Any architecture that cares to do any tuning to these values should do so
   * by defining their own arch-specific initializer in include/asm/topology.h.
@@@ -99,7 -98,6 +98,6 @@@
                                | 0*SD_BALANCE_WAKE                     \
                                | 1*SD_WAKE_AFFINE                      \
                                | 1*SD_SHARE_CPUPOWER                   \
-                               | 0*SD_POWERSAVINGS_BALANCE             \
                                | 1*SD_SHARE_PKG_RESOURCES              \
                                | 0*SD_SERIALIZE                        \
                                | 0*SD_PREFER_SIBLING                   \
                                | 0*SD_SHARE_CPUPOWER                   \
                                | 1*SD_SHARE_PKG_RESOURCES              \
                                | 0*SD_SERIALIZE                        \
-                               | sd_balance_for_mc_power()             \
-                               | sd_power_saving_flags()               \
                                ,                                       \
        .last_balance           = jiffies,                              \
        .balance_interval       = 1,                                    \
                                | 0*SD_SHARE_CPUPOWER                   \
                                | 0*SD_SHARE_PKG_RESOURCES              \
                                | 0*SD_SERIALIZE                        \
-                               | sd_balance_for_package_power()        \
-                               | sd_power_saving_flags()               \
                                ,                                       \
        .last_balance           = jiffies,                              \
        .balance_interval       = 1,                                    \
  }
  #endif
  
- /* sched_domains SD_ALLNODES_INIT for NUMA machines */
- #define SD_ALLNODES_INIT (struct sched_domain) {                      \
-       .min_interval           = 64,                                   \
-       .max_interval           = 64*num_online_cpus(),                 \
-       .busy_factor            = 128,                                  \
-       .imbalance_pct          = 133,                                  \
-       .cache_nice_tries       = 1,                                    \
-       .busy_idx               = 3,                                    \
-       .idle_idx               = 3,                                    \
-       .flags                  = 1*SD_LOAD_BALANCE                     \
-                               | 1*SD_BALANCE_NEWIDLE                  \
-                               | 0*SD_BALANCE_EXEC                     \
-                               | 0*SD_BALANCE_FORK                     \
-                               | 0*SD_BALANCE_WAKE                     \
-                               | 0*SD_WAKE_AFFINE                      \
-                               | 0*SD_SHARE_CPUPOWER                   \
-                               | 0*SD_POWERSAVINGS_BALANCE             \
-                               | 0*SD_SHARE_PKG_RESOURCES              \
-                               | 1*SD_SERIALIZE                        \
-                               | 0*SD_PREFER_SIBLING                   \
-                               ,                                       \
-       .last_balance           = jiffies,                              \
-       .balance_interval       = 64,                                   \
- }
- #ifndef SD_NODES_PER_DOMAIN
- #define SD_NODES_PER_DOMAIN 16
- #endif
  #ifdef CONFIG_SCHED_BOOK
  #ifndef SD_BOOK_INIT
  #error Please define an appropriate SD_BOOK_INIT in include/asm/topology.h!!!
  #endif
  #endif /* CONFIG_SCHED_BOOK */
  
- #ifdef CONFIG_NUMA
- #ifndef SD_NODE_INIT
- #error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!!
- #endif
- #endif /* CONFIG_NUMA */
  #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
  DECLARE_PER_CPU(int, numa_node);
  
@@@ -239,7 -197,7 +197,7 @@@ static inline int cpu_to_node(int cpu
  #ifndef set_numa_node
  static inline void set_numa_node(int node)
  {
 -      percpu_write(numa_node, node);
 +      this_cpu_write(numa_node, node);
  }
  #endif
  
@@@ -274,7 -232,7 +232,7 @@@ DECLARE_PER_CPU(int, _numa_mem_)
  #ifndef set_numa_mem
  static inline void set_numa_mem(int node)
  {
 -      percpu_write(_numa_mem_, node);
 +      this_cpu_write(_numa_mem_, node);
  }
  #endif
  
diff --combined kernel/sched/core.c
@@@ -83,7 -83,6 +83,7 @@@
  
  #include "sched.h"
  #include "../workqueue_sched.h"
 +#include "../smpboot.h"
  
  #define CREATE_TRACE_POINTS
  #include <trace/events/sched.h>
@@@ -693,8 -692,6 +693,6 @@@ int tg_nop(struct task_group *tg, void 
  }
  #endif
  
- void update_cpu_load(struct rq *this_rq);
  static void set_load_weight(struct task_struct *p)
  {
        int prio = p->static_prio - MAX_RT_PRIO;
@@@ -1914,7 -1911,7 +1912,7 @@@ prepare_task_switch(struct rq *rq, stru
                    struct task_struct *next)
  {
        sched_info_switch(prev, next);
 -      perf_event_task_sched_out(prev, next);
 +      perf_event_task_sched(prev, next);
        fire_sched_out_preempt_notifiers(prev, next);
        prepare_lock_switch(rq, next);
        prepare_arch_switch(next);
@@@ -1957,6 -1954,13 +1955,6 @@@ static void finish_task_switch(struct r
         */
        prev_state = prev->state;
        finish_arch_switch(prev);
 -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
 -      local_irq_disable();
 -#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
 -      perf_event_task_sched_in(prev, current);
 -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
 -      local_irq_enable();
 -#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
        finish_lock_switch(rq, prev);
        finish_arch_post_lock_switch();
  
@@@ -2077,7 -2081,6 +2075,7 @@@ context_switch(struct rq *rq, struct ta
  #endif
  
        /* Here we just switch the register state and the stack. */
 +      rcu_switch_from(prev);
        switch_to(prev, next, prev);
  
        barrier();
@@@ -2481,22 -2484,13 +2479,13 @@@ decay_load_missed(unsigned long load, u
   * scheduler tick (TICK_NSEC). With tickless idle this will not be called
   * every tick. We fix it up based on jiffies.
   */
- void update_cpu_load(struct rq *this_rq)
+ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
+                             unsigned long pending_updates)
  {
-       unsigned long this_load = this_rq->load.weight;
-       unsigned long curr_jiffies = jiffies;
-       unsigned long pending_updates;
        int i, scale;
  
        this_rq->nr_load_updates++;
  
-       /* Avoid repeated calls on same jiffy, when moving in and out of idle */
-       if (curr_jiffies == this_rq->last_load_update_tick)
-               return;
-       pending_updates = curr_jiffies - this_rq->last_load_update_tick;
-       this_rq->last_load_update_tick = curr_jiffies;
        /* Update our load: */
        this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
        for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
        sched_avg_update(this_rq);
  }
  
+ /*
+  * Called from nohz_idle_balance() to update the load ratings before doing the
+  * idle balance.
+  */
+ void update_idle_cpu_load(struct rq *this_rq)
+ {
+       unsigned long curr_jiffies = jiffies;
+       unsigned long load = this_rq->load.weight;
+       unsigned long pending_updates;
+       /*
+        * Bloody broken means of dealing with nohz, but better than nothing..
+        * jiffies is updated by one cpu, another cpu can drift wrt the jiffy
+        * update and see 0 difference the one time and 2 the next, even though
+        * we ticked at roughly the same rate.
+        *
+        * Hence we only use this from nohz_idle_balance() and skip this
+        * nonsense when called from the scheduler_tick() since that's
+        * guaranteed a stable rate.
+        */
+       if (load || curr_jiffies == this_rq->last_load_update_tick)
+               return;
+       pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+       this_rq->last_load_update_tick = curr_jiffies;
+       __update_cpu_load(this_rq, load, pending_updates);
+ }
+ /*
+  * Called from scheduler_tick()
+  */
  static void update_cpu_load_active(struct rq *this_rq)
  {
-       update_cpu_load(this_rq);
+       /*
+        * See the mess in update_idle_cpu_load().
+        */
+       this_rq->last_load_update_tick = jiffies;
+       __update_cpu_load(this_rq, this_rq->load.weight, 1);
  
        calc_load_account_active(this_rq);
  }
@@@ -3108,6 -3138,7 +3133,7 @@@ static noinline void __schedule_bug(str
        if (irqs_disabled())
                print_irqtrace_events(prev);
        dump_stack();
+       add_taint(TAINT_WARN);
  }
  
  /*
@@@ -5555,7 -5586,8 +5581,8 @@@ static int sched_domain_debug_one(struc
                        break;
                }
  
-               if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
+               if (!(sd->flags & SD_OVERLAP) &&
+                   cpumask_intersects(groupmask, sched_group_cpus(group))) {
                        printk(KERN_CONT "\n");
                        printk(KERN_ERR "ERROR: repeated CPUs\n");
                        break;
@@@ -5893,99 -5925,11 +5920,11 @@@ static int __init isolated_cpu_setup(ch
  
  __setup("isolcpus=", isolated_cpu_setup);
  
- #ifdef CONFIG_NUMA
- /**
-  * find_next_best_node - find the next node to include in a sched_domain
-  * @node: node whose sched_domain we're building
-  * @used_nodes: nodes already in the sched_domain
-  *
-  * Find the next node to include in a given scheduling domain. Simply
-  * finds the closest node not already in the @used_nodes map.
-  *
-  * Should use nodemask_t.
-  */
- static int find_next_best_node(int node, nodemask_t *used_nodes)
- {
-       int i, n, val, min_val, best_node = -1;
-       min_val = INT_MAX;
-       for (i = 0; i < nr_node_ids; i++) {
-               /* Start at @node */
-               n = (node + i) % nr_node_ids;
-               if (!nr_cpus_node(n))
-                       continue;
-               /* Skip already used nodes */
-               if (node_isset(n, *used_nodes))
-                       continue;
-               /* Simple min distance search */
-               val = node_distance(node, n);
-               if (val < min_val) {
-                       min_val = val;
-                       best_node = n;
-               }
-       }
-       if (best_node != -1)
-               node_set(best_node, *used_nodes);
-       return best_node;
- }
- /**
-  * sched_domain_node_span - get a cpumask for a node's sched_domain
-  * @node: node whose cpumask we're constructing
-  * @span: resulting cpumask
-  *
-  * Given a node, construct a good cpumask for its sched_domain to span. It
-  * should be one that prevents unnecessary balancing, but also spreads tasks
-  * out optimally.
-  */
- static void sched_domain_node_span(int node, struct cpumask *span)
- {
-       nodemask_t used_nodes;
-       int i;
-       cpumask_clear(span);
-       nodes_clear(used_nodes);
-       cpumask_or(span, span, cpumask_of_node(node));
-       node_set(node, used_nodes);
-       for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
-               int next_node = find_next_best_node(node, &used_nodes);
-               if (next_node < 0)
-                       break;
-               cpumask_or(span, span, cpumask_of_node(next_node));
-       }
- }
- static const struct cpumask *cpu_node_mask(int cpu)
- {
-       lockdep_assert_held(&sched_domains_mutex);
-       sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
-       return sched_domains_tmpmask;
- }
- static const struct cpumask *cpu_allnodes_mask(int cpu)
- {
-       return cpu_possible_mask;
- }
- #endif /* CONFIG_NUMA */
  static const struct cpumask *cpu_cpu_mask(int cpu)
  {
        return cpumask_of_node(cpu_to_node(cpu));
  }
  
- int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
  struct sd_data {
        struct sched_domain **__percpu sd;
        struct sched_group **__percpu sg;
@@@ -6015,6 -5959,7 +5954,7 @@@ struct sched_domain_topology_level 
        sched_domain_init_f init;
        sched_domain_mask_f mask;
        int                 flags;
+       int                 numa_level;
        struct sd_data      data;
  };
  
@@@ -6206,10 -6151,6 +6146,6 @@@ sd_init_##type(struct sched_domain_topo
  }
  
  SD_INIT_FUNC(CPU)
- #ifdef CONFIG_NUMA
-  SD_INIT_FUNC(ALLNODES)
-  SD_INIT_FUNC(NODE)
- #endif
  #ifdef CONFIG_SCHED_SMT
   SD_INIT_FUNC(SIBLING)
  #endif
@@@ -6331,15 -6272,184 +6267,184 @@@ static struct sched_domain_topology_lev
        { sd_init_BOOK, cpu_book_mask, },
  #endif
        { sd_init_CPU, cpu_cpu_mask, },
- #ifdef CONFIG_NUMA
-       { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
-       { sd_init_ALLNODES, cpu_allnodes_mask, },
- #endif
        { NULL, },
  };
  
  static struct sched_domain_topology_level *sched_domain_topology = default_topology;
  
+ #ifdef CONFIG_NUMA
+ static int sched_domains_numa_levels;
+ static int sched_domains_numa_scale;
+ static int *sched_domains_numa_distance;
+ static struct cpumask ***sched_domains_numa_masks;
+ static int sched_domains_curr_level;
+ static inline int sd_local_flags(int level)
+ {
+       if (sched_domains_numa_distance[level] > REMOTE_DISTANCE)
+               return 0;
+       return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
+ }
+ static struct sched_domain *
+ sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
+ {
+       struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
+       int level = tl->numa_level;
+       int sd_weight = cpumask_weight(
+                       sched_domains_numa_masks[level][cpu_to_node(cpu)]);
+       *sd = (struct sched_domain){
+               .min_interval           = sd_weight,
+               .max_interval           = 2*sd_weight,
+               .busy_factor            = 32,
+               .imbalance_pct          = 125,
+               .cache_nice_tries       = 2,
+               .busy_idx               = 3,
+               .idle_idx               = 2,
+               .newidle_idx            = 0,
+               .wake_idx               = 0,
+               .forkexec_idx           = 0,
+               .flags                  = 1*SD_LOAD_BALANCE
+                                       | 1*SD_BALANCE_NEWIDLE
+                                       | 0*SD_BALANCE_EXEC
+                                       | 0*SD_BALANCE_FORK
+                                       | 0*SD_BALANCE_WAKE
+                                       | 0*SD_WAKE_AFFINE
+                                       | 0*SD_PREFER_LOCAL
+                                       | 0*SD_SHARE_CPUPOWER
+                                       | 0*SD_SHARE_PKG_RESOURCES
+                                       | 1*SD_SERIALIZE
+                                       | 0*SD_PREFER_SIBLING
+                                       | sd_local_flags(level)
+                                       ,
+               .last_balance           = jiffies,
+               .balance_interval       = sd_weight,
+       };
+       SD_INIT_NAME(sd, NUMA);
+       sd->private = &tl->data;
+       /*
+        * Ugly hack to pass state to sd_numa_mask()...
+        */
+       sched_domains_curr_level = tl->numa_level;
+       return sd;
+ }
+ static const struct cpumask *sd_numa_mask(int cpu)
+ {
+       return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
+ }
+ static void sched_init_numa(void)
+ {
+       int next_distance, curr_distance = node_distance(0, 0);
+       struct sched_domain_topology_level *tl;
+       int level = 0;
+       int i, j, k;
+       sched_domains_numa_scale = curr_distance;
+       sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
+       if (!sched_domains_numa_distance)
+               return;
+       /*
+        * O(nr_nodes^2) deduplicating selection sort -- in order to find the
+        * unique distances in the node_distance() table.
+        *
+        * Assumes node_distance(0,j) includes all distances in
+        * node_distance(i,j) in order to avoid cubic time.
+        *
+        * XXX: could be optimized to O(n log n) by using sort()
+        */
+       next_distance = curr_distance;
+       for (i = 0; i < nr_node_ids; i++) {
+               for (j = 0; j < nr_node_ids; j++) {
+                       int distance = node_distance(0, j);
+                       if (distance > curr_distance &&
+                                       (distance < next_distance ||
+                                        next_distance == curr_distance))
+                               next_distance = distance;
+               }
+               if (next_distance != curr_distance) {
+                       sched_domains_numa_distance[level++] = next_distance;
+                       sched_domains_numa_levels = level;
+                       curr_distance = next_distance;
+               } else break;
+       }
+       /*
+        * 'level' contains the number of unique distances, excluding the
+        * identity distance node_distance(i,i).
+        *
+       * The sched_domains_numa_distance[] array includes the actual distance
+        * numbers.
+        */
+       sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
+       if (!sched_domains_numa_masks)
+               return;
+       /*
+        * Now for each level, construct a mask per node which contains all
+        * cpus of nodes that are that many hops away from us.
+        */
+       for (i = 0; i < level; i++) {
+               sched_domains_numa_masks[i] =
+                       kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
+               if (!sched_domains_numa_masks[i])
+                       return;
+               for (j = 0; j < nr_node_ids; j++) {
+                       struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j);
+                       if (!mask)
+                               return;
+                       sched_domains_numa_masks[i][j] = mask;
+                       for (k = 0; k < nr_node_ids; k++) {
+                               if (node_distance(j, k) > sched_domains_numa_distance[i])
+                                       continue;
+                               cpumask_or(mask, mask, cpumask_of_node(k));
+                       }
+               }
+       }
+       tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
+                       sizeof(struct sched_domain_topology_level), GFP_KERNEL);
+       if (!tl)
+               return;
+       /*
+        * Copy the default topology bits..
+        */
+       for (i = 0; default_topology[i].init; i++)
+               tl[i] = default_topology[i];
+       /*
+        * .. and append 'j' levels of NUMA goodness.
+        */
+       for (j = 0; j < level; i++, j++) {
+               tl[i] = (struct sched_domain_topology_level){
+                       .init = sd_numa_init,
+                       .mask = sd_numa_mask,
+                       .flags = SDTL_OVERLAP,
+                       .numa_level = j,
+               };
+       }
+       sched_domain_topology = tl;
+ }
+ #else
+ static inline void sched_init_numa(void)
+ {
+ }
+ #endif /* CONFIG_NUMA */
  static int __sdt_alloc(const struct cpumask *cpu_map)
  {
        struct sched_domain_topology_level *tl;
@@@ -6707,97 -6817,6 +6812,6 @@@ match2
        mutex_unlock(&sched_domains_mutex);
  }
  
- #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
- static void reinit_sched_domains(void)
- {
-       get_online_cpus();
-       /* Destroy domains first to force the rebuild */
-       partition_sched_domains(0, NULL, NULL);
-       rebuild_sched_domains();
-       put_online_cpus();
- }
- static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
- {
-       unsigned int level = 0;
-       if (sscanf(buf, "%u", &level) != 1)
-               return -EINVAL;
-       /*
-        * level is always be positive so don't check for
-        * level < POWERSAVINGS_BALANCE_NONE which is 0
-        * What happens on 0 or 1 byte write,
-        * need to check for count as well?
-        */
-       if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
-               return -EINVAL;
-       if (smt)
-               sched_smt_power_savings = level;
-       else
-               sched_mc_power_savings = level;
-       reinit_sched_domains();
-       return count;
- }
- #ifdef CONFIG_SCHED_MC
- static ssize_t sched_mc_power_savings_show(struct device *dev,
-                                          struct device_attribute *attr,
-                                          char *buf)
- {
-       return sprintf(buf, "%u\n", sched_mc_power_savings);
- }
- static ssize_t sched_mc_power_savings_store(struct device *dev,
-                                           struct device_attribute *attr,
-                                           const char *buf, size_t count)
- {
-       return sched_power_savings_store(buf, count, 0);
- }
- static DEVICE_ATTR(sched_mc_power_savings, 0644,
-                  sched_mc_power_savings_show,
-                  sched_mc_power_savings_store);
- #endif
- #ifdef CONFIG_SCHED_SMT
- static ssize_t sched_smt_power_savings_show(struct device *dev,
-                                           struct device_attribute *attr,
-                                           char *buf)
- {
-       return sprintf(buf, "%u\n", sched_smt_power_savings);
- }
- static ssize_t sched_smt_power_savings_store(struct device *dev,
-                                           struct device_attribute *attr,
-                                            const char *buf, size_t count)
- {
-       return sched_power_savings_store(buf, count, 1);
- }
- static DEVICE_ATTR(sched_smt_power_savings, 0644,
-                  sched_smt_power_savings_show,
-                  sched_smt_power_savings_store);
- #endif
- int __init sched_create_sysfs_power_savings_entries(struct device *dev)
- {
-       int err = 0;
- #ifdef CONFIG_SCHED_SMT
-       if (smt_capable())
-               err = device_create_file(dev, &dev_attr_sched_smt_power_savings);
- #endif
- #ifdef CONFIG_SCHED_MC
-       if (!err && mc_capable())
-               err = device_create_file(dev, &dev_attr_sched_mc_power_savings);
- #endif
-       return err;
- }
- #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
  /*
   * Update cpusets according to cpu_active mask.  If cpusets are
   * disabled, cpuset_update_active_cpus() becomes a simple wrapper
@@@ -6835,6 -6854,8 +6849,8 @@@ void __init sched_init_smp(void
        alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
        alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
  
+       sched_init_numa();
        get_online_cpus();
        mutex_lock(&sched_domains_mutex);
        init_sched_domains(cpu_active_mask);
@@@ -7056,7 -7077,6 +7072,7 @@@ void __init sched_init(void
        /* May be allocated at isolcpus cmdline parse time */
        if (cpu_isolated_map == NULL)
                zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
 +      idle_thread_set_boot_cpu();
  #endif
        init_sched_fair_class();
  
@@@ -7978,9 -7998,13 +7994,9 @@@ static struct cftype cpu_files[] = 
                .write_u64 = cpu_rt_period_write_uint,
        },
  #endif
 +      { }     /* terminate */
  };
  
 -static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
 -{
 -      return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
 -}
 -
  struct cgroup_subsys cpu_cgroup_subsys = {
        .name           = "cpu",
        .create         = cpu_cgroup_create,
        .can_attach     = cpu_cgroup_can_attach,
        .attach         = cpu_cgroup_attach,
        .exit           = cpu_cgroup_exit,
 -      .populate       = cpu_cgroup_populate,
        .subsys_id      = cpu_cgroup_subsys_id,
 +      .base_cftypes   = cpu_files,
        .early_init     = 1,
  };
  
@@@ -8174,9 -8198,13 +8190,9 @@@ static struct cftype files[] = 
                .name = "stat",
                .read_map = cpuacct_stats_show,
        },
 +      { }     /* terminate */
  };
  
 -static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
 -{
 -      return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
 -}
 -
  /*
   * charge this task's execution time to its accounting group.
   *
@@@ -8208,7 -8236,7 +8224,7 @@@ struct cgroup_subsys cpuacct_subsys = 
        .name = "cpuacct",
        .create = cpuacct_create,
        .destroy = cpuacct_destroy,
 -      .populate = cpuacct_populate,
        .subsys_id = cpuacct_subsys_id,
 +      .base_cftypes = files,
  };
  #endif        /* CONFIG_CGROUP_CPUACCT */