Merge branch 'core/percpu' into stackprotector
author Ingo Molnar <mingo@elte.hu>
Sun, 18 Jan 2009 17:37:14 +0000 (18:37 +0100)
committer Ingo Molnar <mingo@elte.hu>
Sun, 18 Jan 2009 17:37:14 +0000 (18:37 +0100)
Conflicts:
arch/x86/include/asm/pda.h
arch/x86/include/asm/system.h

Also, moved include/asm-x86/stackprotector.h to arch/x86/include/asm.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
13 files changed:
arch/x86/Kconfig
arch/x86/include/asm/pda.h
arch/x86/include/asm/stackprotector.h
arch/x86/include/asm/system.h
arch/x86/kernel/process_64.c
arch/x86/mm/fault.c
include/linux/magic.h
include/linux/sched.h
init/main.c
kernel/exit.c
kernel/fork.c
kernel/panic.c
kernel/sched.c

diff --combined arch/x86/Kconfig
@@@ -27,6 -27,7 +27,7 @@@ config X8
        select HAVE_IOREMAP_PROT
        select HAVE_KPROBES
        select ARCH_WANT_OPTIONAL_GPIOLIB
+       select ARCH_WANT_FRAME_POINTERS
        select HAVE_KRETPROBES
        select HAVE_FTRACE_MCOUNT_RECORD
        select HAVE_DYNAMIC_FTRACE
@@@ -586,6 -587,16 +587,16 @@@ config AMD_IOMM
          your BIOS for an option to enable it or if you have an IVRS ACPI
          table.
  
+ config AMD_IOMMU_STATS
+       bool "Export AMD IOMMU statistics to debugfs"
+       depends on AMD_IOMMU
+       select DEBUG_FS
+       help
+         This option enables code in the AMD IOMMU driver to collect various
+         statistics about what's happening in the driver and exports that
+         information to userspace via debugfs.
+         If unsure, say N.
  # need this always selected by IOMMU for the VIA workaround
  config SWIOTLB
        def_bool y if X86_64
  config IOMMU_HELPER
        def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU)
  
+ config IOMMU_API
+       def_bool (AMD_IOMMU || DMAR)
  config MAXSMP
        bool "Configure Maximum number of SMP Processors and NUMA Nodes"
-       depends on X86_64 && SMP && BROKEN
+       depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL
+       select CPUMASK_OFFSTACK
        default n
        help
          Configure maximum number of CPUS and NUMA Nodes for this architecture.
          If unsure, say N.
  
  config NR_CPUS
-       int "Maximum number of CPUs (2-512)" if !MAXSMP
-       range 2 512
-       depends on SMP
+       int "Maximum number of CPUs" if SMP && !MAXSMP
+       range 2 512 if SMP && !MAXSMP
+       default "1" if !SMP
        default "4096" if MAXSMP
-       default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000
-       default "8"
+       default "32" if SMP && (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000)
+       default "8" if SMP
        help
          This allows you to specify the maximum number of CPUs which this
          kernel will support.  The maximum supported value is 512 and the
@@@ -1325,17 -1340,13 +1340,17 @@@ config SECCOM
  
          If unsure, say Y. Only embedded should say N here.
  
 +config CC_STACKPROTECTOR_ALL
 +      bool
 +
  config CC_STACKPROTECTOR
        bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)"
 -      depends on X86_64 && EXPERIMENTAL && BROKEN
 +      depends on X86_64
 +      select CC_STACKPROTECTOR_ALL
        help
 -         This option turns on the -fstack-protector GCC feature. This
 -        feature puts, at the beginning of critical functions, a canary
 -        value on the stack just before the return address, and validates
 +          This option turns on the -fstack-protector GCC feature. This
 +        feature puts, at the beginning of functions, a canary value on
 +        the stack just before the return address, and validates
          the value just before actually returning.  Stack based buffer
          overflows (that need to overwrite this return address) now also
          overwrite the canary, which gets detected and the attack is then
          neutralized via a kernel panic.
  
          This feature requires gcc version 4.2 or above, or a distribution
          gcc with the feature backported. Older versions are automatically
 -        detected and for those versions, this configuration option is ignored.
 -
 -config CC_STACKPROTECTOR_ALL
 -      bool "Use stack-protector for all functions"
 -      depends on CC_STACKPROTECTOR
 -      help
 -        Normally, GCC only inserts the canary value protection for
 -        functions that use large-ish on-stack buffers. By enabling
 -        this option, GCC will be asked to do this for ALL functions.
 +        detected and for those versions, this configuration option is
 +        ignored. (and a warning is printed during bootup)
  
  source kernel/Kconfig.hz
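
A standalone illustration of the canary scheme described in the help text above (not part of this patch; all identifiers below are made up for the sketch). gcc emits the equivalent of the prologue/epilogue written out by hand here; in the kernel the canary source is the PDA field at %gs:40 introduced further down, and the failure hook is __stack_chk_fail() in kernel/panic.c.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Stand-in for the canary source; the x86-64 kernel reads it from %gs:40. */
    static unsigned long stack_chk_guard = 0x595e9fbd94fda766UL;

    /* The kernel's equivalent is __stack_chk_fail(); this sketch just aborts. */
    static void stack_chk_fail(void)
    {
            fprintf(stderr, "stack smashing detected\n");
            abort();
    }

    /* Hand-written version of the compiler-inserted prologue/epilogue. */
    static size_t protected_copy_len(const char *src)
    {
            unsigned long canary = stack_chk_guard;  /* prologue: place canary */
            char buf[64];
            size_t n;

            snprintf(buf, sizeof(buf), "%s", src);   /* the protected body */
            n = strlen(buf);

            if (canary != stack_chk_guard)           /* epilogue: verify canary */
                    stack_chk_fail();                /* never returns */
            return n;
    }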
  
diff --combined arch/x86/include/asm/pda.h
  #include <linux/stddef.h>
  #include <linux/types.h>
  #include <linux/cache.h>
+ #include <linux/threads.h>
  #include <asm/page.h>
+ #include <asm/percpu.h>
  
  /* Per processor datastructure. %gs points to it while the kernel runs */
  struct x8664_pda {
-       struct task_struct *pcurrent;   /* 0  Current process */
-       unsigned long data_offset;      /* 8 Per cpu data offset from linker
-                                          address */
-       unsigned long kernelstack;      /* 16 top of kernel stack for current */
-       unsigned long oldrsp;           /* 24 user rsp for system call */
-       int irqcount;                   /* 32 Irq nesting counter. Starts -1 */
-       unsigned int cpunumber;         /* 36 Logical CPU number */
+       unsigned long unused1;
+       unsigned long unused2;
+       unsigned long unused3;
+       unsigned long unused4;
+       int unused5;
+       unsigned int unused6;           /* 36 was cpunumber */
 -#ifdef CONFIG_CC_STACKPROTECTOR
        unsigned long stack_canary;     /* 40 stack canary value */
                                        /* gcc-ABI: this canary MUST be at
                                           offset 40!!! */
-       char *irqstackptr;
-       short nodenumber;               /* number of current node (32k max) */
 -#endif
        short in_bootmem;               /* pda lives in bootmem */
-       unsigned int __softirq_pending;
-       unsigned int __nmi_count;       /* number of NMI on this CPUs */
-       short mmu_state;
-       short isidle;
-       struct mm_struct *active_mm;
-       unsigned apic_timer_irqs;
-       unsigned irq0_irqs;
-       unsigned irq_resched_count;
-       unsigned irq_call_count;
-       unsigned irq_tlb_count;
-       unsigned irq_thermal_count;
-       unsigned irq_threshold_count;
-       unsigned irq_spurious_count;
  } ____cacheline_aligned_in_smp;
  
- extern struct x8664_pda **_cpu_pda;
+ DECLARE_PER_CPU(struct x8664_pda, __pda);
  extern void pda_init(int);
  
- #define cpu_pda(i) (_cpu_pda[i])
+ #define cpu_pda(cpu)          (&per_cpu(__pda, cpu))
  
- /*
-  * There is no fast way to get the base address of the PDA, all the accesses
-  * have to mention %fs/%gs.  So it needs to be done this Torvaldian way.
-  */
- extern void __bad_pda_field(void) __attribute__((noreturn));
- /*
-  * proxy_pda doesn't actually exist, but tell gcc it is accessed for
-  * all PDA accesses so it gets read/write dependencies right.
-  */
- extern struct x8664_pda _proxy_pda;
- #define pda_offset(field) offsetof(struct x8664_pda, field)
- #define pda_to_op(op, field, val)                                     \
- do {                                                                  \
-       typedef typeof(_proxy_pda.field) T__;                           \
-       if (0) { T__ tmp__; tmp__ = (val); }    /* type checking */     \
-       switch (sizeof(_proxy_pda.field)) {                             \
-       case 2:                                                         \
-               asm(op "w %1,%%gs:%c2" :                                \
-                   "+m" (_proxy_pda.field) :                           \
-                   "ri" ((T__)val),                                    \
-                   "i"(pda_offset(field)));                            \
-               break;                                                  \
-       case 4:                                                         \
-               asm(op "l %1,%%gs:%c2" :                                \
-                   "+m" (_proxy_pda.field) :                           \
-                   "ri" ((T__)val),                                    \
-                   "i" (pda_offset(field)));                           \
-               break;                                                  \
-       case 8:                                                         \
-               asm(op "q %1,%%gs:%c2":                                 \
-                   "+m" (_proxy_pda.field) :                           \
-                   "ri" ((T__)val),                                    \
-                   "i"(pda_offset(field)));                            \
-               break;                                                  \
-       default:                                                        \
-               __bad_pda_field();                                      \
-       }                                                               \
- } while (0)
- #define pda_from_op(op, field)                        \
- ({                                            \
-       typeof(_proxy_pda.field) ret__;         \
-       switch (sizeof(_proxy_pda.field)) {     \
-       case 2:                                 \
-               asm(op "w %%gs:%c1,%0" :        \
-                   "=r" (ret__) :              \
-                   "i" (pda_offset(field)),    \
-                   "m" (_proxy_pda.field));    \
-               break;                          \
-       case 4:                                 \
-               asm(op "l %%gs:%c1,%0":         \
-                   "=r" (ret__):               \
-                   "i" (pda_offset(field)),    \
-                   "m" (_proxy_pda.field));    \
-               break;                          \
-       case 8:                                 \
-               asm(op "q %%gs:%c1,%0":         \
-                   "=r" (ret__) :              \
-                   "i" (pda_offset(field)),    \
-                   "m" (_proxy_pda.field));    \
-               break;                          \
-       default:                                \
-               __bad_pda_field();              \
-       }                                       \
-       ret__;                                  \
- })
- #define read_pda(field)               pda_from_op("mov", field)
- #define write_pda(field, val) pda_to_op("mov", field, val)
- #define add_pda(field, val)   pda_to_op("add", field, val)
- #define sub_pda(field, val)   pda_to_op("sub", field, val)
- #define or_pda(field, val)    pda_to_op("or", field, val)
+ #define read_pda(field)               percpu_read(__pda.field)
+ #define write_pda(field, val) percpu_write(__pda.field, val)
+ #define add_pda(field, val)   percpu_add(__pda.field, val)
+ #define sub_pda(field, val)   percpu_sub(__pda.field, val)
+ #define or_pda(field, val)    percpu_or(__pda.field, val)
  
  /* This is not atomic against other CPUs -- CPU preemption needs to be off */
  #define test_and_clear_bit_pda(bit, field)                            \
- ({                                                                    \
-       int old__;                                                      \
-       asm volatile("btr %2,%%gs:%c3\n\tsbbl %0,%0"                    \
-                    : "=r" (old__), "+m" (_proxy_pda.field)            \
-                    : "dIr" (bit), "i" (pda_offset(field)) : "memory");\
-       old__;                                                          \
- })
+       x86_test_and_clear_bit_percpu(bit, __pda.field)
  
  #endif
  
- #define PDA_STACKOFFSET (5*8)
 +#define refresh_stack_canary() write_pda(stack_canary, current->stack_canary)
 +
  #endif /* _ASM_X86_PDA_H */
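
A usage sketch (not from the patch; dump_canaries() is a hypothetical helper, assuming <linux/kernel.h>, <linux/percpu.h> and the pda.h above): with the PDA now a per-cpu variable, cpu_pda() resolves to &per_cpu(__pda, cpu) instead of indexing the old _cpu_pda[] pointer array, and read_pda()/write_pda() are thin wrappers around the new percpu accessors.

    static void dump_canaries(void)
    {
            int cpu;

            /* Remote access: ordinary per-cpu addressing, no pointer array. */
            for_each_online_cpu(cpu)
                    printk(KERN_DEBUG "cpu%d canary: %#lx\n",
                           cpu, cpu_pda(cpu)->stack_canary);

            /* Local fast path: a single %gs-relative load via percpu_read(). */
            printk(KERN_DEBUG "this cpu canary: %#lx\n",
                   read_pda(stack_canary));
    }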
diff --combined arch/x86/include/asm/stackprotector.h
index 0000000,0000000..c7f0d10
new file mode 100644
--- /dev/null
--- /dev/null
@@@ -1,0 -1,0 +1,39 @@@
++#ifndef _ASM_STACKPROTECTOR_H
++#define _ASM_STACKPROTECTOR_H 1
++
++#include <asm/tsc.h>
++#include <asm/pda.h>
++
++/*
++ * Initialize the stackprotector canary value.
++ *
++ * NOTE: this must only be called from functions that never return,
++ * and it must always be inlined.
++ */
++static __always_inline void boot_init_stack_canary(void)
++{
++      u64 canary;
++      u64 tsc;
++
++      /*
++       * If we're the non-boot CPU, nothing set the PDA stack
++       * canary up for us - and if we are the boot CPU we have
++       * a 0 stack canary. This is a good place for updating
++       * it, as we won't ever return from this function (so the
++       * invalid canaries already on the stack won't ever
++       * trigger).
++       *
++       * We use both the random pool and the current TSC as a source
++       * of randomness. The TSC only matters for very early init,
++       * where it already has some randomness on most systems. Later
++       * on during the bootup the random pool has true entropy too.
++       */
++      get_random_bytes(&canary, sizeof(canary));
++      tsc = __native_read_tsc();
++      canary += tsc + (tsc << 32UL);
++
++      current->stack_canary = canary;
++      write_pda(stack_canary, canary);
++}
++
++#endif
diff --combined arch/x86/include/asm/system.h
@@@ -94,9 -94,7 +94,9 @@@ do {
             "call __switch_to\n\t"                                       \
             ".globl thread_return\n"                                     \
             "thread_return:\n\t"                                         \
-            "movq %%gs:%P[pda_pcurrent],%%rsi\n\t"                       \
+            "movq "__percpu_arg([current_task])",%%rsi\n\t"              \
 +           "movq %P[task_canary](%%rsi),%%r8\n\t"                       \
 +           "movq %%r8,%%gs:%P[pda_canary]\n\t"                          \
             "movq %P[thread_info](%%rsi),%%r8\n\t"                       \
             LOCK_PREFIX "btr  %[tif_fork],%P[ti_flags](%%r8)\n\t"        \
             "movq %%rax,%%rdi\n\t"                                       \
               [ti_flags] "i" (offsetof(struct thread_info, flags)),      \
               [tif_fork] "i" (TIF_FORK),                                 \
               [thread_info] "i" (offsetof(struct task_struct, stack)),   \
 -             [current_task] "m" (per_cpu_var(current_task))             \
 +             [task_canary] "i" (offsetof(struct task_struct, stack_canary)),\
-              [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)), \
++             [current_task] "m" (per_cpu_var(current_task)),            \
 +             [pda_canary] "i" (offsetof(struct x8664_pda, stack_canary))\
             : "memory", "cc" __EXTRA_CLOBBER)
  #endif
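
A rough C-level equivalent of the two asm lines added to switch_to() above (illustrative only; the helper name is made up, and the real work has to stay in the asm so it happens before any instrumented C code runs on the new task's behalf):

    /* movq %P[task_canary](%%rsi),%%r8 ; movq %%r8,%%gs:%P[pda_canary] */
    static __always_inline void switch_stack_canary(struct task_struct *next)
    {
            /* Copy the incoming task's canary into the PDA slot (%gs:40)
             * that -fstack-protector reads. */
            write_pda(stack_canary, next->stack_canary);
    }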
  
diff --combined arch/x86/kernel/process_64.c
@@@ -16,7 -16,6 +16,7 @@@
  
  #include <stdarg.h>
  
 +#include <linux/stackprotector.h>
  #include <linux/cpu.h>
  #include <linux/errno.h>
  #include <linux/sched.h>
  
  asmlinkage extern void ret_from_fork(void);
  
+ DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
+ EXPORT_PER_CPU_SYMBOL(current_task);
+ DEFINE_PER_CPU(unsigned long, old_rsp);
+ static DEFINE_PER_CPU(unsigned char, is_idle);
  unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
  
  static ATOMIC_NOTIFIER_HEAD(idle_notifier);
@@@ -76,13 -81,13 +82,13 @@@ EXPORT_SYMBOL_GPL(idle_notifier_unregis
  
  void enter_idle(void)
  {
-       write_pda(isidle, 1);
+       percpu_write(is_idle, 1);
        atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
  }
  
  static void __exit_idle(void)
  {
-       if (test_and_clear_bit_pda(0, isidle) == 0)
+       if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
                return;
        atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
  }
@@@ -112,17 -117,6 +118,17 @@@ static inline void play_dead(void
  void cpu_idle(void)
  {
        current_thread_info()->status |= TS_POLLING;
 +
 +      /*
 +       * If we're the non-boot CPU, nothing set the PDA stack
 +       * canary up for us - and if we are the boot CPU we have
 +       * a 0 stack canary. This is a good place for updating
 +       * it, as we won't ever return from this function (so the
 +       * invalid canaries already on the stack won't ever
 +       * trigger):
 +       */
 +      boot_init_stack_canary();
 +
        /* endless idle loop with no priority at all */
        while (1) {
                tick_nohz_stop_sched_tick(1);
@@@ -404,7 -398,7 +410,7 @@@ start_thread(struct pt_regs *regs, unsi
        load_gs_index(0);
        regs->ip                = new_ip;
        regs->sp                = new_sp;
-       write_pda(oldrsp, new_sp);
+       percpu_write(old_rsp, new_sp);
        regs->cs                = __USER_CS;
        regs->ss                = __USER_DS;
        regs->flags             = 0x200;
@@@ -625,14 -619,15 +631,14 @@@ __switch_to(struct task_struct *prev_p
        /*
         * Switch the PDA and FPU contexts.
         */
-       prev->usersp = read_pda(oldrsp);
-       write_pda(oldrsp, next->usersp);
-       write_pda(pcurrent, next_p);
+       prev->usersp = percpu_read(old_rsp);
+       percpu_write(old_rsp, next->usersp);
+       percpu_write(current_task, next_p);
  
-       write_pda(kernelstack,
+       percpu_write(kernel_stack,
                  (unsigned long)task_stack_page(next_p) +
-                 THREAD_SIZE - PDA_STACKOFFSET);
+                 THREAD_SIZE - KERNEL_STACK_OFFSET);
  #ifdef CONFIG_CC_STACKPROTECTOR
 -      write_pda(stack_canary, next_p->stack_canary);
        /*
         * Build time only check to make sure the stack_canary is at
         * offset 40 in the pda; this is a gcc ABI requirement
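
The build-time check referred to in the (truncated) comment above is not visible in this hunk; one way to express such a check is sketched below (assumptions: it lives inside a function such as __switch_to(), with <linux/kernel.h>, <linux/stddef.h> and <asm/pda.h> available; the exact statement in the patch may differ).

    static inline void stack_canary_abi_check(void)   /* hypothetical wrapper */
    {
            /* gcc's x86-64 stackprotector ABI hard-codes the %gs:40 slot. */
            BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
    }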
diff --combined arch/x86/mm/fault.c
@@@ -26,7 -26,6 +26,7 @@@
  #include <linux/kprobes.h>
  #include <linux/uaccess.h>
  #include <linux/kdebug.h>
 +#include <linux/magic.h>
  
  #include <asm/system.h>
  #include <asm/desc.h>
@@@ -535,7 -534,7 +535,7 @@@ static int vmalloc_fault(unsigned long 
           happen within a race in page table update. In the latter
           case just flush. */
  
-       pgd = pgd_offset(current->mm ?: &init_mm, address);
+       pgd = pgd_offset(current->active_mm, address);
        pgd_ref = pgd_offset_k(address);
        if (pgd_none(*pgd_ref))
                return -1;
@@@ -590,8 -589,6 +590,8 @@@ void __kprobes do_page_fault(struct pt_
        unsigned long address;
        int write, si_code;
        int fault;
 +      unsigned long *stackend;
 +
  #ifdef CONFIG_X86_64
        unsigned long flags;
        int sig;
        if (unlikely(in_atomic() || !mm))
                goto bad_area_nosemaphore;
  
- again:
        /*
         * When running in the kernel we expect faults to occur only to
         * addresses in user space.  All other faults represent errors in the
@@@ -845,10 -841,6 +844,10 @@@ no_context
  
        show_fault_oops(regs, error_code, address);
  
 +      stackend = end_of_stack(tsk);
 +      if (*stackend != STACK_END_MAGIC)
 +              printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");
 +
        tsk->thread.cr2 = address;
        tsk->thread.trap_no = 14;
        tsk->thread.error_code = error_code;
        oops_end(flags, regs, sig);
  #endif
  
- /*
-  * We ran out of memory, or some other thing happened to us that made
-  * us unable to handle the page fault gracefully.
-  */
  out_of_memory:
+       /*
+        * We ran out of memory, call the OOM killer, and return to userspace
+        * (which will retry the fault, or kill us if we got oom-killed).
+        */
        up_read(&mm->mmap_sem);
-       if (is_global_init(tsk)) {
-               yield();
-               /*
-                * Re-lookup the vma - in theory the vma tree might
-                * have changed:
-                */
-               goto again;
-       }
-       printk("VM: killing process %s\n", tsk->comm);
-       if (error_code & PF_USER)
-               do_group_exit(SIGKILL);
-       goto no_context;
+       pagefault_out_of_memory();
+       return;
  
  do_sigbus:
        up_read(&mm->mmap_sem);
diff --combined include/linux/magic.h
@@@ -13,6 -13,7 +13,7 @@@
  #define EFS_SUPER_MAGIC               0x414A53
  #define EXT2_SUPER_MAGIC      0xEF53
  #define EXT3_SUPER_MAGIC      0xEF53
+ #define XENFS_SUPER_MAGIC     0xabba1974
  #define EXT4_SUPER_MAGIC      0xEF53
  #define HPFS_SUPER_MAGIC      0xf995e849
  #define ISOFS_SUPER_MAGIC     0x9660
@@@ -46,5 -47,4 +47,5 @@@
  #define FUTEXFS_SUPER_MAGIC   0xBAD1DEA
  #define INOTIFYFS_SUPER_MAGIC 0x2BAD1DEA
  
 +#define STACK_END_MAGIC               0x57AC6E9D
  #endif /* __LINUX_MAGIC_H__ */
diff --combined include/linux/sched.h
@@@ -250,7 -250,7 +250,7 @@@ extern void init_idle_bootup_task(struc
  extern int runqueue_is_locked(void);
  extern void task_rq_unlock_wait(struct task_struct *p);
  
- extern cpumask_t nohz_cpu_mask;
+ extern cpumask_var_t nohz_cpu_mask;
  #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
  extern int select_nohz_load_balancer(int cpu);
  #else
@@@ -284,7 -284,6 +284,6 @@@ long io_schedule_timeout(long timeout)
  
  extern void cpu_init (void);
  extern void trap_init(void);
- extern void account_process_tick(struct task_struct *task, int user);
  extern void update_process_times(int user);
  extern void scheduler_tick(void);
  
@@@ -387,6 -386,9 +386,9 @@@ extern void arch_unmap_area_topdown(str
                (mm)->hiwater_vm = (mm)->total_vm;      \
  } while (0)
  
+ #define get_mm_hiwater_rss(mm)        max((mm)->hiwater_rss, get_mm_rss(mm))
+ #define get_mm_hiwater_vm(mm) max((mm)->hiwater_vm, (mm)->total_vm)
  extern void set_dumpable(struct mm_struct *mm, int value);
  extern int get_dumpable(struct mm_struct *mm);
  
@@@ -758,20 -760,51 +760,51 @@@ enum cpu_idle_type 
  #define SD_SERIALIZE          1024    /* Only a single load balancing instance */
  #define SD_WAKE_IDLE_FAR      2048    /* Gain latency sacrificing cache hit */
  
- #define BALANCE_FOR_MC_POWER  \
-       (sched_smt_power_savings ? SD_POWERSAVINGS_BALANCE : 0)
+ enum powersavings_balance_level {
+       POWERSAVINGS_BALANCE_NONE = 0,  /* No power saving load balance */
+       POWERSAVINGS_BALANCE_BASIC,     /* Fill one thread/core/package
+                                        * first for long running threads
+                                        */
+       POWERSAVINGS_BALANCE_WAKEUP,    /* Also bias task wakeups to semi-idle
+                                        * cpu package for power savings
+                                        */
+       MAX_POWERSAVINGS_BALANCE_LEVELS
+ };
  
- #define BALANCE_FOR_PKG_POWER \
-       ((sched_mc_power_savings || sched_smt_power_savings) ?  \
-        SD_POWERSAVINGS_BALANCE : 0)
+ extern int sched_mc_power_savings, sched_smt_power_savings;
  
- #define test_sd_parent(sd, flag)      ((sd->parent &&         \
-                                        (sd->parent->flags & flag)) ? 1 : 0)
+ static inline int sd_balance_for_mc_power(void)
+ {
+       if (sched_smt_power_savings)
+               return SD_POWERSAVINGS_BALANCE;
  
+       return 0;
+ }
+ static inline int sd_balance_for_package_power(void)
+ {
+       if (sched_mc_power_savings | sched_smt_power_savings)
+               return SD_POWERSAVINGS_BALANCE;
+       return 0;
+ }
+ /*
+  * Optimise SD flags for power savings:
+  * SD_BALANCE_NEWIDLE helps aggressive task consolidation and power savings.
+  * Keep default SD flags if sched_{smt,mc}_power_saving=0
+  */
+ static inline int sd_power_saving_flags(void)
+ {
+       if (sched_mc_power_savings | sched_smt_power_savings)
+               return SD_BALANCE_NEWIDLE;
+       return 0;
+ }
  
  struct sched_group {
        struct sched_group *next;       /* Must be a circular list */
-       cpumask_t cpumask;
  
        /*
         * CPU power of this group, SCHED_LOAD_SCALE being max power for a
         * (see include/linux/reciprocal_div.h)
         */
        u32 reciprocal_cpu_power;
+       unsigned long cpumask[];
  };
  
+ static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
+ {
+       return to_cpumask(sg->cpumask);
+ }
  enum sched_domain_level {
        SD_LV_NONE = 0,
        SD_LV_SIBLING,
@@@ -809,7 -849,6 +849,6 @@@ struct sched_domain 
        struct sched_domain *parent;    /* top domain must be null terminated */
        struct sched_domain *child;     /* bottom domain must be null terminated */
        struct sched_group *groups;     /* the balancing groups of the domain */
-       cpumask_t span;                 /* span of all CPUs in this domain */
        unsigned long min_interval;     /* Minimum balance interval ms */
        unsigned long max_interval;     /* Maximum balance interval ms */
        unsigned int busy_factor;       /* less balancing by factor if busy */
  #ifdef CONFIG_SCHED_DEBUG
        char *name;
  #endif
+       /* span of all CPUs in this domain */
+       unsigned long span[];
  };
  
- extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
+ static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
+ {
+       return to_cpumask(sd->span);
+ }
+ extern void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
                                    struct sched_domain_attr *dattr_new);
- extern int arch_reinit_sched_domains(void);
+ /* Test a flag in parent sched domain */
+ static inline int test_sd_parent(struct sched_domain *sd, int flag)
+ {
+       if (sd->parent && (sd->parent->flags & flag))
+               return 1;
+       return 0;
+ }
  
  #else /* CONFIG_SMP */
  
  struct sched_domain_attr;
  
  static inline void
- partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
+ partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
                        struct sched_domain_attr *dattr_new)
  {
  }
@@@ -926,7 -981,7 +981,7 @@@ struct sched_class 
        void (*task_wake_up) (struct rq *this_rq, struct task_struct *task);
  
        void (*set_cpus_allowed)(struct task_struct *p,
-                                const cpumask_t *newmask);
+                                const struct cpumask *newmask);
  
        void (*rq_online)(struct rq *rq);
        void (*rq_offline)(struct rq *rq);
@@@ -1102,9 -1157,10 +1157,9 @@@ struct task_struct 
        pid_t pid;
        pid_t tgid;
  
 -#ifdef CONFIG_CC_STACKPROTECTOR
        /* Canary value for the -fstack-protector gcc feature */
        unsigned long stack_canary;
 -#endif
 +
        /* 
         * pointers to (original) parent process, youngest child, younger sibling,
         * older sibling, respectively.  (p->father can be replaced with 
@@@ -1578,12 -1634,12 +1633,12 @@@ extern cputime_t task_gtime(struct task
  
  #ifdef CONFIG_SMP
  extern int set_cpus_allowed_ptr(struct task_struct *p,
-                               const cpumask_t *new_mask);
+                               const struct cpumask *new_mask);
  #else
  static inline int set_cpus_allowed_ptr(struct task_struct *p,
-                                      const cpumask_t *new_mask)
+                                      const struct cpumask *new_mask)
  {
-       if (!cpu_isset(0, *new_mask))
+       if (!cpumask_test_cpu(0, new_mask))
                return -EINVAL;
        return 0;
  }
@@@ -1650,16 -1706,16 +1705,16 @@@ extern void wake_up_idle_cpu(int cpu)
  static inline void wake_up_idle_cpu(int cpu) { }
  #endif
  
- #ifdef CONFIG_SCHED_DEBUG
  extern unsigned int sysctl_sched_latency;
  extern unsigned int sysctl_sched_min_granularity;
  extern unsigned int sysctl_sched_wakeup_granularity;
+ extern unsigned int sysctl_sched_shares_ratelimit;
+ extern unsigned int sysctl_sched_shares_thresh;
+ #ifdef CONFIG_SCHED_DEBUG
  extern unsigned int sysctl_sched_child_runs_first;
  extern unsigned int sysctl_sched_features;
  extern unsigned int sysctl_sched_migration_cost;
  extern unsigned int sysctl_sched_nr_migrate;
- extern unsigned int sysctl_sched_shares_ratelimit;
- extern unsigned int sysctl_sched_shares_thresh;
  
  int sched_nr_latency_handler(struct ctl_table *table, int write,
                struct file *file, void __user *buffer, size_t *length,
@@@ -2010,19 -2066,6 +2065,19 @@@ static inline int object_is_on_stack(vo
  
  extern void thread_info_cache_init(void);
  
 +#ifdef CONFIG_DEBUG_STACK_USAGE
 +static inline unsigned long stack_not_used(struct task_struct *p)
 +{
 +      unsigned long *n = end_of_stack(p);
 +
 +      do {    /* Skip over canary */
 +              n++;
 +      } while (!*n);
 +
 +      return (unsigned long)n - (unsigned long)end_of_stack(p);
 +}
 +#endif
 +
  /* set thread flags in other task's structures
   * - see asm/thread_info.h for TIF_xxxx flags available
   */
@@@ -2207,10 -2250,8 +2262,8 @@@ __trace_special(void *__tr, void *__dat
  }
  #endif
  
- extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask);
- extern long sched_getaffinity(pid_t pid, cpumask_t *mask);
- extern int sched_mc_power_savings, sched_smt_power_savings;
+ extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
+ extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
  
  extern void normalize_rt_tasks(void);
  
diff --combined init/main.c
@@@ -14,7 -14,6 +14,7 @@@
  #include <linux/proc_fs.h>
  #include <linux/kernel.h>
  #include <linux/syscalls.h>
 +#include <linux/stackprotector.h>
  #include <linux/string.h>
  #include <linux/ctype.h>
  #include <linux/delay.h>
@@@ -51,7 -50,6 +51,6 @@@
  #include <linux/rmap.h>
  #include <linux/mempolicy.h>
  #include <linux/key.h>
- #include <linux/unwind.h>
  #include <linux/buffer_head.h>
  #include <linux/page_cgroup.h>
  #include <linux/debug_locks.h>
@@@ -64,6 -62,7 +63,7 @@@
  #include <linux/signal.h>
  #include <linux/idr.h>
  #include <linux/ftrace.h>
+ #include <linux/async.h>
  #include <trace/boot.h>
  
  #include <asm/io.h>
  #include <asm/smp.h>
  #endif
  
- /*
-  * This is one of the first .c files built. Error out early if we have compiler
-  * trouble.
-  */
- #if __GNUC__ == 4 && __GNUC_MINOR__ == 1 && __GNUC_PATCHLEVEL__ == 0
- #warning gcc-4.1.0 is known to miscompile the kernel.  A different compiler version is recommended.
- #endif
  static int kernel_init(void *);
  
  extern void init_IRQ(void);
@@@ -118,7 -108,7 +109,7 @@@ EXPORT_SYMBOL(system_state)
  
  extern void time_init(void);
  /* Default late time init is NULL. archs can override this later. */
- void (*late_time_init)(void);
+ void (*__initdata late_time_init)(void);
  extern void softirq_init(void);
  
  /* Untouched command line saved by arch-specific code. */
@@@ -381,12 -371,7 +372,7 @@@ EXPORT_SYMBOL(nr_cpu_ids)
  /* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */
  static void __init setup_nr_cpu_ids(void)
  {
-       int cpu, highest_cpu = 0;
-       for_each_possible_cpu(cpu)
-               highest_cpu = cpu;
-       nr_cpu_ids = highest_cpu + 1;
+       nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1;
  }
  
  #ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
@@@ -462,7 -447,7 +448,7 @@@ static void __init setup_command_line(c
   * gcc-3.4 accidentally inlines this function, so use noinline.
   */
  
- static void noinline __init_refok rest_init(void)
+ static noinline void __init_refok rest_init(void)
        __releases(kernel_lock)
  {
        int pid;
@@@ -528,9 -513,9 +514,9 @@@ static void __init boot_cpu_init(void
  {
        int cpu = smp_processor_id();
        /* Mark the boot cpu "present", "online" etc for SMP and UP case */
-       cpu_set(cpu, cpu_online_map);
-       cpu_set(cpu, cpu_present_map);
-       cpu_set(cpu, cpu_possible_map);
+       set_cpu_online(cpu, true);
+       set_cpu_present(cpu, true);
+       set_cpu_possible(cpu, true);
  }
  
  void __init __weak smp_setup_processor_id(void)
@@@ -541,15 -526,6 +527,6 @@@ void __init __weak thread_info_cache_in
  {
  }
  
- void __init __weak arch_early_irq_init(void)
- {
- }
- void __init __weak early_irq_init(void)
- {
-       arch_early_irq_init();
- }
  asmlinkage void __init start_kernel(void)
  {
        char * command_line;
         * Need to run as early as possible, to initialize the
         * lockdep hash:
         */
-       unwind_init();
        lockdep_init();
        debug_objects_early_init();
 +
 +      /*
 +       * Set up the initial canary ASAP:
 +       */
 +      boot_init_stack_canary();
 +
        cgroup_init_early();
  
        local_irq_disable();
        setup_arch(&command_line);
        mm_init_owner(&init_mm, &init_task);
        setup_command_line(command_line);
-       unwind_setup();
        setup_per_cpu_areas();
        setup_nr_cpu_ids();
        smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
        sched_clock_init();
        profile_init();
        if (!irqs_disabled())
-               printk("start_kernel(): bug: interrupts were enabled early\n");
+               printk(KERN_CRIT "start_kernel(): bug: interrupts were "
+                                "enabled early\n");
        early_boot_irqs_on();
        local_irq_enable();
  
        rest_init();
  }
  
- static int initcall_debug;
+ int initcall_debug;
  core_param(initcall_debug, initcall_debug, bool, 0644);
  
  int do_one_initcall(initcall_t fn)
@@@ -816,8 -785,10 +792,10 @@@ static void run_init_process(char *init
  /* This is a non __init function. Force it to be noinline otherwise gcc
   * makes it inline to init() and it becomes part of init.text section
   */
- static int noinline init_post(void)
+ static noinline int init_post(void)
  {
+       /* need to finish all async __init code before freeing the memory */
+       async_synchronize_full();
        free_initmem();
        unlock_kernel();
        mark_rodata_ro();
diff --combined kernel/exit.c
@@@ -642,35 -642,31 +642,31 @@@ retry
        /*
         * We found no owner yet mm_users > 1: this implies that we are
         * most likely racing with swapoff (try_to_unuse()) or /proc or
-        * ptrace or page migration (get_task_mm()).  Mark owner as NULL,
-        * so that subsystems can understand the callback and take action.
+        * ptrace or page migration (get_task_mm()).  Mark owner as NULL.
         */
-       down_write(&mm->mmap_sem);
-       cgroup_mm_owner_callbacks(mm->owner, NULL);
        mm->owner = NULL;
-       up_write(&mm->mmap_sem);
        return;
  
  assign_new_owner:
        BUG_ON(c == p);
        get_task_struct(c);
-       read_unlock(&tasklist_lock);
-       down_write(&mm->mmap_sem);
        /*
         * The task_lock protects c->mm from changing.
         * We always want mm->owner->mm == mm
         */
        task_lock(c);
+       /*
+        * Delay read_unlock() till we have the task_lock()
+        * to ensure that c does not slip away underneath us
+        */
+       read_unlock(&tasklist_lock);
        if (c->mm != mm) {
                task_unlock(c);
-               up_write(&mm->mmap_sem);
                put_task_struct(c);
                goto retry;
        }
-       cgroup_mm_owner_callbacks(mm->owner, c);
        mm->owner = c;
        task_unlock(c);
-       up_write(&mm->mmap_sem);
        put_task_struct(c);
  }
  #endif /* CONFIG_MM_OWNER */
@@@ -981,9 -977,12 +977,9 @@@ static void check_stack_usage(void
  {
        static DEFINE_SPINLOCK(low_water_lock);
        static int lowest_to_date = THREAD_SIZE;
 -      unsigned long *n = end_of_stack(current);
        unsigned long free;
  
 -      while (*n == 0)
 -              n++;
 -      free = (unsigned long)n - (unsigned long)end_of_stack(current);
 +      free = stack_not_used(current);
  
        if (free >= lowest_to_date)
                return;
@@@ -1052,10 -1051,7 +1048,7 @@@ NORET_TYPE void do_exit(long code
                                preempt_count());
  
        acct_update_integrals(tsk);
-       if (tsk->mm) {
-               update_hiwater_rss(tsk->mm);
-               update_hiwater_vm(tsk->mm);
-       }
        group_dead = atomic_dec_and_test(&tsk->signal->live);
        if (group_dead) {
                hrtimer_cancel(&tsk->signal->real_timer);
diff --combined kernel/fork.c
@@@ -61,7 -61,6 +61,7 @@@
  #include <linux/proc_fs.h>
  #include <linux/blkdev.h>
  #include <trace/sched.h>
 +#include <linux/magic.h>
  
  #include <asm/pgtable.h>
  #include <asm/pgalloc.h>
@@@ -213,8 -212,6 +213,8 @@@ static struct task_struct *dup_task_str
  {
        struct task_struct *tsk;
        struct thread_info *ti;
 +      unsigned long *stackend;
 +
        int err;
  
        prepare_to_copy(orig);
                goto out;
  
        setup_thread_stack(tsk, orig);
 +      stackend = end_of_stack(tsk);
 +      *stackend = STACK_END_MAGIC;    /* for overflow detection */
  
  #ifdef CONFIG_CC_STACKPROTECTOR
        tsk->stack_canary = get_random_int();
@@@ -405,6 -400,18 +405,18 @@@ __cacheline_aligned_in_smp DEFINE_SPINL
  #define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
  #define free_mm(mm)   (kmem_cache_free(mm_cachep, (mm)))
  
+ static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;
+ static int __init coredump_filter_setup(char *s)
+ {
+       default_dump_filter =
+               (simple_strtoul(s, NULL, 0) << MMF_DUMP_FILTER_SHIFT) &
+               MMF_DUMP_FILTER_MASK;
+       return 1;
+ }
+ __setup("coredump_filter=", coredump_filter_setup);
  #include <linux/init_task.h>
  
  static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
        atomic_set(&mm->mm_count, 1);
        init_rwsem(&mm->mmap_sem);
        INIT_LIST_HEAD(&mm->mmlist);
-       mm->flags = (current->mm) ? current->mm->flags
-                                 : MMF_DUMP_FILTER_DEFAULT;
+       mm->flags = (current->mm) ? current->mm->flags : default_dump_filter;
        mm->core_state = NULL;
        mm->nr_ptes = 0;
        set_mm_counter(mm, file_rss, 0);
@@@ -763,7 -769,7 +774,7 @@@ static int copy_sighand(unsigned long c
  {
        struct sighand_struct *sig;
  
-       if (clone_flags & (CLONE_SIGHAND | CLONE_THREAD)) {
+       if (clone_flags & CLONE_SIGHAND) {
                atomic_inc(&current->sighand->count);
                return 0;
        }
@@@ -1120,12 -1126,12 +1131,12 @@@ static struct task_struct *copy_process
  
        if (pid != &init_struct_pid) {
                retval = -ENOMEM;
-               pid = alloc_pid(task_active_pid_ns(p));
+               pid = alloc_pid(p->nsproxy->pid_ns);
                if (!pid)
                        goto bad_fork_cleanup_io;
  
                if (clone_flags & CLONE_NEWPID) {
-                       retval = pid_ns_prepare_proc(task_active_pid_ns(p));
+                       retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
                        if (retval < 0)
                                goto bad_fork_free_pid;
                }
@@@ -1475,12 -1481,10 +1486,10 @@@ void __init proc_caches_init(void
        fs_cachep = kmem_cache_create("fs_cache",
                        sizeof(struct fs_struct), 0,
                        SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
-       vm_area_cachep = kmem_cache_create("vm_area_struct",
-                       sizeof(struct vm_area_struct), 0,
-                       SLAB_PANIC, NULL);
        mm_cachep = kmem_cache_create("mm_struct",
                        sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
                        SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+       mmap_init();
  }
  
  /*
diff --combined kernel/panic.c
@@@ -74,9 -74,6 +74,9 @@@ NORET_TYPE void panic(const char * fmt
        vsnprintf(buf, sizeof(buf), fmt, args);
        va_end(args);
        printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf);
 +#ifdef CONFIG_DEBUG_BUGVERBOSE
 +      dump_stack();
 +#endif
        bust_spinlocks(0);
  
        /*
@@@ -302,6 -299,8 +302,8 @@@ static int init_oops_id(void
  {
        if (!oops_id)
                get_random_bytes(&oops_id, sizeof(oops_id));
+       else
+               oops_id++;
  
        return 0;
  }
@@@ -356,22 -355,15 +358,22 @@@ EXPORT_SYMBOL(warn_slowpath)
  #endif
  
  #ifdef CONFIG_CC_STACKPROTECTOR
 +
 +#ifndef GCC_HAS_SP
 +#warning You have selected the CONFIG_CC_STACKPROTECTOR option, but the gcc used does not support this.
 +#endif
 +
  /*
   * Called when gcc's -fstack-protector feature is used, and
   * gcc detects corruption of the on-stack canary value
   */
  void __stack_chk_fail(void)
  {
 -      panic("stack-protector: Kernel stack is corrupted");
 +      panic("stack-protector: Kernel stack is corrupted in: %p\n",
 +              __builtin_return_address(0));
  }
  EXPORT_SYMBOL(__stack_chk_fail);
 +
  #endif
  
  core_param(panic, panic_timeout, int, 0644);
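
For illustration only (the function below is hypothetical and deliberately buggy, not part of the patch): with CONFIG_CC_STACKPROTECTOR enabled, an overflow like this clobbers the canary placed between the buffer and the return address, and the compiler-emitted epilogue check lands in the __stack_chk_fail() handler above, which now panics with the corrupted caller's address.

    /* Assumes <linux/string.h> and <linux/compiler.h>. */
    static noinline void stackprotector_selftest(void)
    {
            char buf[16];

            /* Deliberate out-of-bounds write: overruns buf and the canary. */
            memset(buf, 0x41, 64);
    }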
diff --combined kernel/sched.c
@@@ -125,6 -125,9 +125,9 @@@ DEFINE_TRACE(sched_switch)
  DEFINE_TRACE(sched_migrate_task);
  
  #ifdef CONFIG_SMP
+ static void double_rq_lock(struct rq *rq1, struct rq *rq2);
  /*
   * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
   * Since cpu_power is a 'constant', we can use a reciprocal divide.
@@@ -498,18 -501,26 +501,26 @@@ struct rt_rq 
   */
  struct root_domain {
        atomic_t refcount;
-       cpumask_t span;
-       cpumask_t online;
+       cpumask_var_t span;
+       cpumask_var_t online;
  
        /*
         * The "RT overload" flag: it gets set if a CPU has more than
         * one runnable RT task.
         */
-       cpumask_t rto_mask;
+       cpumask_var_t rto_mask;
        atomic_t rto_count;
  #ifdef CONFIG_SMP
        struct cpupri cpupri;
  #endif
+ #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+       /*
+        * Preferred wake up cpu nominated by sched_mc balance that will be
+        * used when most cpus are idle in the system indicating overall very
+        * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2)
+        */
+       unsigned int sched_mc_preferred_wakeup_cpu;
+ #endif
  };
  
  /*
@@@ -1514,7 -1525,7 +1525,7 @@@ static int tg_shares_up(struct task_gro
        struct sched_domain *sd = data;
        int i;
  
-       for_each_cpu_mask(i, sd->span) {
+       for_each_cpu(i, sched_domain_span(sd)) {
                /*
                 * If there are currently no tasks on the cpu pretend there
                 * is one of average load so that when a new task gets to
        if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
                shares = tg->shares;
  
-       for_each_cpu_mask(i, sd->span)
+       for_each_cpu(i, sched_domain_span(sd))
                update_group_shares_cpu(tg, i, shares, rq_weight);
  
        return 0;
@@@ -2101,15 -2112,17 +2112,17 @@@ find_idlest_group(struct sched_domain *
                int i;
  
                /* Skip over this group if it has no CPUs allowed */
-               if (!cpus_intersects(group->cpumask, p->cpus_allowed))
+               if (!cpumask_intersects(sched_group_cpus(group),
+                                       &p->cpus_allowed))
                        continue;
  
-               local_group = cpu_isset(this_cpu, group->cpumask);
+               local_group = cpumask_test_cpu(this_cpu,
+                                              sched_group_cpus(group));
  
                /* Tally up the load of all CPUs in the group */
                avg_load = 0;
  
-               for_each_cpu_mask_nr(i, group->cpumask) {
+               for_each_cpu(i, sched_group_cpus(group)) {
                        /* Bias balancing toward cpus of our domain */
                        if (local_group)
                                load = source_load(i, load_idx);
   * find_idlest_cpu - find the idlest cpu among the cpus in group.
   */
  static int
- find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu,
-               cpumask_t *tmp)
+ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
  {
        unsigned long load, min_load = ULONG_MAX;
        int idlest = -1;
        int i;
  
        /* Traverse only the allowed CPUs */
-       cpus_and(*tmp, group->cpumask, p->cpus_allowed);
-       for_each_cpu_mask_nr(i, *tmp) {
+       for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
                load = weighted_cpuload(i);
  
                if (load < min_load || (load == min_load && i == this_cpu)) {
@@@ -2193,7 -2203,6 +2203,6 @@@ static int sched_balance_self(int cpu, 
                update_shares(sd);
  
        while (sd) {
-               cpumask_t span, tmpmask;
                struct sched_group *group;
                int new_cpu, weight;
  
                        continue;
                }
  
-               span = sd->span;
                group = find_idlest_group(sd, t, cpu);
                if (!group) {
                        sd = sd->child;
                        continue;
                }
  
-               new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask);
+               new_cpu = find_idlest_cpu(group, t, cpu);
                if (new_cpu == -1 || new_cpu == cpu) {
                        /* Now try balancing at a lower domain level of cpu */
                        sd = sd->child;
  
                /* Now try balancing at a lower domain level of new_cpu */
                cpu = new_cpu;
+               weight = cpumask_weight(sched_domain_span(sd));
                sd = NULL;
-               weight = cpus_weight(span);
                for_each_domain(cpu, tmp) {
-                       if (weight <= cpus_weight(tmp->span))
+                       if (weight <= cpumask_weight(sched_domain_span(tmp)))
                                break;
                        if (tmp->flags & flag)
                                sd = tmp;
@@@ -2266,7 -2274,7 +2274,7 @@@ static int try_to_wake_up(struct task_s
                cpu = task_cpu(p);
  
                for_each_domain(this_cpu, sd) {
-                       if (cpu_isset(cpu, sd->span)) {
+                       if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
                                update_shares(sd);
                                break;
                        }
        else {
                struct sched_domain *sd;
                for_each_domain(this_cpu, sd) {
-                       if (cpu_isset(cpu, sd->span)) {
+                       if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
                                schedstat_inc(sd, ttwu_wake_remote);
                                break;
                        }
@@@ -2846,7 -2854,7 +2854,7 @@@ static void sched_migrate_task(struct t
        struct rq *rq;
  
        rq = task_rq_lock(p, &flags);
-       if (!cpu_isset(dest_cpu, p->cpus_allowed)
+       if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
            || unlikely(!cpu_active(dest_cpu)))
                goto out;
  
@@@ -2911,7 -2919,7 +2919,7 @@@ int can_migrate_task(struct task_struc
         * 2) cannot be migrated to this CPU due to cpus_allowed, or
         * 3) are cache-hot on their current CPU.
         */
-       if (!cpu_isset(this_cpu, p->cpus_allowed)) {
+       if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
                schedstat_inc(p, se.nr_failed_migrations_affine);
                return 0;
        }
@@@ -3086,7 -3094,7 +3094,7 @@@ static int move_one_task(struct rq *thi
  static struct sched_group *
  find_busiest_group(struct sched_domain *sd, int this_cpu,
                   unsigned long *imbalance, enum cpu_idle_type idle,
-                  int *sd_idle, const cpumask_t *cpus, int *balance)
+                  int *sd_idle, const struct cpumask *cpus, int *balance)
  {
        struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
        unsigned long max_load, avg_load, total_load, this_load, total_pwr;
                unsigned long sum_avg_load_per_task;
                unsigned long avg_load_per_task;
  
-               local_group = cpu_isset(this_cpu, group->cpumask);
+               local_group = cpumask_test_cpu(this_cpu,
+                                              sched_group_cpus(group));
  
                if (local_group)
-                       balance_cpu = first_cpu(group->cpumask);
+                       balance_cpu = cpumask_first(sched_group_cpus(group));
  
                /* Tally up the load of all CPUs in the group */
                sum_weighted_load = sum_nr_running = avg_load = 0;
                max_cpu_load = 0;
                min_cpu_load = ~0UL;
  
-               for_each_cpu_mask_nr(i, group->cpumask) {
-                       struct rq *rq;
-                       if (!cpu_isset(i, *cpus))
-                               continue;
-                       rq = cpu_rq(i);
+               for_each_cpu_and(i, sched_group_cpus(group), cpus) {
+                       struct rq *rq = cpu_rq(i);
  
                        if (*sd_idle && rq->nr_running)
                                *sd_idle = 0;
                 */
                if ((sum_nr_running < min_nr_running) ||
                    (sum_nr_running == min_nr_running &&
-                    first_cpu(group->cpumask) <
-                    first_cpu(group_min->cpumask))) {
+                    cpumask_first(sched_group_cpus(group)) >
+                    cpumask_first(sched_group_cpus(group_min)))) {
                        group_min = group;
                        min_nr_running = sum_nr_running;
                        min_load_per_task = sum_weighted_load /
                if (sum_nr_running <= group_capacity - 1) {
                        if (sum_nr_running > leader_nr_running ||
                            (sum_nr_running == leader_nr_running &&
-                            first_cpu(group->cpumask) >
-                             first_cpu(group_leader->cpumask))) {
+                            cpumask_first(sched_group_cpus(group)) <
+                            cpumask_first(sched_group_cpus(group_leader)))) {
                                group_leader = group;
                                leader_nr_running = sum_nr_running;
                        }
@@@ -3394,6 -3398,10 +3398,10 @@@ out_balanced
  
        if (this == group_leader && group_leader != group_min) {
                *imbalance = min_load_per_task;
+               if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
+                       cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
+                               cpumask_first(sched_group_cpus(group_leader));
+               }
                return group_min;
        }
  #endif
   */
  static struct rq *
  find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
-                  unsigned long imbalance, const cpumask_t *cpus)
+                  unsigned long imbalance, const struct cpumask *cpus)
  {
        struct rq *busiest = NULL, *rq;
        unsigned long max_load = 0;
        int i;
  
-       for_each_cpu_mask_nr(i, group->cpumask) {
+       for_each_cpu(i, sched_group_cpus(group)) {
                unsigned long wl;
  
-               if (!cpu_isset(i, *cpus))
+               if (!cpumask_test_cpu(i, cpus))
                        continue;
  
                rq = cpu_rq(i);
   */
  static int load_balance(int this_cpu, struct rq *this_rq,
                        struct sched_domain *sd, enum cpu_idle_type idle,
-                       int *balance, cpumask_t *cpus)
+                       int *balance, struct cpumask *cpus)
  {
        int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
        struct sched_group *group;
        struct rq *busiest;
        unsigned long flags;
  
-       cpus_setall(*cpus);
+       cpumask_setall(cpus);
  
        /*
         * When power savings policy is enabled for the parent domain, idle
@@@ -3514,8 -3522,8 +3522,8 @@@ redo
  
                /* All tasks on this runqueue were pinned by CPU affinity */
                if (unlikely(all_pinned)) {
-                       cpu_clear(cpu_of(busiest), *cpus);
-                       if (!cpus_empty(*cpus))
+                       cpumask_clear_cpu(cpu_of(busiest), cpus);
+                       if (!cpumask_empty(cpus))
                                goto redo;
                        goto out_balanced;
                }
                        /* don't kick the migration_thread, if the curr
                         * task on busiest cpu can't be moved to this_cpu
                         */
-                       if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
+                       if (!cpumask_test_cpu(this_cpu,
+                                             &busiest->curr->cpus_allowed)) {
                                spin_unlock_irqrestore(&busiest->lock, flags);
                                all_pinned = 1;
                                goto out_one_pinned;
@@@ -3607,7 -3616,7 +3616,7 @@@ out
   */
  static int
  load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
-                       cpumask_t *cpus)
+                       struct cpumask *cpus)
  {
        struct sched_group *group;
        struct rq *busiest = NULL;
        int sd_idle = 0;
        int all_pinned = 0;
  
-       cpus_setall(*cpus);
+       cpumask_setall(cpus);
  
        /*
         * When power savings policy is enabled for the parent domain, idle
@@@ -3660,17 -3669,76 +3669,76 @@@ redo
                double_unlock_balance(this_rq, busiest);
  
                if (unlikely(all_pinned)) {
-                       cpu_clear(cpu_of(busiest), *cpus);
-                       if (!cpus_empty(*cpus))
+                       cpumask_clear_cpu(cpu_of(busiest), cpus);
+                       if (!cpumask_empty(cpus))
                                goto redo;
                }
        }
  
        if (!ld_moved) {
+               int active_balance = 0;
                schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
                if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
                    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
                        return -1;
+               if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
+                       return -1;
+               if (sd->nr_balance_failed++ < 2)
+                       return -1;
+               /*
+                * The only task running in a non-idle cpu can be moved to this
+                * cpu in an attempt to completely free up the other CPU
+                * package. The same method used to move tasks in load_balance()
+                * has been extended for load_balance_newidle() to speed up
+                * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2)
+                *
+                * The package power saving logic comes from
+                * find_busiest_group().  If there are no imbalance, then
+                * find_busiest_group().  If there is no imbalance, then
+                * f_b_g() will select a group from which a running task may be
+                * pulled to this cpu in order to make the other package idle.
+                * If there is no opportunity to make a package idle and if
+                * there is no imbalance, then f_b_g() will return NULL and no
+                * action will be taken in load_balance_newidle().
+                *
+                * Under normal task pull operation due to imbalance, there
+                * will be more than one task in the source run queue and
+                * move_tasks() will succeed.  ld_moved will be true and this
+                * active balance code will not be triggered.
+                */
+               /* Lock busiest in correct order while this_rq is held */
+               double_lock_balance(this_rq, busiest);
+               /*
+                * don't kick the migration_thread, if the curr
+                * task on busiest cpu can't be moved to this_cpu
+                */
+               if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
+                       double_unlock_balance(this_rq, busiest);
+                       all_pinned = 1;
+                       return ld_moved;
+               }
+               if (!busiest->active_balance) {
+                       busiest->active_balance = 1;
+                       busiest->push_cpu = this_cpu;
+                       active_balance = 1;
+               }
+               double_unlock_balance(this_rq, busiest);
+               /*
+                * Should not call ttwu while holding a rq->lock
+                */
+               spin_unlock(&this_rq->lock);
+               if (active_balance)
+                       wake_up_process(busiest->migration_thread);
+               spin_lock(&this_rq->lock);
        } else
                sd->nr_balance_failed = 0;
  
@@@ -3696,7 -3764,10 +3764,10 @@@ static void idle_balance(int this_cpu, 
        struct sched_domain *sd;
        int pulled_task = 0;
        unsigned long next_balance = jiffies + HZ;
-       cpumask_t tmpmask;
+       cpumask_var_t tmpmask;
+       if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC))
+               return;
  
        for_each_domain(this_cpu, sd) {
                unsigned long interval;
                if (sd->flags & SD_BALANCE_NEWIDLE)
                        /* If we've pulled tasks over stop searching: */
                        pulled_task = load_balance_newidle(this_cpu, this_rq,
-                                                          sd, &tmpmask);
+                                                          sd, tmpmask);
  
                interval = msecs_to_jiffies(sd->balance_interval);
                if (time_after(next_balance, sd->last_balance + interval))
                 */
                this_rq->next_balance = next_balance;
        }
+       free_cpumask_var(tmpmask);
  }
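idle_balance() above also shows the standard replacement for an on-stack cpumask_t: a cpumask_var_t that has to be allocated before use and freed afterwards. A minimal sketch of that pattern (hypothetical helper, assuming <linux/cpumask.h>; when CONFIG_CPUMASK_OFFSTACK is not set the alloc/free calls are no-ops and the mask really does live on the stack):

        static void cpumask_var_example(void)
        {
                cpumask_var_t mask;

                if (!alloc_cpumask_var(&mask, GFP_ATOMIC))
                        return;         /* the allocation can fail */

                cpumask_copy(mask, cpu_online_mask);
                /* ... use the cpumask_* helpers on 'mask' ... */

                free_cpumask_var(mask);
        }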
  
  /*
@@@ -3759,7 -3831,7 +3831,7 @@@ static void active_load_balance(struct 
        /* Search for an sd spanning us and the target CPU. */
        for_each_domain(target_cpu, sd) {
                if ((sd->flags & SD_LOAD_BALANCE) &&
-                   cpu_isset(busiest_cpu, sd->span))
+                   cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
                                break;
        }
  
  #ifdef CONFIG_NO_HZ
  static struct {
        atomic_t load_balancer;
-       cpumask_t cpu_mask;
+       cpumask_var_t cpu_mask;
  } nohz ____cacheline_aligned = {
        .load_balancer = ATOMIC_INIT(-1),
-       .cpu_mask = CPU_MASK_NONE,
  };
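With nohz.cpu_mask now a cpumask_var_t, the static CPU_MASK_NONE initializer is gone and the mask must be set up at boot before any of the nohz code runs. The allocation site is outside the hunks shown here; presumably something along these lines early in scheduler init:

        #ifdef CONFIG_NO_HZ
                alloc_bootmem_cpumask_var(&nohz.cpu_mask);  /* assumed placement, e.g. in sched_init() */
        #endif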
  
  /*
@@@ -3809,7 -3880,7 +3880,7 @@@ int select_nohz_load_balancer(int stop_
        int cpu = smp_processor_id();
  
        if (stop_tick) {
-               cpu_set(cpu, nohz.cpu_mask);
+               cpumask_set_cpu(cpu, nohz.cpu_mask);
                cpu_rq(cpu)->in_nohz_recently = 1;
  
                /*
                }
  
                /* time for ilb owner also to sleep */
-               if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
+               if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
                        if (atomic_read(&nohz.load_balancer) == cpu)
                                atomic_set(&nohz.load_balancer, -1);
                        return 0;
                } else if (atomic_read(&nohz.load_balancer) == cpu)
                        return 1;
        } else {
-               if (!cpu_isset(cpu, nohz.cpu_mask))
+               if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
                        return 0;
  
-               cpu_clear(cpu, nohz.cpu_mask);
+               cpumask_clear_cpu(cpu, nohz.cpu_mask);
  
                if (atomic_read(&nohz.load_balancer) == cpu)
                        if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
@@@ -3867,7 -3938,11 +3938,11 @@@ static void rebalance_domains(int cpu, 
        unsigned long next_balance = jiffies + 60*HZ;
        int update_next_balance = 0;
        int need_serialize;
-       cpumask_t tmp;
+       cpumask_var_t tmp;
+       /* Fails alloc?  Rebalancing probably not a priority right now. */
+       if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
+               return;
  
        for_each_domain(cpu, sd) {
                if (!(sd->flags & SD_LOAD_BALANCE))
                }
  
                if (time_after_eq(jiffies, sd->last_balance + interval)) {
-                       if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) {
+                       if (load_balance(cpu, rq, sd, idle, &balance, tmp)) {
                                /*
                                 * We've pulled tasks over so either we're no
                                 * longer idle, or one of our SMT siblings is
@@@ -3926,6 -4001,8 +4001,8 @@@ out
         */
        if (likely(update_next_balance))
                rq->next_balance = next_balance;
+       free_cpumask_var(tmp);
  }
  
  /*
@@@ -3950,12 -4027,13 +4027,13 @@@ static void run_rebalance_domains(struc
         */
        if (this_rq->idle_at_tick &&
            atomic_read(&nohz.load_balancer) == this_cpu) {
-               cpumask_t cpus = nohz.cpu_mask;
                struct rq *rq;
                int balance_cpu;
  
-               cpu_clear(this_cpu, cpus);
-               for_each_cpu_mask_nr(balance_cpu, cpus) {
+               for_each_cpu(balance_cpu, nohz.cpu_mask) {
+                       if (balance_cpu == this_cpu)
+                               continue;
                        /*
                         * If this cpu gets work to do, stop the load balancing
                         * work being done for other cpus. Next load
@@@ -3993,7 -4071,7 +4071,7 @@@ static inline void trigger_load_balance
                rq->in_nohz_recently = 0;
  
                if (atomic_read(&nohz.load_balancer) == cpu) {
-                       cpu_clear(cpu, nohz.cpu_mask);
+                       cpumask_clear_cpu(cpu, nohz.cpu_mask);
                        atomic_set(&nohz.load_balancer, -1);
                }
  
                         * TBD: Traverse the sched domains and nominate
                         * the nearest cpu in the nohz.cpu_mask.
                         */
-                       int ilb = first_cpu(nohz.cpu_mask);
+                       int ilb = cpumask_first(nohz.cpu_mask);
  
                        if (ilb < nr_cpu_ids)
                                resched_cpu(ilb);
         * cpus with ticks stopped, is it time for that to stop?
         */
        if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
-           cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
+           cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
                resched_cpu(cpu);
                return;
        }
         * someone else, then there is no need to raise the SCHED_SOFTIRQ
         */
        if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
-           cpu_isset(cpu, nohz.cpu_mask))
+           cpumask_test_cpu(cpu, nohz.cpu_mask))
                return;
  #endif
        if (time_after_eq(jiffies, rq->next_balance))
@@@ -4080,13 -4158,17 +4158,17 @@@ unsigned long long task_delta_exec(stru
   * Account user cpu time to a process.
   * @p: the process that the cpu time gets accounted to
   * @cputime: the cpu time spent in user space since the last update
+  * @cputime_scaled: cputime scaled by cpu frequency
   */
- void account_user_time(struct task_struct *p, cputime_t cputime)
+ void account_user_time(struct task_struct *p, cputime_t cputime,
+                      cputime_t cputime_scaled)
  {
        struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
        cputime64_t tmp;
  
+       /* Add user time to process. */
        p->utime = cputime_add(p->utime, cputime);
+       p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
        account_group_user_time(p, cputime);
  
        /* Add user time to cpustat. */
   * Account guest cpu time to a process.
   * @p: the process that the cpu time gets accounted to
   * @cputime: the cpu time spent in virtual machine since the last update
+  * @cputime_scaled: cputime scaled by cpu frequency
   */
- static void account_guest_time(struct task_struct *p, cputime_t cputime)
+ static void account_guest_time(struct task_struct *p, cputime_t cputime,
+                              cputime_t cputime_scaled)
  {
        cputime64_t tmp;
        struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
  
        tmp = cputime_to_cputime64(cputime);
  
+       /* Add guest time to process. */
        p->utime = cputime_add(p->utime, cputime);
+       p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
        account_group_user_time(p, cputime);
        p->gtime = cputime_add(p->gtime, cputime);
  
+       /* Add guest time to cpustat. */
        cpustat->user = cputime64_add(cpustat->user, tmp);
        cpustat->guest = cputime64_add(cpustat->guest, tmp);
  }
  
  /*
-  * Account scaled user cpu time to a process.
-  * @p: the process that the cpu time gets accounted to
-  * @cputime: the cpu time spent in user space since the last update
-  */
- void account_user_time_scaled(struct task_struct *p, cputime_t cputime)
- {
-       p->utimescaled = cputime_add(p->utimescaled, cputime);
- }
- /*
   * Account system cpu time to a process.
   * @p: the process that the cpu time gets accounted to
   * @hardirq_offset: the offset to subtract from hardirq_count()
   * @cputime: the cpu time spent in kernel space since the last update
+  * @cputime_scaled: cputime scaled by cpu frequency
   */
  void account_system_time(struct task_struct *p, int hardirq_offset,
-                        cputime_t cputime)
+                        cputime_t cputime, cputime_t cputime_scaled)
  {
        struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-       struct rq *rq = this_rq();
        cputime64_t tmp;
  
        if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
-               account_guest_time(p, cputime);
+               account_guest_time(p, cputime, cputime_scaled);
                return;
        }
  
+       /* Add system time to process. */
        p->stime = cputime_add(p->stime, cputime);
+       p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
        account_group_system_time(p, cputime);
  
        /* Add system time to cpustat. */
                cpustat->irq = cputime64_add(cpustat->irq, tmp);
        else if (softirq_count())
                cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
-       else if (p != rq->idle)
-               cpustat->system = cputime64_add(cpustat->system, tmp);
-       else if (atomic_read(&rq->nr_iowait) > 0)
-               cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
        else
-               cpustat->idle = cputime64_add(cpustat->idle, tmp);
+               cpustat->system = cputime64_add(cpustat->system, tmp);
        /* Account for system time used */
        acct_update_integrals(p);
  }
  
  /*
-  * Account scaled system cpu time to a process.
-  * @p: the process that the cpu time gets accounted to
-  * @hardirq_offset: the offset to subtract from hardirq_count()
-  * @cputime: the cpu time spent in kernel space since the last update
+  * Account for involuntary wait time.
+  * @cputime: the cpu time spent in involuntary wait
   */
- void account_system_time_scaled(struct task_struct *p, cputime_t cputime)
+ void account_steal_time(cputime_t cputime)
  {
-       p->stimescaled = cputime_add(p->stimescaled, cputime);
+       struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+       cputime64_t cputime64 = cputime_to_cputime64(cputime);
+       cpustat->steal = cputime64_add(cpustat->steal, cputime64);
  }
  
  /*
-  * Account for involuntary wait time.
-  * @p: the process from which the cpu time has been stolen
-  * @steal: the cpu time spent in involuntary wait
+  * Account for idle time.
+  * @cputime: the cpu time spent in idle wait
   */
- void account_steal_time(struct task_struct *p, cputime_t steal)
+ void account_idle_time(cputime_t cputime)
  {
        struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-       cputime64_t tmp = cputime_to_cputime64(steal);
+       cputime64_t cputime64 = cputime_to_cputime64(cputime);
        struct rq *rq = this_rq();
  
-       if (p == rq->idle) {
-               p->stime = cputime_add(p->stime, steal);
-               if (atomic_read(&rq->nr_iowait) > 0)
-                       cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
-               else
-                       cpustat->idle = cputime64_add(cpustat->idle, tmp);
-       } else
-               cpustat->steal = cputime64_add(cpustat->steal, tmp);
+       if (atomic_read(&rq->nr_iowait) > 0)
+               cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
+       else
+               cpustat->idle = cputime64_add(cpustat->idle, cputime64);
+ }
+ #ifndef CONFIG_VIRT_CPU_ACCOUNTING
+ /*
+  * Account a single tick of cpu time.
+  * @p: the process that the cpu time gets accounted to
+  * @user_tick: indicates if the tick is a user or a system tick
+  */
+ void account_process_tick(struct task_struct *p, int user_tick)
+ {
+       cputime_t one_jiffy = jiffies_to_cputime(1);
+       cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy);
+       struct rq *rq = this_rq();
+       if (user_tick)
+               account_user_time(p, one_jiffy, one_jiffy_scaled);
+       else if (p != rq->idle)
+               account_system_time(p, HARDIRQ_OFFSET, one_jiffy,
+                                   one_jiffy_scaled);
+       else
+               account_idle_time(one_jiffy);
  }
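account_process_tick() now decides in one place whether a tick is charged as user, system or idle time. A simplified sketch of the tick path that ends up here (the usual caller is update_process_times() in kernel/timer.c, which is outside this diff; unrelated work done there is elided):

        void update_process_times(int user_tick)
        {
                struct task_struct *p = current;

                /* hand one jiffy of cputime to the scheduler's accounting */
                account_process_tick(p, user_tick);

                /* ... run_local_timers(), scheduler_tick(), etc. ... */
        }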
  
  /*
+  * Account multiple ticks of steal time.
+  * @ticks: number of stolen ticks
+  */
+ void account_steal_ticks(unsigned long ticks)
+ {
+       account_steal_time(jiffies_to_cputime(ticks));
+ }
+ /*
+  * Account multiple ticks of idle time.
+  * @ticks: number of idle ticks
+  */
+ void account_idle_ticks(unsigned long ticks)
+ {
+       account_idle_time(jiffies_to_cputime(ticks));
+ }
+ #endif
+ /*
   * Use precise platform statistics if available:
   */
  #ifdef CONFIG_VIRT_CPU_ACCOUNTING
@@@ -5401,10 -5516,9 +5516,9 @@@ out_unlock
        return retval;
  }
  
- long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
+ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
  {
-       cpumask_t cpus_allowed;
-       cpumask_t new_mask = *in_mask;
+       cpumask_var_t cpus_allowed, new_mask;
        struct task_struct *p;
        int retval;
  
        get_task_struct(p);
        read_unlock(&tasklist_lock);
  
+       if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
+               retval = -ENOMEM;
+               goto out_put_task;
+       }
+       if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
+               retval = -ENOMEM;
+               goto out_free_cpus_allowed;
+       }
        retval = -EPERM;
        if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
                goto out_unlock;
        if (retval)
                goto out_unlock;
  
-       cpuset_cpus_allowed(p, &cpus_allowed);
-       cpus_and(new_mask, new_mask, cpus_allowed);
+       cpuset_cpus_allowed(p, cpus_allowed);
+       cpumask_and(new_mask, in_mask, cpus_allowed);
   again:
-       retval = set_cpus_allowed_ptr(p, &new_mask);
+       retval = set_cpus_allowed_ptr(p, new_mask);
  
        if (!retval) {
-               cpuset_cpus_allowed(p, &cpus_allowed);
-               if (!cpus_subset(new_mask, cpus_allowed)) {
+               cpuset_cpus_allowed(p, cpus_allowed);
+               if (!cpumask_subset(new_mask, cpus_allowed)) {
                        /*
                         * We must have raced with a concurrent cpuset
                         * update. Just reset the cpus_allowed to the
                         * cpuset's cpus_allowed
                         */
-                       new_mask = cpus_allowed;
+                       cpumask_copy(new_mask, cpus_allowed);
                        goto again;
                }
        }
  out_unlock:
+       free_cpumask_var(new_mask);
+ out_free_cpus_allowed:
+       free_cpumask_var(cpus_allowed);
+ out_put_task:
        put_task_struct(p);
        put_online_cpus();
        return retval;
  }
  
  static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
-                            cpumask_t *new_mask)
+                            struct cpumask *new_mask)
  {
-       if (len < sizeof(cpumask_t)) {
-               memset(new_mask, 0, sizeof(cpumask_t));
-       } else if (len > sizeof(cpumask_t)) {
-               len = sizeof(cpumask_t);
-       }
+       if (len < cpumask_size())
+               cpumask_clear(new_mask);
+       else if (len > cpumask_size())
+               len = cpumask_size();
        return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
  }
  
  asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
                                      unsigned long __user *user_mask_ptr)
  {
-       cpumask_t new_mask;
+       cpumask_var_t new_mask;
        int retval;
  
-       retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
-       if (retval)
-               return retval;
+       if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
+               return -ENOMEM;
  
-       return sched_setaffinity(pid, &new_mask);
+       retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
+       if (retval == 0)
+               retval = sched_setaffinity(pid, new_mask);
+       free_cpumask_var(new_mask);
+       return retval;
  }
  
- long sched_getaffinity(pid_t pid, cpumask_t *mask)
+ long sched_getaffinity(pid_t pid, struct cpumask *mask)
  {
        struct task_struct *p;
        int retval;
        if (retval)
                goto out_unlock;
  
-       cpus_and(*mask, p->cpus_allowed, cpu_online_map);
+       cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
  
  out_unlock:
        read_unlock(&tasklist_lock);
@@@ -5523,19 -5652,24 +5652,24 @@@ asmlinkage long sys_sched_getaffinity(p
                                      unsigned long __user *user_mask_ptr)
  {
        int ret;
-       cpumask_t mask;
+       cpumask_var_t mask;
  
-       if (len < sizeof(cpumask_t))
+       if (len < cpumask_size())
                return -EINVAL;
  
-       ret = sched_getaffinity(pid, &mask);
-       if (ret < 0)
-               return ret;
+       if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+               return -ENOMEM;
  
-       if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
-               return -EFAULT;
+       ret = sched_getaffinity(pid, mask);
+       if (ret == 0) {
+               if (copy_to_user(user_mask_ptr, mask, cpumask_size()))
+                       ret = -EFAULT;
+               else
+                       ret = cpumask_size();
+       }
+       free_cpumask_var(mask);
  
-       return sizeof(cpumask_t);
+       return ret;
  }
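From user space, the visible effect of the conversion is that the raw sched_getaffinity syscall now copies out cpumask_size() bytes and returns that size on success; the glibc wrapper still normalizes this to 0. A small illustrative user-space program (not part of this patch):

        #define _GNU_SOURCE
        #include <sched.h>
        #include <stdio.h>

        int main(void)
        {
                cpu_set_t set;

                if (sched_getaffinity(0, sizeof(set), &set) == 0)
                        printf("cpu 0 allowed: %s\n",
                               CPU_ISSET(0, &set) ? "yes" : "no");
                return 0;
        }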
  
  /**
@@@ -5805,7 -5939,12 +5939,7 @@@ void sched_show_task(struct task_struc
                printk(KERN_CONT " %016lx ", thread_saved_pc(p));
  #endif
  #ifdef CONFIG_DEBUG_STACK_USAGE
 -      {
 -              unsigned long *n = end_of_stack(p);
 -              while (!*n)
 -                      n++;
 -              free = (unsigned long)n - (unsigned long)end_of_stack(p);
 -      }
 +      free = stack_not_used(p);
  #endif
        printk(KERN_CONT "%5lu %5d %6d\n", free,
                task_pid_nr(p), task_pid_nr(p->real_parent));
@@@ -5872,7 -6011,7 +6006,7 @@@ void __cpuinit init_idle(struct task_st
        idle->se.exec_start = sched_clock();
  
        idle->prio = idle->normal_prio = MAX_PRIO;
-       idle->cpus_allowed = cpumask_of_cpu(cpu);
+       cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
        __set_task_cpu(idle, cpu);
  
        rq->curr = rq->idle = idle;
   * indicates which cpus entered this state. This is used
   * in the rcu update to wait only for active cpus. For systems
   * which do not switch off the HZ timer, nohz_cpu_mask should
-  * always be CPU_MASK_NONE.
+  * always be CPU_BITS_NONE.
   */
- cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
+ cpumask_var_t nohz_cpu_mask;
  
  /*
   * Increase the granularity value when there are more CPUs,
@@@ -5956,7 -6095,7 +6090,7 @@@ static inline void sched_init_granulari
   * task must not exit() & deallocate itself prematurely. The
   * call is not atomic; no spinlocks may be held.
   */
- int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
+ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
  {
        struct migration_req req;
        unsigned long flags;
        int ret = 0;
  
        rq = task_rq_lock(p, &flags);
-       if (!cpus_intersects(*new_mask, cpu_online_map)) {
+       if (!cpumask_intersects(new_mask, cpu_online_mask)) {
                ret = -EINVAL;
                goto out;
        }
  
        if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
-                    !cpus_equal(p->cpus_allowed, *new_mask))) {
+                    !cpumask_equal(&p->cpus_allowed, new_mask))) {
                ret = -EINVAL;
                goto out;
        }
        if (p->sched_class->set_cpus_allowed)
                p->sched_class->set_cpus_allowed(p, new_mask);
        else {
-               p->cpus_allowed = *new_mask;
-               p->rt.nr_cpus_allowed = cpus_weight(*new_mask);
+               cpumask_copy(&p->cpus_allowed, new_mask);
+               p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
        }
  
        /* Can the task run on the task's current CPU? If so, we're done */
-       if (cpu_isset(task_cpu(p), *new_mask))
+       if (cpumask_test_cpu(task_cpu(p), new_mask))
                goto out;
  
-       if (migrate_task(p, any_online_cpu(*new_mask), &req)) {
+       if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
                /* Need help from migration thread: drop lock and wait. */
                task_rq_unlock(rq, &flags);
                wake_up_process(rq->migration_thread);
@@@ -6028,7 -6167,7 +6162,7 @@@ static int __migrate_task(struct task_s
        if (task_cpu(p) != src_cpu)
                goto done;
        /* Affinity changed (again). */
-       if (!cpu_isset(dest_cpu, p->cpus_allowed))
+       if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
                goto fail;
  
        on_rq = p->se.on_rq;
@@@ -6125,50 -6264,41 +6259,41 @@@ static int __migrate_task_irq(struct ta
   */
  static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
  {
-       unsigned long flags;
-       cpumask_t mask;
-       struct rq *rq;
        int dest_cpu;
+       const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu));
  
-       do {
-               /* On same node? */
-               mask = node_to_cpumask(cpu_to_node(dead_cpu));
-               cpus_and(mask, mask, p->cpus_allowed);
-               dest_cpu = any_online_cpu(mask);
+ again:
+       /* Look for allowed, online CPU in same node. */
+       for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask)
+               if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
+                       goto move;
  
-               /* On any allowed CPU? */
-               if (dest_cpu >= nr_cpu_ids)
-                       dest_cpu = any_online_cpu(p->cpus_allowed);
+       /* Any allowed, online CPU? */
+       dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask);
+       if (dest_cpu < nr_cpu_ids)
+               goto move;
  
-               /* No more Mr. Nice Guy. */
-               if (dest_cpu >= nr_cpu_ids) {
-                       cpumask_t cpus_allowed;
+       /* No more Mr. Nice Guy. */
+       if (dest_cpu >= nr_cpu_ids) {
+               cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
+               dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed);
  
-                       cpuset_cpus_allowed_locked(p, &cpus_allowed);
-                       /*
-                        * Try to stay on the same cpuset, where the
-                        * current cpuset may be a subset of all cpus.
-                        * The cpuset_cpus_allowed_locked() variant of
-                        * cpuset_cpus_allowed() will not block. It must be
-                        * called within calls to cpuset_lock/cpuset_unlock.
-                        */
-                       rq = task_rq_lock(p, &flags);
-                       p->cpus_allowed = cpus_allowed;
-                       dest_cpu = any_online_cpu(p->cpus_allowed);
-                       task_rq_unlock(rq, &flags);
-                       /*
-                        * Don't tell them about moving exiting tasks or
-                        * kernel threads (both mm NULL), since they never
-                        * leave kernel.
-                        */
-                       if (p->mm && printk_ratelimit()) {
-                               printk(KERN_INFO "process %d (%s) no "
-                                      "longer affine to cpu%d\n",
-                                       task_pid_nr(p), p->comm, dead_cpu);
-                       }
+               /*
+                * Don't tell them about moving exiting tasks or
+                * kernel threads (both mm NULL), since they never
+                * leave kernel.
+                */
+               if (p->mm && printk_ratelimit()) {
+                       printk(KERN_INFO "process %d (%s) no "
+                              "longer affine to cpu%d\n",
+                              task_pid_nr(p), p->comm, dead_cpu);
                }
-       } while (!__migrate_task_irq(p, dead_cpu, dest_cpu));
+       }
+ move:
+       /* Its affinity may have changed while we were choosing. */
+       if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))
+               goto again;
  }
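The rewritten loop above relies on for_each_cpu_and(), which walks the intersection of two masks directly, so no temporary cpumask (and therefore no allocation) is needed when searching for an allowed, online CPU. Illustrative use, with made-up values:

        int cpu;

        /* visit every online CPU on node 0 */
        for_each_cpu_and(cpu, cpumask_of_node(0), cpu_online_mask)
                printk(KERN_DEBUG "cpu %d is online on node 0\n", cpu);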
  
  /*
   */
  static void migrate_nr_uninterruptible(struct rq *rq_src)
  {
-       struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR));
+       struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask));
        unsigned long flags;
  
        local_irq_save(flags);
@@@ -6470,7 -6600,7 +6595,7 @@@ static void set_rq_online(struct rq *rq
        if (!rq->online) {
                const struct sched_class *class;
  
-               cpu_set(rq->cpu, rq->rd->online);
+               cpumask_set_cpu(rq->cpu, rq->rd->online);
                rq->online = 1;
  
                for_each_class(class) {
@@@ -6490,7 -6620,7 +6615,7 @@@ static void set_rq_offline(struct rq *r
                                class->rq_offline(rq);
                }
  
-               cpu_clear(rq->cpu, rq->rd->online);
+               cpumask_clear_cpu(rq->cpu, rq->rd->online);
                rq->online = 0;
        }
  }
@@@ -6531,7 -6661,7 +6656,7 @@@ migration_call(struct notifier_block *n
                rq = cpu_rq(cpu);
                spin_lock_irqsave(&rq->lock, flags);
                if (rq->rd) {
-                       BUG_ON(!cpu_isset(cpu, rq->rd->span));
+                       BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
  
                        set_rq_online(rq);
                }
                        break;
                /* Unbind it from offline cpu so it can run. Fall thru. */
                kthread_bind(cpu_rq(cpu)->migration_thread,
-                            any_online_cpu(cpu_online_map));
+                            cpumask_any(cpu_online_mask));
                kthread_stop(cpu_rq(cpu)->migration_thread);
                cpu_rq(cpu)->migration_thread = NULL;
                break;
                rq = cpu_rq(cpu);
                spin_lock_irqsave(&rq->lock, flags);
                if (rq->rd) {
-                       BUG_ON(!cpu_isset(cpu, rq->rd->span));
+                       BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
                        set_rq_offline(rq);
                }
                spin_unlock_irqrestore(&rq->lock, flags);
@@@ -6634,13 -6764,13 +6759,13 @@@ early_initcall(migration_init)
  #ifdef CONFIG_SCHED_DEBUG
  
  static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
-                                 cpumask_t *groupmask)
+                                 struct cpumask *groupmask)
  {
        struct sched_group *group = sd->groups;
        char str[256];
  
-       cpulist_scnprintf(str, sizeof(str), sd->span);
-       cpus_clear(*groupmask);
+       cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
+       cpumask_clear(groupmask);
  
        printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
  
  
        printk(KERN_CONT "span %s level %s\n", str, sd->name);
  
-       if (!cpu_isset(cpu, sd->span)) {
+       if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
                printk(KERN_ERR "ERROR: domain->span does not contain "
                                "CPU%d\n", cpu);
        }
-       if (!cpu_isset(cpu, group->cpumask)) {
+       if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
                printk(KERN_ERR "ERROR: domain->groups does not contain"
                                " CPU%d\n", cpu);
        }
                        break;
                }
  
-               if (!cpus_weight(group->cpumask)) {
+               if (!cpumask_weight(sched_group_cpus(group))) {
                        printk(KERN_CONT "\n");
                        printk(KERN_ERR "ERROR: empty group\n");
                        break;
                }
  
-               if (cpus_intersects(*groupmask, group->cpumask)) {
+               if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
                        printk(KERN_CONT "\n");
                        printk(KERN_ERR "ERROR: repeated CPUs\n");
                        break;
                }
  
-               cpus_or(*groupmask, *groupmask, group->cpumask);
+               cpumask_or(groupmask, groupmask, sched_group_cpus(group));
  
-               cpulist_scnprintf(str, sizeof(str), group->cpumask);
+               cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
                printk(KERN_CONT " %s", str);
  
                group = group->next;
        } while (group != sd->groups);
        printk(KERN_CONT "\n");
  
-       if (!cpus_equal(sd->span, *groupmask))
+       if (!cpumask_equal(sched_domain_span(sd), groupmask))
                printk(KERN_ERR "ERROR: groups don't span domain->span\n");
  
-       if (sd->parent && !cpus_subset(*groupmask, sd->parent->span))
+       if (sd->parent &&
+           !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
                printk(KERN_ERR "ERROR: parent span is not a superset "
                        "of domain->span\n");
        return 0;
  
  static void sched_domain_debug(struct sched_domain *sd, int cpu)
  {
-       cpumask_t *groupmask;
+       cpumask_var_t groupmask;
        int level = 0;
  
        if (!sd) {
  
        printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
  
-       groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
-       if (!groupmask) {
+       if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
                printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
                return;
        }
                if (!sd)
                        break;
        }
-       kfree(groupmask);
+       free_cpumask_var(groupmask);
  }
  #else /* !CONFIG_SCHED_DEBUG */
  # define sched_domain_debug(sd, cpu) do { } while (0)
  
  static int sd_degenerate(struct sched_domain *sd)
  {
-       if (cpus_weight(sd->span) == 1)
+       if (cpumask_weight(sched_domain_span(sd)) == 1)
                return 1;
  
        /* Following flags need at least 2 groups */
@@@ -6773,7 -6903,7 +6898,7 @@@ sd_parent_degenerate(struct sched_domai
        if (sd_degenerate(parent))
                return 1;
  
-       if (!cpus_equal(sd->span, parent->span))
+       if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
                return 0;
  
        /* Does parent contain flags not in child? */
        return 1;
  }
  
+ static void free_rootdomain(struct root_domain *rd)
+ {
+       cpupri_cleanup(&rd->cpupri);
+       free_cpumask_var(rd->rto_mask);
+       free_cpumask_var(rd->online);
+       free_cpumask_var(rd->span);
+       kfree(rd);
+ }
  static void rq_attach_root(struct rq *rq, struct root_domain *rd)
  {
        unsigned long flags;
        if (rq->rd) {
                struct root_domain *old_rd = rq->rd;
  
-               if (cpu_isset(rq->cpu, old_rd->online))
+               if (cpumask_test_cpu(rq->cpu, old_rd->online))
                        set_rq_offline(rq);
  
-               cpu_clear(rq->cpu, old_rd->span);
+               cpumask_clear_cpu(rq->cpu, old_rd->span);
  
                if (atomic_dec_and_test(&old_rd->refcount))
-                       kfree(old_rd);
+                       free_rootdomain(old_rd);
        }
  
        atomic_inc(&rd->refcount);
        rq->rd = rd;
  
-       cpu_set(rq->cpu, rd->span);
-       if (cpu_isset(rq->cpu, cpu_online_map))
+       cpumask_set_cpu(rq->cpu, rd->span);
+       if (cpumask_test_cpu(rq->cpu, cpu_online_mask))
                set_rq_online(rq);
  
        spin_unlock_irqrestore(&rq->lock, flags);
  }
  
- static void init_rootdomain(struct root_domain *rd)
+ static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
  {
        memset(rd, 0, sizeof(*rd));
  
-       cpus_clear(rd->span);
-       cpus_clear(rd->online);
+       if (bootmem) {
+               alloc_bootmem_cpumask_var(&def_root_domain.span);
+               alloc_bootmem_cpumask_var(&def_root_domain.online);
+               alloc_bootmem_cpumask_var(&def_root_domain.rto_mask);
+               cpupri_init(&rd->cpupri, true);
+               return 0;
+       }
+       if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
+               goto out;
+       if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
+               goto free_span;
+       if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+               goto free_online;
+       if (cpupri_init(&rd->cpupri, false) != 0)
+               goto free_rto_mask;
+       return 0;
  
-       cpupri_init(&rd->cpupri);
+ free_rto_mask:
+       free_cpumask_var(rd->rto_mask);
+ free_online:
+       free_cpumask_var(rd->online);
+ free_span:
+       free_cpumask_var(rd->span);
+ out:
+       return -ENOMEM;
  }
  
  static void init_defrootdomain(void)
  {
-       init_rootdomain(&def_root_domain);
+       init_rootdomain(&def_root_domain, true);
        atomic_set(&def_root_domain.refcount, 1);
  }
  
@@@ -6849,7 -7013,10 +7008,10 @@@ static struct root_domain *alloc_rootdo
        if (!rd)
                return NULL;
  
-       init_rootdomain(rd);
+       if (init_rootdomain(rd, false) != 0) {
+               kfree(rd);
+               return NULL;
+       }
  
        return rd;
  }
@@@ -6891,19 -7058,12 +7053,12 @@@ cpu_attach_domain(struct sched_domain *
  }
  
  /* cpus with isolated domains */
- static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
+ static cpumask_var_t cpu_isolated_map;
  
  /* Setup the mask of cpus configured for isolated domains */
  static int __init isolated_cpu_setup(char *str)
  {
-       static int __initdata ints[NR_CPUS];
-       int i;
-       str = get_options(str, ARRAY_SIZE(ints), ints);
-       cpus_clear(cpu_isolated_map);
-       for (i = 1; i <= ints[0]; i++)
-               if (ints[i] < NR_CPUS)
-                       cpu_set(ints[i], cpu_isolated_map);
+       cpulist_parse(str, cpu_isolated_map);
        return 1;
  }
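isolated_cpu_setup() now hands its argument to cpulist_parse(), i.e. the standard cpu-list syntax used by other cpu-list parameters, so for example the kernel command line

        isolcpus=1,3-5

would mark CPUs 1, 3, 4 and 5 as isolated from the generic sched domains.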
  
@@@ -6912,42 -7072,43 +7067,43 @@@ __setup("isolcpus=", isolated_cpu_setup
  /*
   * init_sched_build_groups takes the cpumask we wish to span, and a pointer
   * to a function which identifies what group (along with sched group) a CPU
-  * belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS
-  * (due to the fact that we keep track of groups covered with a cpumask_t).
+  * belongs to. The return value of group_fn must be >= 0 and < nr_cpu_ids
+  * (because we keep track of the groups covered with a struct cpumask).
   *
   * init_sched_build_groups will build a circular linked list of the groups
   * covered by the given span, and will set each group's ->cpumask correctly,
   * and ->cpu_power to 0.
   */
  static void
- init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
-                       int (*group_fn)(int cpu, const cpumask_t *cpu_map,
+ init_sched_build_groups(const struct cpumask *span,
+                       const struct cpumask *cpu_map,
+                       int (*group_fn)(int cpu, const struct cpumask *cpu_map,
                                        struct sched_group **sg,
-                                       cpumask_t *tmpmask),
-                       cpumask_t *covered, cpumask_t *tmpmask)
+                                       struct cpumask *tmpmask),
+                       struct cpumask *covered, struct cpumask *tmpmask)
  {
        struct sched_group *first = NULL, *last = NULL;
        int i;
  
-       cpus_clear(*covered);
+       cpumask_clear(covered);
  
-       for_each_cpu_mask_nr(i, *span) {
+       for_each_cpu(i, span) {
                struct sched_group *sg;
                int group = group_fn(i, cpu_map, &sg, tmpmask);
                int j;
  
-               if (cpu_isset(i, *covered))
+               if (cpumask_test_cpu(i, covered))
                        continue;
  
-               cpus_clear(sg->cpumask);
+               cpumask_clear(sched_group_cpus(sg));
                sg->__cpu_power = 0;
  
-               for_each_cpu_mask_nr(j, *span) {
+               for_each_cpu(j, span) {
                        if (group_fn(j, cpu_map, NULL, tmpmask) != group)
                                continue;
  
-                       cpu_set(j, *covered);
-                       cpu_set(j, sg->cpumask);
+                       cpumask_set_cpu(j, covered);
+                       cpumask_set_cpu(j, sched_group_cpus(sg));
                }
                if (!first)
                        first = sg;
@@@ -7011,23 -7172,21 +7167,21 @@@ static int find_next_best_node(int node
   * should be one that prevents unnecessary balancing, but also spreads tasks
   * out optimally.
   */
- static void sched_domain_node_span(int node, cpumask_t *span)
+ static void sched_domain_node_span(int node, struct cpumask *span)
  {
        nodemask_t used_nodes;
-       node_to_cpumask_ptr(nodemask, node);
        int i;
  
-       cpus_clear(*span);
+       cpumask_clear(span);
        nodes_clear(used_nodes);
  
-       cpus_or(*span, *span, *nodemask);
+       cpumask_or(span, span, cpumask_of_node(node));
        node_set(node, used_nodes);
  
        for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
                int next_node = find_next_best_node(node, &used_nodes);
  
-               node_to_cpumask_ptr_next(nodemask, next_node);
-               cpus_or(*span, *span, *nodemask);
+               cpumask_or(span, span, cpumask_of_node(next_node));
        }
  }
  #endif /* CONFIG_NUMA */
  int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
  
  /*
+  * The cpus mask in sched_group and sched_domain hangs off the end.
+  * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
+  * for nr_cpu_ids < CONFIG_NR_CPUS.
+  */
+ struct static_sched_group {
+       struct sched_group sg;
+       DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
+ };
+ struct static_sched_domain {
+       struct sched_domain sd;
+       DECLARE_BITMAP(span, CONFIG_NR_CPUS);
+ };
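The wrapper structs above reserve CONFIG_NR_CPUS bits of storage immediately after the sched_group/sched_domain for the statically allocated per-cpu instances. Dynamically allocated node groups later in this patch get the same layout by over-allocating, e.g.:

        sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
                          GFP_KERNEL, i);

so that sched_group_cpus(sg) can hand back a pointer into that trailing storage instead of requiring a separate cpumask allocation.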
+ /*
   * SMT sched-domains:
   */
  #ifdef CONFIG_SCHED_SMT
- static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
- static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
+ static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
+ static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus);
  
  static int
- cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
-                cpumask_t *unused)
+ cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
+                struct sched_group **sg, struct cpumask *unused)
  {
        if (sg)
-               *sg = &per_cpu(sched_group_cpus, cpu);
+               *sg = &per_cpu(sched_group_cpus, cpu).sg;
        return cpu;
  }
  #endif /* CONFIG_SCHED_SMT */
   * multi-core sched-domains:
   */
  #ifdef CONFIG_SCHED_MC
- static DEFINE_PER_CPU(struct sched_domain, core_domains);
- static DEFINE_PER_CPU(struct sched_group, sched_group_core);
+ static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
+ static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
  #endif /* CONFIG_SCHED_MC */
  
  #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
  static int
- cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
-                 cpumask_t *mask)
+ cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
+                 struct sched_group **sg, struct cpumask *mask)
  {
        int group;
  
-       *mask = per_cpu(cpu_sibling_map, cpu);
-       cpus_and(*mask, *mask, *cpu_map);
-       group = first_cpu(*mask);
+       cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
+       group = cpumask_first(mask);
        if (sg)
-               *sg = &per_cpu(sched_group_core, group);
+               *sg = &per_cpu(sched_group_core, group).sg;
        return group;
  }
  #elif defined(CONFIG_SCHED_MC)
  static int
- cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
-                 cpumask_t *unused)
+ cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
+                 struct sched_group **sg, struct cpumask *unused)
  {
        if (sg)
-               *sg = &per_cpu(sched_group_core, cpu);
+               *sg = &per_cpu(sched_group_core, cpu).sg;
        return cpu;
  }
  #endif
  
- static DEFINE_PER_CPU(struct sched_domain, phys_domains);
- static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
+ static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
+ static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
  
  static int
- cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
-                 cpumask_t *mask)
+ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
+                 struct sched_group **sg, struct cpumask *mask)
  {
        int group;
  #ifdef CONFIG_SCHED_MC
-       *mask = cpu_coregroup_map(cpu);
-       cpus_and(*mask, *mask, *cpu_map);
-       group = first_cpu(*mask);
+       cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
+       group = cpumask_first(mask);
  #elif defined(CONFIG_SCHED_SMT)
-       *mask = per_cpu(cpu_sibling_map, cpu);
-       cpus_and(*mask, *mask, *cpu_map);
-       group = first_cpu(*mask);
+       cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
+       group = cpumask_first(mask);
  #else
        group = cpu;
  #endif
        if (sg)
-               *sg = &per_cpu(sched_group_phys, group);
+               *sg = &per_cpu(sched_group_phys, group).sg;
        return group;
  }
  
   * groups, so roll our own. Now each node has its own list of groups which
   * gets dynamically allocated.
   */
- static DEFINE_PER_CPU(struct sched_domain, node_domains);
+ static DEFINE_PER_CPU(struct static_sched_domain, node_domains);
  static struct sched_group ***sched_group_nodes_bycpu;
  
- static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
- static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
+ static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
+ static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
  
- static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
-                                struct sched_group **sg, cpumask_t *nodemask)
+ static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
+                                struct sched_group **sg,
+                                struct cpumask *nodemask)
  {
        int group;
  
-       *nodemask = node_to_cpumask(cpu_to_node(cpu));
-       cpus_and(*nodemask, *nodemask, *cpu_map);
-       group = first_cpu(*nodemask);
+       cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
+       group = cpumask_first(nodemask);
  
        if (sg)
-               *sg = &per_cpu(sched_group_allnodes, group);
+               *sg = &per_cpu(sched_group_allnodes, group).sg;
        return group;
  }
  
@@@ -7142,11 -7313,11 +7308,11 @@@ static void init_numa_sched_groups_powe
        if (!sg)
                return;
        do {
-               for_each_cpu_mask_nr(j, sg->cpumask) {
+               for_each_cpu(j, sched_group_cpus(sg)) {
                        struct sched_domain *sd;
  
-                       sd = &per_cpu(phys_domains, j);
-                       if (j != first_cpu(sd->groups->cpumask)) {
+                       sd = &per_cpu(phys_domains, j).sd;
+                       if (j != cpumask_first(sched_group_cpus(sd->groups))) {
                                /*
                                 * Only add "power" once for each
                                 * physical package.
  
  #ifdef CONFIG_NUMA
  /* Free memory allocated for various sched_group structures */
- static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
+ static void free_sched_groups(const struct cpumask *cpu_map,
+                             struct cpumask *nodemask)
  {
        int cpu, i;
  
-       for_each_cpu_mask_nr(cpu, *cpu_map) {
+       for_each_cpu(cpu, cpu_map) {
                struct sched_group **sched_group_nodes
                        = sched_group_nodes_bycpu[cpu];
  
                for (i = 0; i < nr_node_ids; i++) {
                        struct sched_group *oldsg, *sg = sched_group_nodes[i];
  
-                       *nodemask = node_to_cpumask(i);
-                       cpus_and(*nodemask, *nodemask, *cpu_map);
-                       if (cpus_empty(*nodemask))
+                       cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
+                       if (cpumask_empty(nodemask))
                                continue;
  
                        if (sg == NULL)
@@@ -7197,7 -7368,8 +7363,8 @@@ next_sg
        }
  }
  #else /* !CONFIG_NUMA */
- static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
+ static void free_sched_groups(const struct cpumask *cpu_map,
+                             struct cpumask *nodemask)
  {
  }
  #endif /* CONFIG_NUMA */
@@@ -7223,7 -7395,7 +7390,7 @@@ static void init_sched_groups_power(in
  
        WARN_ON(!sd || !sd->groups);
  
-       if (cpu != first_cpu(sd->groups->cpumask))
+       if (cpu != cpumask_first(sched_group_cpus(sd->groups)))
                return;
  
        child = sd->child;
@@@ -7288,48 -7460,6 +7455,6 @@@ SD_INIT_FUNC(CPU
   SD_INIT_FUNC(MC)
  #endif
  
- /*
-  * To minimize stack usage kmalloc room for cpumasks and share the
-  * space as the usage in build_sched_domains() dictates.  Used only
-  * if the amount of space is significant.
-  */
- struct allmasks {
-       cpumask_t tmpmask;                      /* make this one first */
-       union {
-               cpumask_t nodemask;
-               cpumask_t this_sibling_map;
-               cpumask_t this_core_map;
-       };
-       cpumask_t send_covered;
- #ifdef CONFIG_NUMA
-       cpumask_t domainspan;
-       cpumask_t covered;
-       cpumask_t notcovered;
- #endif
- };
- #if   NR_CPUS > 128
- #define SCHED_CPUMASK_DECLARE(v)      struct allmasks *v
- static inline void sched_cpumask_alloc(struct allmasks **masks)
- {
-       *masks = kmalloc(sizeof(**masks), GFP_KERNEL);
- }
- static inline void sched_cpumask_free(struct allmasks *masks)
- {
-       kfree(masks);
- }
- #else
- #define SCHED_CPUMASK_DECLARE(v)      struct allmasks _v, *v = &_v
- static inline void sched_cpumask_alloc(struct allmasks **masks)
- { }
- static inline void sched_cpumask_free(struct allmasks *masks)
- { }
- #endif
- #define       SCHED_CPUMASK_VAR(v, a)         cpumask_t *v = (cpumask_t *) \
-                       ((unsigned long)(a) + offsetof(struct allmasks, v))
  static int default_relax_domain_level = -1;
  
  static int __init setup_relax_domain_level(char *str)
@@@ -7369,17 -7499,38 +7494,38 @@@ static void set_domain_attribute(struc
   * Build sched domains for a given set of cpus and attach the sched domains
   * to the individual cpus
   */
- static int __build_sched_domains(const cpumask_t *cpu_map,
+ static int __build_sched_domains(const struct cpumask *cpu_map,
                                 struct sched_domain_attr *attr)
  {
-       int i;
+       int i, err = -ENOMEM;
        struct root_domain *rd;
-       SCHED_CPUMASK_DECLARE(allmasks);
-       cpumask_t *tmpmask;
+       cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered,
+               tmpmask;
  #ifdef CONFIG_NUMA
+       cpumask_var_t domainspan, covered, notcovered;
        struct sched_group **sched_group_nodes = NULL;
        int sd_allnodes = 0;
  
+       if (!alloc_cpumask_var(&domainspan, GFP_KERNEL))
+               goto out;
+       if (!alloc_cpumask_var(&covered, GFP_KERNEL))
+               goto free_domainspan;
+       if (!alloc_cpumask_var(&notcovered, GFP_KERNEL))
+               goto free_covered;
+ #endif
+       if (!alloc_cpumask_var(&nodemask, GFP_KERNEL))
+               goto free_notcovered;
+       if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL))
+               goto free_nodemask;
+       if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL))
+               goto free_this_sibling_map;
+       if (!alloc_cpumask_var(&send_covered, GFP_KERNEL))
+               goto free_this_core_map;
+       if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
+               goto free_send_covered;
+ #ifdef CONFIG_NUMA
        /*
         * Allocate the per-node list of sched groups
         */
                                    GFP_KERNEL);
        if (!sched_group_nodes) {
                printk(KERN_WARNING "Can not alloc sched group node list\n");
-               return -ENOMEM;
+               goto free_tmpmask;
        }
  #endif
  
        rd = alloc_rootdomain();
        if (!rd) {
                printk(KERN_WARNING "Cannot alloc root domain\n");
- #ifdef CONFIG_NUMA
-               kfree(sched_group_nodes);
- #endif
-               return -ENOMEM;
-       }
-       /* get space for all scratch cpumask variables */
-       sched_cpumask_alloc(&allmasks);
-       if (!allmasks) {
-               printk(KERN_WARNING "Cannot alloc cpumask array\n");
-               kfree(rd);
- #ifdef CONFIG_NUMA
-               kfree(sched_group_nodes);
- #endif
-               return -ENOMEM;
+               goto free_sched_groups;
        }
  
-       tmpmask = (cpumask_t *)allmasks;
  #ifdef CONFIG_NUMA
-       sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
+       sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes;
  #endif
  
        /*
         * Set up domains for cpus specified by the cpu_map.
         */
-       for_each_cpu_mask_nr(i, *cpu_map) {
+       for_each_cpu(i, cpu_map) {
                struct sched_domain *sd = NULL, *p;
-               SCHED_CPUMASK_VAR(nodemask, allmasks);
  
-               *nodemask = node_to_cpumask(cpu_to_node(i));
-               cpus_and(*nodemask, *nodemask, *cpu_map);
+               cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map);
  
  #ifdef CONFIG_NUMA
-               if (cpus_weight(*cpu_map) >
-                               SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) {
-                       sd = &per_cpu(allnodes_domains, i);
+               if (cpumask_weight(cpu_map) >
+                               SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {
+                       sd = &per_cpu(allnodes_domains, i).sd;
                        SD_INIT(sd, ALLNODES);
                        set_domain_attribute(sd, attr);
-                       sd->span = *cpu_map;
+                       cpumask_copy(sched_domain_span(sd), cpu_map);
                        cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
                        p = sd;
                        sd_allnodes = 1;
                } else
                        p = NULL;
  
-               sd = &per_cpu(node_domains, i);
+               sd = &per_cpu(node_domains, i).sd;
                SD_INIT(sd, NODE);
                set_domain_attribute(sd, attr);
-               sched_domain_node_span(cpu_to_node(i), &sd->span);
+               sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
                sd->parent = p;
                if (p)
                        p->child = sd;
-               cpus_and(sd->span, sd->span, *cpu_map);
+               cpumask_and(sched_domain_span(sd),
+                           sched_domain_span(sd), cpu_map);
  #endif
  
                p = sd;
-               sd = &per_cpu(phys_domains, i);
+               sd = &per_cpu(phys_domains, i).sd;
                SD_INIT(sd, CPU);
                set_domain_attribute(sd, attr);
-               sd->span = *nodemask;
+               cpumask_copy(sched_domain_span(sd), nodemask);
                sd->parent = p;
                if (p)
                        p->child = sd;
  
  #ifdef CONFIG_SCHED_MC
                p = sd;
-               sd = &per_cpu(core_domains, i);
+               sd = &per_cpu(core_domains, i).sd;
                SD_INIT(sd, MC);
                set_domain_attribute(sd, attr);
-               sd->span = cpu_coregroup_map(i);
-               cpus_and(sd->span, sd->span, *cpu_map);
+               cpumask_and(sched_domain_span(sd), cpu_map,
+                                                  cpu_coregroup_mask(i));
                sd->parent = p;
                p->child = sd;
                cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
  
  #ifdef CONFIG_SCHED_SMT
                p = sd;
-               sd = &per_cpu(cpu_domains, i);
+               sd = &per_cpu(cpu_domains, i).sd;
                SD_INIT(sd, SIBLING);
                set_domain_attribute(sd, attr);
-               sd->span = per_cpu(cpu_sibling_map, i);
-               cpus_and(sd->span, sd->span, *cpu_map);
+               cpumask_and(sched_domain_span(sd),
+                           &per_cpu(cpu_sibling_map, i), cpu_map);
                sd->parent = p;
                p->child = sd;
                cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
  
  #ifdef CONFIG_SCHED_SMT
        /* Set up CPU (sibling) groups */
-       for_each_cpu_mask_nr(i, *cpu_map) {
-               SCHED_CPUMASK_VAR(this_sibling_map, allmasks);
-               SCHED_CPUMASK_VAR(send_covered, allmasks);
-               *this_sibling_map = per_cpu(cpu_sibling_map, i);
-               cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map);
-               if (i != first_cpu(*this_sibling_map))
+       for_each_cpu(i, cpu_map) {
+               cpumask_and(this_sibling_map,
+                           &per_cpu(cpu_sibling_map, i), cpu_map);
+               if (i != cpumask_first(this_sibling_map))
                        continue;
  
                init_sched_build_groups(this_sibling_map, cpu_map,
  
  #ifdef CONFIG_SCHED_MC
        /* Set up multi-core groups */
-       for_each_cpu_mask_nr(i, *cpu_map) {
-               SCHED_CPUMASK_VAR(this_core_map, allmasks);
-               SCHED_CPUMASK_VAR(send_covered, allmasks);
-               *this_core_map = cpu_coregroup_map(i);
-               cpus_and(*this_core_map, *this_core_map, *cpu_map);
-               if (i != first_cpu(*this_core_map))
+       for_each_cpu(i, cpu_map) {
+               cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map);
+               if (i != cpumask_first(this_core_map))
                        continue;
  
                init_sched_build_groups(this_core_map, cpu_map,
  
        /* Set up physical groups */
        for (i = 0; i < nr_node_ids; i++) {
-               SCHED_CPUMASK_VAR(nodemask, allmasks);
-               SCHED_CPUMASK_VAR(send_covered, allmasks);
-               *nodemask = node_to_cpumask(i);
-               cpus_and(*nodemask, *nodemask, *cpu_map);
-               if (cpus_empty(*nodemask))
+               cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
+               if (cpumask_empty(nodemask))
                        continue;
  
                init_sched_build_groups(nodemask, cpu_map,
  #ifdef CONFIG_NUMA
        /* Set up node groups */
        if (sd_allnodes) {
-               SCHED_CPUMASK_VAR(send_covered, allmasks);
                init_sched_build_groups(cpu_map, cpu_map,
                                        &cpu_to_allnodes_group,
                                        send_covered, tmpmask);
        for (i = 0; i < nr_node_ids; i++) {
                /* Set up node groups */
                struct sched_group *sg, *prev;
-               SCHED_CPUMASK_VAR(nodemask, allmasks);
-               SCHED_CPUMASK_VAR(domainspan, allmasks);
-               SCHED_CPUMASK_VAR(covered, allmasks);
                int j;
  
-               *nodemask = node_to_cpumask(i);
-               cpus_clear(*covered);
-               cpus_and(*nodemask, *nodemask, *cpu_map);
-               if (cpus_empty(*nodemask)) {
+               cpumask_clear(covered);
+               cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
+               if (cpumask_empty(nodemask)) {
                        sched_group_nodes[i] = NULL;
                        continue;
                }
  
                sched_domain_node_span(i, domainspan);
-               cpus_and(*domainspan, *domainspan, *cpu_map);
+               cpumask_and(domainspan, domainspan, cpu_map);
  
-               sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
+               sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
+                                 GFP_KERNEL, i);
                if (!sg) {
                        printk(KERN_WARNING "Can not alloc domain group for "
                                "node %d\n", i);
                        goto error;
                }
                sched_group_nodes[i] = sg;
-               for_each_cpu_mask_nr(j, *nodemask) {
+               for_each_cpu(j, nodemask) {
                        struct sched_domain *sd;
  
-                       sd = &per_cpu(node_domains, j);
+                       sd = &per_cpu(node_domains, j).sd;
                        sd->groups = sg;
                }
                sg->__cpu_power = 0;
-               sg->cpumask = *nodemask;
+               cpumask_copy(sched_group_cpus(sg), nodemask);
                sg->next = sg;
-               cpus_or(*covered, *covered, *nodemask);
+               cpumask_or(covered, covered, nodemask);
                prev = sg;
  
                for (j = 0; j < nr_node_ids; j++) {
-                       SCHED_CPUMASK_VAR(notcovered, allmasks);
                        int n = (i + j) % nr_node_ids;
-                       node_to_cpumask_ptr(pnodemask, n);
  
-                       cpus_complement(*notcovered, *covered);
-                       cpus_and(*tmpmask, *notcovered, *cpu_map);
-                       cpus_and(*tmpmask, *tmpmask, *domainspan);
-                       if (cpus_empty(*tmpmask))
+                       cpumask_complement(notcovered, covered);
+                       cpumask_and(tmpmask, notcovered, cpu_map);
+                       cpumask_and(tmpmask, tmpmask, domainspan);
+                       if (cpumask_empty(tmpmask))
                                break;
  
-                       cpus_and(*tmpmask, *tmpmask, *pnodemask);
-                       if (cpus_empty(*tmpmask))
+                       cpumask_and(tmpmask, tmpmask, cpumask_of_node(n));
+                       if (cpumask_empty(tmpmask))
                                continue;
  
-                       sg = kmalloc_node(sizeof(struct sched_group),
+                       sg = kmalloc_node(sizeof(struct sched_group) +
+                                         cpumask_size(),
                                          GFP_KERNEL, i);
                        if (!sg) {
                                printk(KERN_WARNING
                                goto error;
                        }
                        sg->__cpu_power = 0;
-                       sg->cpumask = *tmpmask;
+                       cpumask_copy(sched_group_cpus(sg), tmpmask);
                        sg->next = prev->next;
-                       cpus_or(*covered, *covered, *tmpmask);
+                       cpumask_or(covered, covered, tmpmask);
                        prev->next = sg;
                        prev = sg;
                }
  
        /* Calculate CPU power for physical packages and nodes */
  #ifdef CONFIG_SCHED_SMT
-       for_each_cpu_mask_nr(i, *cpu_map) {
-               struct sched_domain *sd = &per_cpu(cpu_domains, i);
+       for_each_cpu(i, cpu_map) {
+               struct sched_domain *sd = &per_cpu(cpu_domains, i).sd;
  
                init_sched_groups_power(i, sd);
        }
  #endif
  #ifdef CONFIG_SCHED_MC
-       for_each_cpu_mask_nr(i, *cpu_map) {
-               struct sched_domain *sd = &per_cpu(core_domains, i);
+       for_each_cpu(i, cpu_map) {
+               struct sched_domain *sd = &per_cpu(core_domains, i).sd;
  
                init_sched_groups_power(i, sd);
        }
  #endif
  
-       for_each_cpu_mask_nr(i, *cpu_map) {
-               struct sched_domain *sd = &per_cpu(phys_domains, i);
+       for_each_cpu(i, cpu_map) {
+               struct sched_domain *sd = &per_cpu(phys_domains, i).sd;
  
                init_sched_groups_power(i, sd);
        }
        if (sd_allnodes) {
                struct sched_group *sg;
  
-               cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg,
+               cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
                                                                tmpmask);
                init_numa_sched_groups_power(sg);
        }
  #endif
  
        /* Attach the domains */
-       for_each_cpu_mask_nr(i, *cpu_map) {
+       for_each_cpu(i, cpu_map) {
                struct sched_domain *sd;
  #ifdef CONFIG_SCHED_SMT
-               sd = &per_cpu(cpu_domains, i);
+               sd = &per_cpu(cpu_domains, i).sd;
  #elif defined(CONFIG_SCHED_MC)
-               sd = &per_cpu(core_domains, i);
+               sd = &per_cpu(core_domains, i).sd;
  #else
-               sd = &per_cpu(phys_domains, i);
+               sd = &per_cpu(phys_domains, i).sd;
  #endif
                cpu_attach_domain(sd, rd, i);
        }
  
-       sched_cpumask_free(allmasks);
-       return 0;
+       err = 0;
+ free_tmpmask:
+       free_cpumask_var(tmpmask);
+ free_send_covered:
+       free_cpumask_var(send_covered);
+ free_this_core_map:
+       free_cpumask_var(this_core_map);
+ free_this_sibling_map:
+       free_cpumask_var(this_sibling_map);
+ free_nodemask:
+       free_cpumask_var(nodemask);
+ free_notcovered:
+ #ifdef CONFIG_NUMA
+       free_cpumask_var(notcovered);
+ free_covered:
+       free_cpumask_var(covered);
+ free_domainspan:
+       free_cpumask_var(domainspan);
+ out:
+ #endif
+       return err;
+ free_sched_groups:
+ #ifdef CONFIG_NUMA
+       kfree(sched_group_nodes);
+ #endif
+       goto free_tmpmask;
  
  #ifdef CONFIG_NUMA
  error:
        free_sched_groups(cpu_map, tmpmask);
-       sched_cpumask_free(allmasks);
-       kfree(rd);
-       return -ENOMEM;
+       free_rootdomain(rd);
+       goto free_tmpmask;
  #endif
  }
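
The error handling above follows the cpumask_var_t idiom: every scratch mask is allocated up front with alloc_cpumask_var(), and both the failure paths and the success path fall through a ladder of labels that free the masks in reverse order. A minimal sketch of the same pattern, with illustrative names only (alloc_cpumask_var() and free_cpumask_var() come from <linux/cpumask.h>):

	static int example_build(void)
	{
		cpumask_var_t a, b;
		int err = -ENOMEM;

		if (!alloc_cpumask_var(&a, GFP_KERNEL))
			goto out;
		if (!alloc_cpumask_var(&b, GFP_KERNEL))
			goto free_a;

		/* ... build domains using a and b ... */

		err = 0;		/* success also falls through the frees */
		free_cpumask_var(b);
	free_a:
		free_cpumask_var(a);
	out:
		return err;
	}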
  
- static int build_sched_domains(const cpumask_t *cpu_map)
+ static int build_sched_domains(const struct cpumask *cpu_map)
  {
        return __build_sched_domains(cpu_map, NULL);
  }
  
- static cpumask_t *doms_cur;   /* current sched domains */
+ static struct cpumask *doms_cur;      /* current sched domains */
  static int ndoms_cur;         /* number of sched domains in 'doms_cur' */
  static struct sched_domain_attr *dattr_cur;
                                /* attributes of custom domains in 'doms_cur' */
  
  /*
   * Special case: If a kmalloc of a doms_cur partition (array of
-  * cpumask_t) fails, then fallback to a single sched domain,
-  * as determined by the single cpumask_t fallback_doms.
+  * cpumask) fails, then fall back to a single sched domain,
+  * as determined by the single cpumask fallback_doms.
   */
- static cpumask_t fallback_doms;
+ static cpumask_var_t fallback_doms;
  
  /*
   * arch_update_cpu_topology lets virtualized architectures update the
@@@ -7708,16 -7848,16 +7843,16 @@@ int __attribute__((weak)) arch_update_c
   * For now this just excludes isolated cpus, but could be used to
   * exclude other special cases in the future.
   */
- static int arch_init_sched_domains(const cpumask_t *cpu_map)
+ static int arch_init_sched_domains(const struct cpumask *cpu_map)
  {
        int err;
  
        arch_update_cpu_topology();
        ndoms_cur = 1;
-       doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
+       doms_cur = kmalloc(cpumask_size(), GFP_KERNEL);
        if (!doms_cur)
-               doms_cur = &fallback_doms;
-       cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map);
+               doms_cur = fallback_doms;
+       cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map);
        dattr_cur = NULL;
        err = build_sched_domains(doms_cur);
        register_sched_domain_sysctl();
        return err;
  }
  
- static void arch_destroy_sched_domains(const cpumask_t *cpu_map,
-                                      cpumask_t *tmpmask)
+ static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
+                                      struct cpumask *tmpmask)
  {
        free_sched_groups(cpu_map, tmpmask);
  }
   * Detach sched domains from a group of cpus specified in cpu_map
   * These cpus will now be attached to the NULL domain
   */
- static void detach_destroy_domains(const cpumask_t *cpu_map)
+ static void detach_destroy_domains(const struct cpumask *cpu_map)
  {
-       cpumask_t tmpmask;
+       /* Safe because the hotplug lock is held. */
+       static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
        int i;
  
-       for_each_cpu_mask_nr(i, *cpu_map)
+       for_each_cpu(i, cpu_map)
                cpu_attach_domain(NULL, &def_root_domain, i);
        synchronize_sched();
-       arch_destroy_sched_domains(cpu_map, &tmpmask);
+       arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
  }
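
detach_destroy_domains() now takes its scratch mask from a static bitmap rather than the stack; to_cpumask() gives a struct cpumask view of such a bitmap, which is only safe here because the hotplug lock serializes callers. A small sketch of that pattern, with illustrative names (not from the patch):

	/* static scratch mask: only safe if callers are serialized */
	static DECLARE_BITMAP(scratch_bits, NR_CPUS);

	static void example_use_scratch(const struct cpumask *cpu_map)
	{
		struct cpumask *scratch = to_cpumask(scratch_bits);

		cpumask_copy(scratch, cpu_map);
		cpumask_clear_cpu(0, scratch);	/* e.g. drop CPU 0 */
	}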
  
  /* handle null as "default" */
@@@ -7768,7 -7909,7 +7904,7 @@@ static int dattrs_equal(struct sched_do
   * doms_new[] to the current sched domain partitioning, doms_cur[].
   * It destroys each deleted domain and builds each new domain.
   *
-  * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'.
+  * 'doms_new' is an array of cpumask's of length 'ndoms_new'.
   * The masks don't intersect (don't overlap). We should set up one
   * sched domain for each mask. CPUs not in any of the cpumasks will
   * not be load balanced. If the same cpumask appears both in the
   * the single partition 'fallback_doms', it also forces the domains
   * to be rebuilt.
   *
-  * If doms_new == NULL it will be replaced with cpu_online_map.
+  * If doms_new == NULL it will be replaced with cpu_online_mask.
   * ndoms_new == 0 is a special case for destroying existing domains,
   * and it will not create the default domain.
   *
   * Call with hotplug lock held
   */
- void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
+ /* FIXME: Change to struct cpumask *doms_new[] */
+ void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
                             struct sched_domain_attr *dattr_new)
  {
        int i, j, n;
        /* Destroy deleted domains */
        for (i = 0; i < ndoms_cur; i++) {
                for (j = 0; j < n && !new_topology; j++) {
-                       if (cpus_equal(doms_cur[i], doms_new[j])
+                       if (cpumask_equal(&doms_cur[i], &doms_new[j])
                            && dattrs_equal(dattr_cur, i, dattr_new, j))
                                goto match1;
                }
@@@ -7819,15 -7961,15 +7956,15 @@@ match1
  
        if (doms_new == NULL) {
                ndoms_cur = 0;
-               doms_new = &fallback_doms;
-               cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
+               doms_new = fallback_doms;
+               cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map);
                WARN_ON_ONCE(dattr_new);
        }
  
        /* Build new domains */
        for (i = 0; i < ndoms_new; i++) {
                for (j = 0; j < ndoms_cur && !new_topology; j++) {
-                       if (cpus_equal(doms_new[i], doms_cur[j])
+                       if (cpumask_equal(&doms_new[i], &doms_cur[j])
                            && dattrs_equal(dattr_new, i, dattr_cur, j))
                                goto match2;
                }
@@@ -7839,7 -7981,7 +7976,7 @@@ match2
        }
  
        /* Remember the new sched domains */
-       if (doms_cur != &fallback_doms)
+       if (doms_cur != fallback_doms)
                kfree(doms_cur);
        kfree(dattr_cur);       /* kfree(NULL) is safe */
        doms_cur = doms_new;
  }
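
As the block comment above says, partition_sched_domains() takes an array of non-overlapping cpumasks (plus optional attributes) and keeps it as doms_cur until the next repartition, so the caller must not free it. A hedged sketch of a caller, assuming it already holds the hotplug lock as required; 'doms' and the single-domain choice are illustrative, not from the patch:

	struct cpumask *doms = kmalloc(cpumask_size(), GFP_KERNEL);

	if (doms) {
		/* one domain covering every online CPU */
		cpumask_copy(doms, cpu_online_mask);
		partition_sched_domains(1, doms, NULL);
		/* the scheduler now owns 'doms'; do not kfree() it here */
	}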
  
  #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
- int arch_reinit_sched_domains(void)
+ static void arch_reinit_sched_domains(void)
  {
        get_online_cpus();
  
  
        rebuild_sched_domains();
        put_online_cpus();
-       return 0;
  }
  
  static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
  {
-       int ret;
+       unsigned int level = 0;
+       if (sscanf(buf, "%u", &level) != 1)
+               return -EINVAL;
+       /*
+        * level is always positive, so don't check for
+        * level < POWERSAVINGS_BALANCE_NONE, which is 0.
+        * What happens on a 0 or 1 byte write?  Do we
+        * need to check count as well?
+        */
  
-       if (buf[0] != '0' && buf[0] != '1')
+       if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
                return -EINVAL;
  
        if (smt)
-               sched_smt_power_savings = (buf[0] == '1');
+               sched_smt_power_savings = level;
        else
-               sched_mc_power_savings = (buf[0] == '1');
+               sched_mc_power_savings = level;
  
-       ret = arch_reinit_sched_domains();
+       arch_reinit_sched_domains();
  
-       return ret ? ret : count;
+       return count;
  }
  
  #ifdef CONFIG_SCHED_MC
@@@ -7914,7 -8064,7 +8059,7 @@@ static SYSDEV_CLASS_ATTR(sched_smt_powe
                   sched_smt_power_savings_store);
  #endif
  
- int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
+ int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
  {
        int err = 0;
  
@@@ -7979,7 -8129,9 +8124,9 @@@ static int update_runtime(struct notifi
  
  void __init sched_init_smp(void)
  {
-       cpumask_t non_isolated_cpus;
+       cpumask_var_t non_isolated_cpus;
+       alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
  
  #if defined(CONFIG_NUMA)
        sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
  #endif
        get_online_cpus();
        mutex_lock(&sched_domains_mutex);
-       arch_init_sched_domains(&cpu_online_map);
-       cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
-       if (cpus_empty(non_isolated_cpus))
-               cpu_set(smp_processor_id(), non_isolated_cpus);
+       arch_init_sched_domains(cpu_online_mask);
+       cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
+       if (cpumask_empty(non_isolated_cpus))
+               cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
        mutex_unlock(&sched_domains_mutex);
        put_online_cpus();
  
        init_hrtick();
  
        /* Move init over to a non-isolated CPU */
-       if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0)
+       if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
                BUG();
        sched_init_granularity();
+       free_cpumask_var(non_isolated_cpus);
+       alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
+       init_sched_rt_class();
  }
  #else
  void __init sched_init_smp(void)
@@@ -8323,6 -8479,15 +8474,15 @@@ void __init sched_init(void
         */
        current->sched_class = &fair_sched_class;
  
+       /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
+       alloc_bootmem_cpumask_var(&nohz_cpu_mask);
+ #ifdef CONFIG_SMP
+ #ifdef CONFIG_NO_HZ
+       alloc_bootmem_cpumask_var(&nohz.cpu_mask);
+ #endif
+       alloc_bootmem_cpumask_var(&cpu_isolated_map);
+ #endif /* SMP */
        scheduler_running = 1;
  }