Merge branch 'core/percpu' into stackprotector
author Ingo Molnar <mingo@elte.hu>
Sun, 18 Jan 2009 17:37:14 +0000 (18:37 +0100)
committer Ingo Molnar <mingo@elte.hu>
Sun, 18 Jan 2009 17:37:14 +0000 (18:37 +0100)
Conflicts:
arch/x86/include/asm/pda.h
arch/x86/include/asm/system.h

Also, moved include/asm-x86/stackprotector.h to arch/x86/include/asm.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
13 files changed:
arch/x86/Kconfig
arch/x86/include/asm/pda.h
arch/x86/include/asm/stackprotector.h
arch/x86/include/asm/system.h
arch/x86/kernel/process_64.c
arch/x86/mm/fault.c
include/linux/magic.h
include/linux/sched.h
init/main.c
kernel/exit.c
kernel/fork.c
kernel/panic.c
kernel/sched.c

diff --combined arch/x86/Kconfig
@@@ -27,6 -27,7 +27,7 @@@ config X8
        select HAVE_IOREMAP_PROT
        select HAVE_KPROBES
        select ARCH_WANT_OPTIONAL_GPIOLIB
+       select ARCH_WANT_FRAME_POINTERS
        select HAVE_KRETPROBES
        select HAVE_FTRACE_MCOUNT_RECORD
        select HAVE_DYNAMIC_FTRACE
@@@ -586,6 -587,16 +587,16 @@@ config AMD_IOMM
          your BIOS for an option to enable it or if you have an IVRS ACPI
          table.
  
+ config AMD_IOMMU_STATS
+       bool "Export AMD IOMMU statistics to debugfs"
+       depends on AMD_IOMMU
+       select DEBUG_FS
+       help
+         This option enables code in the AMD IOMMU driver to collect various
+         statistics about what's happening in the driver and exports that
+         information to userspace via debugfs.
+         If unsure, say N.
  # need this always selected by IOMMU for the VIA workaround
  config SWIOTLB
        def_bool y if X86_64
  config IOMMU_HELPER
        def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU)
  
+ config IOMMU_API
+       def_bool (AMD_IOMMU || DMAR)
  config MAXSMP
        bool "Configure Maximum number of SMP Processors and NUMA Nodes"
-       depends on X86_64 && SMP && BROKEN
+       depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL
+       select CPUMASK_OFFSTACK
        default n
        help
          Configure maximum number of CPUS and NUMA Nodes for this architecture.
          If unsure, say N.
  
  config NR_CPUS
-       int "Maximum number of CPUs (2-512)" if !MAXSMP
-       range 2 512
-       depends on SMP
+       int "Maximum number of CPUs" if SMP && !MAXSMP
+       range 2 512 if SMP && !MAXSMP
+       default "1" if !SMP
        default "4096" if MAXSMP
-       default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000
-       default "8"
+       default "32" if SMP && (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000)
+       default "8" if SMP
        help
          This allows you to specify the maximum number of CPUs which this
          kernel will support.  The maximum supported value is 512 and the
@@@ -1325,17 -1340,13 +1340,17 @@@ config SECCOM
  
          If unsure, say Y. Only embedded should say N here.
  
 +config CC_STACKPROTECTOR_ALL
 +      bool
 +
  config CC_STACKPROTECTOR
        bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)"
 -      depends on X86_64 && EXPERIMENTAL && BROKEN
 +      depends on X86_64
 +      select CC_STACKPROTECTOR_ALL
        help
 -         This option turns on the -fstack-protector GCC feature. This
 -        feature puts, at the beginning of critical functions, a canary
 -        value on the stack just before the return address, and validates
 +          This option turns on the -fstack-protector GCC feature. This
 +        feature puts, at the beginning of functions, a canary value on
 +        the stack just before the return address, and validates
          the value just before actually returning.  Stack based buffer
          overflows (that need to overwrite this return address) now also
          overwrite the canary, which gets detected and the attack is then
          neutralized via a kernel panic.
  
          This feature requires gcc version 4.2 or above, or a distribution
          gcc with the feature backported. Older versions are automatically
 -        detected and for those versions, this configuration option is ignored.
 -
 -config CC_STACKPROTECTOR_ALL
 -      bool "Use stack-protector for all functions"
 -      depends on CC_STACKPROTECTOR
 -      help
 -        Normally, GCC only inserts the canary value protection for
 -        functions that use large-ish on-stack buffers. By enabling
 -        this option, GCC will be asked to do this for ALL functions.
 +        detected and for those versions, this configuration option is
 +        ignored. (and a warning is printed during bootup)
  
  source kernel/Kconfig.hz
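
A standalone illustration of the canary scheme described in the help text above (not part of this patch; all identifiers below are made up for the sketch). gcc emits the equivalent of the prologue/epilogue written out by hand here; in the kernel the canary source is the PDA field at %gs:40 introduced further down, and the failure hook is __stack_chk_fail() in kernel/panic.c.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Stand-in for the canary source; the x86-64 kernel reads it from %gs:40. */
    static unsigned long stack_chk_guard = 0x595e9fbd94fda766UL;

    /* The kernel's equivalent is __stack_chk_fail(); this sketch just aborts. */
    static void stack_chk_fail(void)
    {
            fprintf(stderr, "stack smashing detected\n");
            abort();
    }

    /* Hand-written version of the compiler-inserted prologue/epilogue. */
    static size_t protected_copy_len(const char *src)
    {
            unsigned long canary = stack_chk_guard;  /* prologue: place canary */
            char buf[64];
            size_t n;

            snprintf(buf, sizeof(buf), "%s", src);   /* the protected body */
            n = strlen(buf);

            if (canary != stack_chk_guard)           /* epilogue: verify canary */
                    stack_chk_fail();                /* never returns */
            return n;
    }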
  
diff --combined arch/x86/include/asm/pda.h
  #include <linux/stddef.h>
  #include <linux/types.h>
  #include <linux/cache.h>
+ #include <linux/threads.h>
  #include <asm/page.h>
+ #include <asm/percpu.h>
  
  /* Per processor datastructure. %gs points to it while the kernel runs */
  struct x8664_pda {
-       struct task_struct *pcurrent;   /* 0  Current process */
-       unsigned long data_offset;      /* 8 Per cpu data offset from linker
-                                          address */
-       unsigned long kernelstack;      /* 16 top of kernel stack for current */
-       unsigned long oldrsp;           /* 24 user rsp for system call */
-       int irqcount;                   /* 32 Irq nesting counter. Starts -1 */
-       unsigned int cpunumber;         /* 36 Logical CPU number */
+       unsigned long unused1;
+       unsigned long unused2;
+       unsigned long unused3;
+       unsigned long unused4;
+       int unused5;
+       unsigned int unused6;           /* 36 was cpunumber */
 -#ifdef CONFIG_CC_STACKPROTECTOR
        unsigned long stack_canary;     /* 40 stack canary value */
                                        /* gcc-ABI: this canary MUST be at
                                           offset 40!!! */
-       char *irqstackptr;
-       short nodenumber;               /* number of current node (32k max) */
 -#endif
        short in_bootmem;               /* pda lives in bootmem */
-       unsigned int __softirq_pending;
-       unsigned int __nmi_count;       /* number of NMI on this CPUs */
-       short mmu_state;
-       short isidle;
-       struct mm_struct *active_mm;
-       unsigned apic_timer_irqs;
-       unsigned irq0_irqs;
-       unsigned irq_resched_count;
-       unsigned irq_call_count;
-       unsigned irq_tlb_count;
-       unsigned irq_thermal_count;
-       unsigned irq_threshold_count;
-       unsigned irq_spurious_count;
  } ____cacheline_aligned_in_smp;
  
- extern struct x8664_pda **_cpu_pda;
+ DECLARE_PER_CPU(struct x8664_pda, __pda);
  extern void pda_init(int);
  
- #define cpu_pda(i) (_cpu_pda[i])
+ #define cpu_pda(cpu)          (&per_cpu(__pda, cpu))
  
- /*
-  * There is no fast way to get the base address of the PDA, all the accesses
-  * have to mention %fs/%gs.  So it needs to be done this Torvaldian way.
-  */
- extern void __bad_pda_field(void) __attribute__((noreturn));
- /*
-  * proxy_pda doesn't actually exist, but tell gcc it is accessed for
-  * all PDA accesses so it gets read/write dependencies right.
-  */
- extern struct x8664_pda _proxy_pda;
- #define pda_offset(field) offsetof(struct x8664_pda, field)
- #define pda_to_op(op, field, val)                                     \
- do {                                                                  \
-       typedef typeof(_proxy_pda.field) T__;                           \
-       if (0) { T__ tmp__; tmp__ = (val); }    /* type checking */     \
-       switch (sizeof(_proxy_pda.field)) {                             \
-       case 2:                                                         \
-               asm(op "w %1,%%gs:%c2" :                                \
-                   "+m" (_proxy_pda.field) :                           \
-                   "ri" ((T__)val),                                    \
-                   "i"(pda_offset(field)));                            \
-               break;                                                  \
-       case 4:                                                         \
-               asm(op "l %1,%%gs:%c2" :                                \
-                   "+m" (_proxy_pda.field) :                           \
-                   "ri" ((T__)val),                                    \
-                   "i" (pda_offset(field)));                           \
-               break;                                                  \
-       case 8:                                                         \
-               asm(op "q %1,%%gs:%c2":                                 \
-                   "+m" (_proxy_pda.field) :                           \
-                   "ri" ((T__)val),                                    \
-                   "i"(pda_offset(field)));                            \
-               break;                                                  \
-       default:                                                        \
-               __bad_pda_field();                                      \
-       }                                                               \
- } while (0)
- #define pda_from_op(op, field)                        \
- ({                                            \
-       typeof(_proxy_pda.field) ret__;         \
-       switch (sizeof(_proxy_pda.field)) {     \
-       case 2:                                 \
-               asm(op "w %%gs:%c1,%0" :        \
-                   "=r" (ret__) :              \
-                   "i" (pda_offset(field)),    \
-                   "m" (_proxy_pda.field));    \
-               break;                          \
-       case 4:                                 \
-               asm(op "l %%gs:%c1,%0":         \
-                   "=r" (ret__):               \
-                   "i" (pda_offset(field)),    \
-                   "m" (_proxy_pda.field));    \
-               break;                          \
-       case 8:                                 \
-               asm(op "q %%gs:%c1,%0":         \
-                   "=r" (ret__) :              \
-                   "i" (pda_offset(field)),    \
-                   "m" (_proxy_pda.field));    \
-               break;                          \
-       default:                                \
-               __bad_pda_field();              \
-       }                                       \
-       ret__;                                  \
- })
- #define read_pda(field)               pda_from_op("mov", field)
- #define write_pda(field, val) pda_to_op("mov", field, val)
- #define add_pda(field, val)   pda_to_op("add", field, val)
- #define sub_pda(field, val)   pda_to_op("sub", field, val)
- #define or_pda(field, val)    pda_to_op("or", field, val)
+ #define read_pda(field)               percpu_read(__pda.field)
+ #define write_pda(field, val) percpu_write(__pda.field, val)
+ #define add_pda(field, val)   percpu_add(__pda.field, val)
+ #define sub_pda(field, val)   percpu_sub(__pda.field, val)
+ #define or_pda(field, val)    percpu_or(__pda.field, val)
  
  /* This is not atomic against other CPUs -- CPU preemption needs to be off */
  #define test_and_clear_bit_pda(bit, field)                            \
- ({                                                                    \
-       int old__;                                                      \
-       asm volatile("btr %2,%%gs:%c3\n\tsbbl %0,%0"                    \
-                    : "=r" (old__), "+m" (_proxy_pda.field)            \
-                    : "dIr" (bit), "i" (pda_offset(field)) : "memory");\
-       old__;                                                          \
- })
+       x86_test_and_clear_bit_percpu(bit, __pda.field)
  
  #endif
  
- #define PDA_STACKOFFSET (5*8)
 +#define refresh_stack_canary() write_pda(stack_canary, current->stack_canary)
 +
  #endif /* _ASM_X86_PDA_H */
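
A usage sketch (not from the patch; dump_canaries() is a hypothetical helper, assuming <linux/kernel.h>, <linux/percpu.h> and the pda.h above): with the PDA now a per-cpu variable, cpu_pda() resolves to &per_cpu(__pda, cpu) instead of indexing the old _cpu_pda[] pointer array, and read_pda()/write_pda() are thin wrappers around the new percpu accessors.

    static void dump_canaries(void)
    {
            int cpu;

            /* Remote access: ordinary per-cpu addressing, no pointer array. */
            for_each_online_cpu(cpu)
                    printk(KERN_DEBUG "cpu%d canary: %#lx\n",
                           cpu, cpu_pda(cpu)->stack_canary);

            /* Local fast path: a single %gs-relative load via percpu_read(). */
            printk(KERN_DEBUG "this cpu canary: %#lx\n",
                   read_pda(stack_canary));
    }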
diff --combined arch/x86/include/asm/stackprotector.h
index 0000000,0000000..c7f0d10
new file mode 100644
--- /dev/null
--- /dev/null
@@@ -1,0 -1,0 +1,39 @@@
++#ifndef _ASM_STACKPROTECTOR_H
++#define _ASM_STACKPROTECTOR_H 1
++
++#include <asm/tsc.h>
++#include <asm/pda.h>
++
++/*
++ * Initialize the stackprotector canary value.
++ *
++ * NOTE: this must only be called from functions that never return,
++ * and it must always be inlined.
++ */
++static __always_inline void boot_init_stack_canary(void)
++{
++      u64 canary;
++      u64 tsc;
++
++      /*
++       * If we're the non-boot CPU, nothing set the PDA stack
++       * canary up for us - and if we are the boot CPU we have
++       * a 0 stack canary. This is a good place for updating
++       * it, as we won't ever return from this function (so the
++       * invalid canaries already on the stack won't ever
++       * trigger).
++       *
++       * We use both the random pool and the current TSC as a source
++       * of randomness. The TSC only matters for very early init,
++       * where it already has some randomness on most systems. Later
++       * on during the bootup the random pool has true entropy too.
++       */
++      get_random_bytes(&canary, sizeof(canary));
++      tsc = __native_read_tsc();
++      canary += tsc + (tsc << 32UL);
++
++      current->stack_canary = canary;
++      write_pda(stack_canary, canary);
++}
++
++#endif
diff --combined arch/x86/include/asm/system.h
@@@ -94,9 -94,7 +94,9 @@@ do {
             "call __switch_to\n\t"                                       \
             ".globl thread_return\n"                                     \
             "thread_return:\n\t"                                         \
-            "movq %%gs:%P[pda_pcurrent],%%rsi\n\t"                       \
+            "movq "__percpu_arg([current_task])",%%rsi\n\t"              \
 +           "movq %P[task_canary](%%rsi),%%r8\n\t"                       \
 +           "movq %%r8,%%gs:%P[pda_canary]\n\t"                          \
             "movq %P[thread_info](%%rsi),%%r8\n\t"                       \
             LOCK_PREFIX "btr  %[tif_fork],%P[ti_flags](%%r8)\n\t"        \
             "movq %%rax,%%rdi\n\t"                                       \
               [ti_flags] "i" (offsetof(struct thread_info, flags)),      \
               [tif_fork] "i" (TIF_FORK),                                 \
               [thread_info] "i" (offsetof(struct task_struct, stack)),   \
 -             [current_task] "m" (per_cpu_var(current_task))             \
 +             [task_canary] "i" (offsetof(struct task_struct, stack_canary)),\
-              [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)), \
++             [current_task] "m" (per_cpu_var(current_task)),            \
 +             [pda_canary] "i" (offsetof(struct x8664_pda, stack_canary))\
             : "memory", "cc" __EXTRA_CLOBBER)
  #endif
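
A rough C-level equivalent of the two asm lines added to switch_to() above (illustrative only; the helper name is made up, and the real work has to stay in the asm so it happens before any instrumented C code runs on the new task's behalf):

    /* movq %P[task_canary](%%rsi),%%r8 ; movq %%r8,%%gs:%P[pda_canary] */
    static __always_inline void switch_stack_canary(struct task_struct *next)
    {
            /* Copy the incoming task's canary into the PDA slot (%gs:40)
             * that -fstack-protector reads. */
            write_pda(stack_canary, next->stack_canary);
    }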
  
diff --combined arch/x86/kernel/process_64.c
@@@ -16,7 -16,6 +16,7 @@@
  
  #include <stdarg.h>
  
 +#include <linux/stackprotector.h>
  #include <linux/cpu.h>
  #include <linux/errno.h>
  #include <linux/sched.h>
  
  asmlinkage extern void ret_from_fork(void);
  
+ DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
+ EXPORT_PER_CPU_SYMBOL(current_task);
+ DEFINE_PER_CPU(unsigned long, old_rsp);
+ static DEFINE_PER_CPU(unsigned char, is_idle);
  unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
  
  static ATOMIC_NOTIFIER_HEAD(idle_notifier);
@@@ -76,13 -81,13 +82,13 @@@ EXPORT_SYMBOL_GPL(idle_notifier_unregis
  
  void enter_idle(void)
  {
-       write_pda(isidle, 1);
+       percpu_write(is_idle, 1);
        atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
  }
  
  static void __exit_idle(void)
  {
-       if (test_and_clear_bit_pda(0, isidle) == 0)
+       if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
                return;
        atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
  }
@@@ -112,17 -117,6 +118,17 @@@ static inline void play_dead(void
  void cpu_idle(void)
  {
        current_thread_info()->status |= TS_POLLING;
 +
 +      /*
 +       * If we're the non-boot CPU, nothing set the PDA stack
 +       * canary up for us - and if we are the boot CPU we have
 +       * a 0 stack canary. This is a good place for updating
 +       * it, as we won't ever return from this function (so the
 +       * invalid canaries already on the stack won't ever
 +       * trigger):
 +       */
 +      boot_init_stack_canary();
 +
        /* endless idle loop with no priority at all */
        while (1) {
                tick_nohz_stop_sched_tick(1);
@@@ -404,7 -398,7 +410,7 @@@ start_thread(struct pt_regs *regs, unsi
        load_gs_index(0);
        regs->ip                = new_ip;
        regs->sp                = new_sp;
-       write_pda(oldrsp, new_sp);
+       percpu_write(old_rsp, new_sp);
        regs->cs                = __USER_CS;
        regs->ss                = __USER_DS;
        regs->flags             = 0x200;
@@@ -625,14 -619,15 +631,14 @@@ __switch_to(struct task_struct *prev_p
        /*
         * Switch the PDA and FPU contexts.
         */
-       prev->usersp = read_pda(oldrsp);
-       write_pda(oldrsp, next->usersp);
-       write_pda(pcurrent, next_p);
+       prev->usersp = percpu_read(old_rsp);
+       percpu_write(old_rsp, next->usersp);
+       percpu_write(current_task, next_p);
  
-       write_pda(kernelstack,
+       percpu_write(kernel_stack,
                  (unsigned long)task_stack_page(next_p) +
-                 THREAD_SIZE - PDA_STACKOFFSET);
+                 THREAD_SIZE - KERNEL_STACK_OFFSET);
  #ifdef CONFIG_CC_STACKPROTECTOR
 -      write_pda(stack_canary, next_p->stack_canary);
        /*
         * Build time only check to make sure the stack_canary is at
         * offset 40 in the pda; this is a gcc ABI requirement
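
The build-time check referred to in the (truncated) comment above is not visible in this hunk; one way to express such a check is sketched below (assumptions: it lives inside a function such as __switch_to(), with <linux/kernel.h>, <linux/stddef.h> and <asm/pda.h> available; the exact statement in the patch may differ).

    static inline void stack_canary_abi_check(void)   /* hypothetical wrapper */
    {
            /* gcc's x86-64 stackprotector ABI hard-codes the %gs:40 slot. */
            BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
    }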
diff --combined arch/x86/mm/fault.c
@@@ -26,7 -26,6 +26,7 @@@
  #include <linux/kprobes.h>
  #include <linux/uaccess.h>
  #include <linux/kdebug.h>
 +#include <linux/magic.h>
  
  #include <asm/system.h>
  #include <asm/desc.h>
@@@ -535,7 -534,7 +535,7 @@@ static int vmalloc_fault(unsigned long 
           happen within a race in page table update. In the latter
           case just flush. */
  
-       pgd = pgd_offset(current->mm ?: &init_mm, address);
+       pgd = pgd_offset(current->active_mm, address);
        pgd_ref = pgd_offset_k(address);
        if (pgd_none(*pgd_ref))
                return -1;
@@@ -590,8 -589,6 +590,8 @@@ void __kprobes do_page_fault(struct pt_
        unsigned long address;
        int write, si_code;
        int fault;
 +      unsigned long *stackend;
 +
  #ifdef CONFIG_X86_64
        unsigned long flags;
        int sig;
        if (unlikely(in_atomic() || !mm))
                goto bad_area_nosemaphore;
  
- again:
        /*
         * When running in the kernel we expect faults to occur only to
         * addresses in user space.  All other faults represent errors in the
@@@ -845,10 -841,6 +844,10 @@@ no_context
  
        show_fault_oops(regs, error_code, address);
  
 +      stackend = end_of_stack(tsk);
 +      if (*stackend != STACK_END_MAGIC)
 +              printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");
 +
        tsk->thread.cr2 = address;
        tsk->thread.trap_no = 14;
        tsk->thread.error_code = error_code;
        oops_end(flags, regs, sig);
  #endif
  
- /*
-  * We ran out of memory, or some other thing happened to us that made
-  * us unable to handle the page fault gracefully.
-  */
  out_of_memory:
+       /*
+        * We ran out of memory, call the OOM killer, and return to userspace
+        * (which will retry the fault, or kill us if we got oom-killed).
+        */
        up_read(&mm->mmap_sem);
-       if (is_global_init(tsk)) {
-               yield();
-               /*
-                * Re-lookup the vma - in theory the vma tree might
-                * have changed:
-                */
-               goto again;
-       }
-       printk("VM: killing process %s\n", tsk->comm);
-       if (error_code & PF_USER)
-               do_group_exit(SIGKILL);
-       goto no_context;
+       pagefault_out_of_memory();
+       return;
  
  do_sigbus:
        up_read(&mm->mmap_sem);
diff --combined include/linux/magic.h
@@@ -13,6 -13,7 +13,7 @@@
  #define EFS_SUPER_MAGIC               0x414A53
  #define EXT2_SUPER_MAGIC      0xEF53
  #define EXT3_SUPER_MAGIC      0xEF53
+ #define XENFS_SUPER_MAGIC     0xabba1974
  #define EXT4_SUPER_MAGIC      0xEF53
  #define HPFS_SUPER_MAGIC      0xf995e849
  #define ISOFS_SUPER_MAGIC     0x9660
@@@ -46,5 -47,4 +47,5 @@@
  #define FUTEXFS_SUPER_MAGIC   0xBAD1DEA
  #define INOTIFYFS_SUPER_MAGIC 0x2BAD1DEA
  
 +#define STACK_END_MAGIC               0x57AC6E9D
  #endif /* __LINUX_MAGIC_H__ */
diff --combined include/linux/sched.h
@@@ -250,7 -250,7 +250,7 @@@ extern void init_idle_bootup_task(struc
  extern int runqueue_is_locked(void);
  extern void task_rq_unlock_wait(struct task_struct *p);
  
- extern cpumask_t nohz_cpu_mask;
+ extern cpumask_var_t nohz_cpu_mask;
  #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
  extern int select_nohz_load_balancer(int cpu);
  #else
@@@ -284,7 -284,6 +284,6 @@@ long io_schedule_timeout(long timeout)
  
  extern void cpu_init (void);
  extern void trap_init(void);
- extern void account_process_tick(struct task_struct *task, int user);
  extern void update_process_times(int user);
  extern void scheduler_tick(void);
  
@@@ -387,6 -386,9 +386,9 @@@ extern void arch_unmap_area_topdown(str
                (mm)->hiwater_vm = (mm)->total_vm;      \
  } while (0)
  
+ #define get_mm_hiwater_rss(mm)        max((mm)->hiwater_rss, get_mm_rss(mm))
+ #define get_mm_hiwater_vm(mm) max((mm)->hiwater_vm, (mm)->total_vm)
  extern void set_dumpable(struct mm_struct *mm, int value);
  extern int get_dumpable(struct mm_struct *mm);
  
@@@ -758,20 -760,51 +760,51 @@@ enum cpu_idle_type 
  #define SD_SERIALIZE          1024    /* Only a single load balancing instance */
  #define SD_WAKE_IDLE_FAR      2048    /* Gain latency sacrificing cache hit */
  
- #define BALANCE_FOR_MC_POWER  \
-       (sched_smt_power_savings ? SD_POWERSAVINGS_BALANCE : 0)
+ enum powersavings_balance_level {
+       POWERSAVINGS_BALANCE_NONE = 0,  /* No power saving load balance */
+       POWERSAVINGS_BALANCE_BASIC,     /* Fill one thread/core/package
+                                        * first for long running threads
+                                        */
+       POWERSAVINGS_BALANCE_WAKEUP,    /* Also bias task wakeups to semi-idle
+                                        * cpu package for power savings
+                                        */
+       MAX_POWERSAVINGS_BALANCE_LEVELS
+ };
  
- #define BALANCE_FOR_PKG_POWER \
-       ((sched_mc_power_savings || sched_smt_power_savings) ?  \
-        SD_POWERSAVINGS_BALANCE : 0)
+ extern int sched_mc_power_savings, sched_smt_power_savings;
  
- #define test_sd_parent(sd, flag)      ((sd->parent &&         \
-                                        (sd->parent->flags & flag)) ? 1 : 0)
+ static inline int sd_balance_for_mc_power(void)
+ {
+       if (sched_smt_power_savings)
+               return SD_POWERSAVINGS_BALANCE;
  
+       return 0;
+ }
+ static inline int sd_balance_for_package_power(void)
+ {
+       if (sched_mc_power_savings | sched_smt_power_savings)
+               return SD_POWERSAVINGS_BALANCE;
+       return 0;
+ }
+ /*
+  * Optimise SD flags for power savings:
+  * SD_BALANCE_NEWIDLE helps aggressive task consolidation and power savings.
+  * Keep default SD flags if sched_{smt,mc}_power_saving=0
+  */
+ static inline int sd_power_saving_flags(void)
+ {
+       if (sched_mc_power_savings | sched_smt_power_savings)
+               return SD_BALANCE_NEWIDLE;
+       return 0;
+ }
  
  struct sched_group {
        struct sched_group *next;       /* Must be a circular list */
-       cpumask_t cpumask;
  
        /*
         * CPU power of this group, SCHED_LOAD_SCALE being max power for a
         * (see include/linux/reciprocal_div.h)
         */
        u32 reciprocal_cpu_power;
+       unsigned long cpumask[];
  };
  
+ static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
+ {
+       return to_cpumask(sg->cpumask);
+ }
  enum sched_domain_level {
        SD_LV_NONE = 0,
        SD_LV_SIBLING,
@@@ -809,7 -849,6 +849,6 @@@ struct sched_domain 
        struct sched_domain *parent;    /* top domain must be null terminated */
        struct sched_domain *child;     /* bottom domain must be null terminated */
        struct sched_group *groups;     /* the balancing groups of the domain */
-       cpumask_t span;                 /* span of all CPUs in this domain */
        unsigned long min_interval;     /* Minimum balance interval ms */
        unsigned long max_interval;     /* Maximum balance interval ms */
        unsigned int busy_factor;       /* less balancing by factor if busy */
  #ifdef CONFIG_SCHED_DEBUG
        char *name;
  #endif
+       /* span of all CPUs in this domain */
+       unsigned long span[];
  };
  
- extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
+ static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
+ {
+       return to_cpumask(sd->span);
+ }
+ extern void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
                                    struct sched_domain_attr *dattr_new);
- extern int arch_reinit_sched_domains(void);
+ /* Test a flag in parent sched domain */
+ static inline int test_sd_parent(struct sched_domain *sd, int flag)
+ {
+       if (sd->parent && (sd->parent->flags & flag))
+               return 1;
+       return 0;
+ }
  
  #else /* CONFIG_SMP */
  
  struct sched_domain_attr;
  
  static inline void
- partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
+ partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
                        struct sched_domain_attr *dattr_new)
  {
  }
@@@ -926,7 -981,7 +981,7 @@@ struct sched_class 
        void (*task_wake_up) (struct rq *this_rq, struct task_struct *task);
  
        void (*set_cpus_allowed)(struct task_struct *p,
-                                const cpumask_t *newmask);
+                                const struct cpumask *newmask);
  
        void (*rq_online)(struct rq *rq);
        void (*rq_offline)(struct rq *rq);
@@@ -1102,9 -1157,10 +1157,9 @@@ struct task_struct 
        pid_t pid;
        pid_t tgid;
  
 -#ifdef CONFIG_CC_STACKPROTECTOR
        /* Canary value for the -fstack-protector gcc feature */
        unsigned long stack_canary;
 -#endif
 +
        /* 
         * pointers to (original) parent process, youngest child, younger sibling,
         * older sibling, respectively.  (p->father can be replaced with 
@@@ -1578,12 -1634,12 +1633,12 @@@ extern cputime_t task_gtime(struct task
  
  #ifdef CONFIG_SMP
  extern int set_cpus_allowed_ptr(struct task_struct *p,
-                               const cpumask_t *new_mask);
+                               const struct cpumask *new_mask);
  #else
  static inline int set_cpus_allowed_ptr(struct task_struct *p,
-                                      const cpumask_t *new_mask)
+                                      const struct cpumask *new_mask)
  {
-       if (!cpu_isset(0, *new_mask))
+       if (!cpumask_test_cpu(0, new_mask))
                return -EINVAL;
        return 0;
  }
@@@ -1650,16 -1706,16 +1705,16 @@@ extern void wake_up_idle_cpu(int cpu)
  static inline void wake_up_idle_cpu(int cpu) { }
  #endif
  
- #ifdef CONFIG_SCHED_DEBUG
  extern unsigned int sysctl_sched_latency;
  extern unsigned int sysctl_sched_min_granularity;
  extern unsigned int sysctl_sched_wakeup_granularity;
+ extern unsigned int sysctl_sched_shares_ratelimit;
+ extern unsigned int sysctl_sched_shares_thresh;
+ #ifdef CONFIG_SCHED_DEBUG
  extern unsigned int sysctl_sched_child_runs_first;
  extern unsigned int sysctl_sched_features;
  extern unsigned int sysctl_sched_migration_cost;
  extern unsigned int sysctl_sched_nr_migrate;
- extern unsigned int sysctl_sched_shares_ratelimit;
- extern unsigned int sysctl_sched_shares_thresh;
  
  int sched_nr_latency_handler(struct ctl_table *table, int write,
                struct file *file, void __user *buffer, size_t *length,
@@@ -2010,19 -2066,6 +2065,19 @@@ static inline int object_is_on_stack(vo
  
  extern void thread_info_cache_init(void);
  
 +#ifdef CONFIG_DEBUG_STACK_USAGE
 +static inline unsigned long stack_not_used(struct task_struct *p)
 +{
 +      unsigned long *n = end_of_stack(p);
 +
 +      do {    /* Skip over canary */
 +              n++;
 +      } while (!*n);
 +
 +      return (unsigned long)n - (unsigned long)end_of_stack(p);
 +}
 +#endif
 +
  /* set thread flags in other task's structures
   * - see asm/thread_info.h for TIF_xxxx flags available
   */
@@@ -2207,10 -2250,8 +2262,8 @@@ __trace_special(void *__tr, void *__dat
  }
  #endif
  
- extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask);
- extern long sched_getaffinity(pid_t pid, cpumask_t *mask);
- extern int sched_mc_power_savings, sched_smt_power_savings;
+ extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
+ extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
  
  extern void normalize_rt_tasks(void);
  
diff --combined init/main.c
@@@ -14,7 -14,6 +14,7 @@@
  #include <linux/proc_fs.h>
  #include <linux/kernel.h>
  #include <linux/syscalls.h>
 +#include <linux/stackprotector.h>
  #include <linux/string.h>
  #include <linux/ctype.h>
  #include <linux/delay.h>
@@@ -51,7 -50,6 +51,6 @@@
  #include <linux/rmap.h>
  #include <linux/mempolicy.h>
  #include <linux/key.h>
- #include <linux/unwind.h>
  #include <linux/buffer_head.h>
  #include <linux/page_cgroup.h>
  #include <linux/debug_locks.h>
@@@ -64,6 -62,7 +63,7 @@@
  #include <linux/signal.h>
  #include <linux/idr.h>
  #include <linux/ftrace.h>
+ #include <linux/async.h>
  #include <trace/boot.h>
  
  #include <asm/io.h>
  #include <asm/smp.h>
  #endif
  
- /*
-  * This is one of the first .c files built. Error out early if we have compiler
-  * trouble.
-  */
- #if __GNUC__ == 4 && __GNUC_MINOR__ == 1 && __GNUC_PATCHLEVEL__ == 0
- #warning gcc-4.1.0 is known to miscompile the kernel.  A different compiler version is recommended.
- #endif
  static int kernel_init(void *);
  
  extern void init_IRQ(void);
@@@ -118,7 -108,7 +109,7 @@@ EXPORT_SYMBOL(system_state)
  
  extern void time_init(void);
  /* Default late time init is NULL. archs can override this later. */
- void (*late_time_init)(void);
+ void (*__initdata late_time_init)(void);
  extern void softirq_init(void);
  
  /* Untouched command line saved by arch-specific code. */
@@@ -381,12 -371,7 +372,7 @@@ EXPORT_SYMBOL(nr_cpu_ids)
  /* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */
  static void __init setup_nr_cpu_ids(void)
  {
-       int cpu, highest_cpu = 0;
-       for_each_possible_cpu(cpu)
-               highest_cpu = cpu;
-       nr_cpu_ids = highest_cpu + 1;
+       nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1;
  }
  
  #ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
@@@ -462,7 -447,7 +448,7 @@@ static void __init setup_command_line(c
   * gcc-3.4 accidentally inlines this function, so use noinline.
   */
  
- static void noinline __init_refok rest_init(void)
+ static noinline void __init_refok rest_init(void)
        __releases(kernel_lock)
  {
        int pid;
@@@ -528,9 -513,9 +514,9 @@@ static void __init boot_cpu_init(void
  {
        int cpu = smp_processor_id();
        /* Mark the boot cpu "present", "online" etc for SMP and UP case */
-       cpu_set(cpu, cpu_online_map);
-       cpu_set(cpu, cpu_present_map);
-       cpu_set(cpu, cpu_possible_map);
+       set_cpu_online(cpu, true);
+       set_cpu_present(cpu, true);
+       set_cpu_possible(cpu, true);
  }
  
  void __init __weak smp_setup_processor_id(void)
@@@ -541,15 -526,6 +527,6 @@@ void __init __weak thread_info_cache_in
  {
  }
  
- void __init __weak arch_early_irq_init(void)
- {
- }
- void __init __weak early_irq_init(void)
- {
-       arch_early_irq_init();
- }
  asmlinkage void __init start_kernel(void)
  {
        char * command_line;
         * Need to run as early as possible, to initialize the
         * lockdep hash:
         */
-       unwind_init();
        lockdep_init();
        debug_objects_early_init();
 +
 +      /*
 +       * Set up the initial canary ASAP:
 +       */
 +      boot_init_stack_canary();
 +
        cgroup_init_early();
  
        local_irq_disable();
        setup_arch(&command_line);
        mm_init_owner(&init_mm, &init_task);
        setup_command_line(command_line);
-       unwind_setup();
        setup_per_cpu_areas();
        setup_nr_cpu_ids();
        smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
        sched_clock_init();
        profile_init();
        if (!irqs_disabled())
-               printk("start_kernel(): bug: interrupts were enabled early\n");
+               printk(KERN_CRIT "start_kernel(): bug: interrupts were "
+                                "enabled early\n");
        early_boot_irqs_on();
        local_irq_enable();
  
        rest_init();
  }
  
- static int initcall_debug;
+ int initcall_debug;
  core_param(initcall_debug, initcall_debug, bool, 0644);
  
  int do_one_initcall(initcall_t fn)
@@@ -816,8 -785,10 +792,10 @@@ static void run_init_process(char *init
  /* This is a non __init function. Force it to be noinline otherwise gcc
   * makes it inline to init() and it becomes part of init.text section
   */
- static int noinline init_post(void)
+ static noinline int init_post(void)
  {
+       /* need to finish all async __init code before freeing the memory */
+       async_synchronize_full();
        free_initmem();
        unlock_kernel();
        mark_rodata_ro();
diff --combined kernel/exit.c
@@@ -642,35 -642,31 +642,31 @@@ retry
        /*
         * We found no owner yet mm_users > 1: this implies that we are
         * most likely racing with swapoff (try_to_unuse()) or /proc or
-        * ptrace or page migration (get_task_mm()).  Mark owner as NULL,
-        * so that subsystems can understand the callback and take action.
+        * ptrace or page migration (get_task_mm()).  Mark owner as NULL.
         */
-       down_write(&mm->mmap_sem);
-       cgroup_mm_owner_callbacks(mm->owner, NULL);
        mm->owner = NULL;
-       up_write(&mm->mmap_sem);
        return;
  
  assign_new_owner:
        BUG_ON(c == p);
        get_task_struct(c);
-       read_unlock(&tasklist_lock);
-       down_write(&mm->mmap_sem);
        /*
         * The task_lock protects c->mm from changing.
         * We always want mm->owner->mm == mm
         */
        task_lock(c);
+       /*
+        * Delay read_unlock() till we have the task_lock()
+        * to ensure that c does not slip away underneath us
+        */
+       read_unlock(&tasklist_lock);
        if (c->mm != mm) {
                task_unlock(c);
-               up_write(&mm->mmap_sem);
                put_task_struct(c);
                goto retry;
        }
-       cgroup_mm_owner_callbacks(mm->owner, c);
        mm->owner = c;
        task_unlock(c);
-       up_write(&mm->mmap_sem);
        put_task_struct(c);
  }
  #endif /* CONFIG_MM_OWNER */
@@@ -981,9 -977,12 +977,9 @@@ static void check_stack_usage(void
  {
        static DEFINE_SPINLOCK(low_water_lock);
        static int lowest_to_date = THREAD_SIZE;
 -      unsigned long *n = end_of_stack(current);
        unsigned long free;
  
 -      while (*n == 0)
 -              n++;
 -      free = (unsigned long)n - (unsigned long)end_of_stack(current);
 +      free = stack_not_used(current);
  
        if (free >= lowest_to_date)
                return;
@@@ -1052,10 -1051,7 +1048,7 @@@ NORET_TYPE void do_exit(long code
                                preempt_count());
  
        acct_update_integrals(tsk);
-       if (tsk->mm) {
-               update_hiwater_rss(tsk->mm);
-               update_hiwater_vm(tsk->mm);
-       }
        group_dead = atomic_dec_and_test(&tsk->signal->live);
        if (group_dead) {
                hrtimer_cancel(&tsk->signal->real_timer);
diff --combined kernel/fork.c
@@@ -61,7 -61,6 +61,7 @@@
  #include <linux/proc_fs.h>
  #include <linux/blkdev.h>
  #include <trace/sched.h>
 +#include <linux/magic.h>
  
  #include <asm/pgtable.h>
  #include <asm/pgalloc.h>
@@@ -213,8 -212,6 +213,8 @@@ static struct task_struct *dup_task_str
  {
        struct task_struct *tsk;
        struct thread_info *ti;
 +      unsigned long *stackend;
 +
        int err;
  
        prepare_to_copy(orig);
                goto out;
  
        setup_thread_stack(tsk, orig);
 +      stackend = end_of_stack(tsk);
 +      *stackend = STACK_END_MAGIC;    /* for overflow detection */
  
  #ifdef CONFIG_CC_STACKPROTECTOR
        tsk->stack_canary = get_random_int();
@@@ -405,6 -400,18 +405,18 @@@ __cacheline_aligned_in_smp DEFINE_SPINL
  #define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
  #define free_mm(mm)   (kmem_cache_free(mm_cachep, (mm)))
  
+ static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;
+ static int __init coredump_filter_setup(char *s)
+ {
+       default_dump_filter =
+               (simple_strtoul(s, NULL, 0) << MMF_DUMP_FILTER_SHIFT) &
+               MMF_DUMP_FILTER_MASK;
+       return 1;
+ }
+ __setup("coredump_filter=", coredump_filter_setup);
  #include <linux/init_task.h>
  
  static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
        atomic_set(&mm->mm_count, 1);
        init_rwsem(&mm->mmap_sem);
        INIT_LIST_HEAD(&mm->mmlist);
-       mm->flags = (current->mm) ? current->mm->flags
-                                 : MMF_DUMP_FILTER_DEFAULT;
+       mm->flags = (current->mm) ? current->mm->flags : default_dump_filter;
        mm->core_state = NULL;
        mm->nr_ptes = 0;
        set_mm_counter(mm, file_rss, 0);
@@@ -763,7 -769,7 +774,7 @@@ static int copy_sighand(unsigned long c
  {
        struct sighand_struct *sig;
  
-       if (clone_flags & (CLONE_SIGHAND | CLONE_THREAD)) {
+       if (clone_flags & CLONE_SIGHAND) {
                atomic_inc(&current->sighand->count);
                return 0;
        }
@@@ -1120,12 -1126,12 +1131,12 @@@ static struct task_struct *copy_process
  
        if (pid != &init_struct_pid) {
                retval = -ENOMEM;
-               pid = alloc_pid(task_active_pid_ns(p));
+               pid = alloc_pid(p->nsproxy->pid_ns);
                if (!pid)
                        goto bad_fork_cleanup_io;
  
                if (clone_flags & CLONE_NEWPID) {
-                       retval = pid_ns_prepare_proc(task_active_pid_ns(p));
+                       retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
                        if (retval < 0)
                                goto bad_fork_free_pid;
                }
@@@ -1475,12 -1481,10 +1486,10 @@@ void __init proc_caches_init(void
        fs_cachep = kmem_cache_create("fs_cache",
                        sizeof(struct fs_struct), 0,
                        SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
-       vm_area_cachep = kmem_cache_create("vm_area_struct",
-                       sizeof(struct vm_area_struct), 0,
-                       SLAB_PANIC, NULL);
        mm_cachep = kmem_cache_create("mm_struct",
                        sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
                        SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+       mmap_init();
  }
  
  /*
diff --combined kernel/panic.c
@@@ -74,9 -74,6 +74,9 @@@ NORET_TYPE void panic(const char * fmt
        vsnprintf(buf, sizeof(buf), fmt, args);
        va_end(args);
        printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf);
 +#ifdef CONFIG_DEBUG_BUGVERBOSE
 +      dump_stack();
 +#endif
        bust_spinlocks(0);
  
        /*
@@@ -302,6 -299,8 +302,8 @@@ static int init_oops_id(void
  {
        if (!oops_id)
                get_random_bytes(&oops_id, sizeof(oops_id));
+       else
+               oops_id++;
  
        return 0;
  }
@@@ -356,22 -355,15 +358,22 @@@ EXPORT_SYMBOL(warn_slowpath)
  #endif
  
  #ifdef CONFIG_CC_STACKPROTECTOR
 +
 +#ifndef GCC_HAS_SP
 +#warning You have selected the CONFIG_CC_STACKPROTECTOR option, but the gcc used does not support this.
 +#endif
 +
  /*
   * Called when gcc's -fstack-protector feature is used, and
   * gcc detects corruption of the on-stack canary value
   */
  void __stack_chk_fail(void)
  {
 -      panic("stack-protector: Kernel stack is corrupted");
 +      panic("stack-protector: Kernel stack is corrupted in: %p\n",
 +              __builtin_return_address(0));
  }
  EXPORT_SYMBOL(__stack_chk_fail);
 +
  #endif
  
  core_param(panic, panic_timeout, int, 0644);
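
For illustration only (the function below is hypothetical and deliberately buggy, not part of the patch): with CONFIG_CC_STACKPROTECTOR enabled, an overflow like this clobbers the canary placed between the buffer and the return address, and the compiler-emitted epilogue check lands in the __stack_chk_fail() handler above, which now panics with the corrupted caller's address.

    /* Assumes <linux/string.h> and <linux/compiler.h>. */
    static noinline void stackprotector_selftest(void)
    {
            char buf[16];

            /* Deliberate out-of-bounds write: overruns buf and the canary. */
            memset(buf, 0x41, 64);
    }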
diff --combined kernel/sched.c
@@@ -125,6 -125,9 +125,9 @@@ DEFINE_TRACE(sched_switch)
  DEFINE_TRACE(sched_migrate_task);
  
  #ifdef CONFIG_SMP
+ static void double_rq_lock(struct rq *rq1, struct rq *rq2);
  /*
   * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
   * Since cpu_power is a 'constant', we can use a reciprocal divide.
@@@ -498,18 -501,26 +501,26 @@@ struct rt_rq 
   */
  struct root_domain {
        atomic_t refcount;
-       cpumask_t span;
-       cpumask_t online;
+       cpumask_var_t span;
+       cpumask_var_t online;
  
        /*
         * The "RT overload" flag: it gets set if a CPU has more than
         * one runnable RT task.
         */
-       cpumask_t rto_mask;
+       cpumask_var_t rto_mask;
        atomic_t rto_count;
  #ifdef CONFIG_SMP
        struct cpupri cpupri;
  #endif
+ #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+       /*
+        * Preferred wake up cpu nominated by sched_mc balance that will be
+        * used when most cpus are idle in the system indicating overall very
+        * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2)
+        */
+       unsigned int sched_mc_preferred_wakeup_cpu;
+ #endif
  };
  
  /*
@@@ -1514,7 -1525,7 +1525,7 @@@ static int tg_shares_up(struct task_gro
        struct sched_domain *sd = data;
        int i;
  
-       for_each_cpu_mask(i, sd->span) {
+       for_each_cpu(i, sched_domain_span(sd)) {
                /*
                 * If there are currently no tasks on the cpu pretend there
                 * is one of average load so that when a new task gets to
        if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
                shares = tg->shares;
  
-       for_each_cpu_mask(i, sd->span)
+       for_each_cpu(i, sched_domain_span(sd))
                update_group_shares_cpu(tg, i, shares, rq_weight);
  
        return 0;
@@@ -2101,15 -2112,17 +2112,17 @@@ find_idlest_group(struct sched_domain *
                int i;
  
                /* Skip over this group if it has no CPUs allowed */
-               if (!cpus_intersects(group->cpumask, p->cpus_allowed))
+               if (!cpumask_intersects(sched_group_cpus(group),
+                                       &p->cpus_allowed))
                        continue;
  
-               local_group = cpu_isset(this_cpu, group->cpumask);
+               local_group = cpumask_test_cpu(this_cpu,
+                                              sched_group_cpus(group));
  
                /* Tally up the load of all CPUs in the group */
                avg_load = 0;
  
-               for_each_cpu_mask_nr(i, group->cpumask) {
+               for_each_cpu(i, sched_group_cpus(group)) {
                        /* Bias balancing toward cpus of our domain */
                        if (local_group)
                                load = source_load(i, load_idx);
   * find_idlest_cpu - find the idlest cpu among the cpus in group.
   */
  static int
- find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu,
-               cpumask_t *tmp)
+ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
  {
        unsigned long load, min_load = ULONG_MAX;
        int idlest = -1;
        int i;
  
        /* Traverse only the allowed CPUs */
-       cpus_and(*tmp, group->cpumask, p->cpus_allowed);
-       for_each_cpu_mask_nr(i, *tmp) {
+       for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
                load = weighted_cpuload(i);
  
                if (load < min_load || (load == min_load && i == this_cpu)) {
@@@ -2193,7 -2203,6 +2203,6 @@@ static int sched_balance_self(int cpu, 
                update_shares(sd);
  
        while (sd) {
-               cpumask_t span, tmpmask;
                struct sched_group *group;
                int new_cpu, weight;
  
                        continue;
                }
  
-               span = sd->span;
                group = find_idlest_group(sd, t, cpu);
                if (!group) {
                        sd = sd->child;
                        continue;
                }
  
-               new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask);
+               new_cpu = find_idlest_cpu(group, t, cpu);
                if (new_cpu == -1 || new_cpu == cpu) {
                        /* Now try balancing at a lower domain level of cpu */
                        sd = sd->child;
  
                /* Now try balancing at a lower domain level of new_cpu */
                cpu = new_cpu;
+               weight = cpumask_weight(sched_domain_span(sd));
                sd = NULL;
-               weight = cpus_weight(span);
                for_each_domain(cpu, tmp) {
-                       if (weight <= cpus_weight(tmp->span))
+                       if (weight <= cpumask_weight(sched_domain_span(tmp)))
                                break;
                        if (tmp->flags & flag)
                                sd = tmp;
@@@ -2266,7 -2274,7 +2274,7 @@@ static int try_to_wake_up(struct task_s
                cpu = task_cpu(p);
  
                for_each_domain(this_cpu, sd) {
-                       if (cpu_isset(cpu, sd->span)) {
+                       if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
                                update_shares(sd);
                                break;
                        }
        else {
                struct sched_domain *sd;
                for_each_domain(this_cpu, sd) {
-                       if (cpu_isset(cpu, sd->span)) {
+                       if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
                                schedstat_inc(sd, ttwu_wake_remote);
                                break;
                        }
@@@ -2846,7 -2854,7 +2854,7 @@@ static void sched_migrate_task(struct t
        struct rq *rq;
  
        rq = task_rq_lock(p, &flags);
-       if (!cpu_isset(dest_cpu, p->cpus_allowed)
+       if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
            || unlikely(!cpu_active(dest_cpu)))
                goto out;
  
@@@ -2911,7 -2919,7 +2919,7 @@@ int can_migrate_task(struct task_struc
         * 2) cannot be migrated to this CPU due to cpus_allowed, or
         * 3) are cache-hot on their current CPU.
         */
-       if (!cpu_isset(this_cpu, p->cpus_allowed)) {
+       if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
                schedstat_inc(p, se.nr_failed_migrations_affine);
                return 0;
        }
@@@ -3086,7 -3094,7 +3094,7 @@@ static int move_one_task(struct rq *thi
  static struct sched_group *
  find_busiest_group(struct sched_domain *sd, int this_cpu,
                   unsigned long *imbalance, enum cpu_idle_type idle,
-                  int *sd_idle, const cpumask_t *cpus, int *balance)
+                  int *sd_idle, const struct cpumask *cpus, int *balance)
  {
        struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
        unsigned long max_load, avg_load, total_load, this_load, total_pwr;
                unsigned long sum_avg_load_per_task;
                unsigned long avg_load_per_task;
  
-               local_group = cpu_isset(this_cpu, group->cpumask);
+               local_group = cpumask_test_cpu(this_cpu,
+                                              sched_group_cpus(group));
  
                if (local_group)
-                       balance_cpu = first_cpu(group->cpumask);
+                       balance_cpu = cpumask_first(sched_group_cpus(group));
  
                /* Tally up the load of all CPUs in the group */
                sum_weighted_load = sum_nr_running = avg_load = 0;
                max_cpu_load = 0;
                min_cpu_load = ~0UL;
  
-               for_each_cpu_mask_nr(i, group->cpumask) {
-                       struct rq *rq;
-                       if (!cpu_isset(i, *cpus))
-                               continue;
-                       rq = cpu_rq(i);
+               for_each_cpu_and(i, sched_group_cpus(group), cpus) {
+                       struct rq *rq = cpu_rq(i);
  
                        if (*sd_idle && rq->nr_running)
                                *sd_idle = 0;
                 */
                if ((sum_nr_running < min_nr_running) ||
                    (sum_nr_running == min_nr_running &&
-                    first_cpu(group->cpumask) <
-                    first_cpu(group_min->cpumask))) {
+                    cpumask_first(sched_group_cpus(group)) >
+                    cpumask_first(sched_group_cpus(group_min)))) {
                        group_min = group;
                        min_nr_running = sum_nr_running;
                        min_load_per_task = sum_weighted_load /
                if (sum_nr_running <= group_capacity - 1) {
                        if (sum_nr_running > leader_nr_running ||
                            (sum_nr_running == leader_nr_running &&
-                            first_cpu(group->cpumask) >
-                             first_cpu(group_leader->cpumask))) {
+                            cpumask_first(sched_group_cpus(group)) <
+                            cpumask_first(sched_group_cpus(group_leader)))) {
                                group_leader = group;
                                leader_nr_running = sum_nr_running;
                        }
@@@ -3394,6 -3398,10 +3398,10 @@@ out_balanced
  
        if (this == group_leader && group_leader != group_min) {
                *imbalance = min_load_per_task;
+               if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
+                       cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
+                               cpumask_first(sched_group_cpus(group_leader));
+               }
                return group_min;
        }
  #endif
   */
  static struct rq *
  find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
-                  unsigned long imbalance, const cpumask_t *cpus)
+                  unsigned long imbalance, const struct cpumask *cpus)
  {
        struct rq *busiest = NULL, *rq;
        unsigned long max_load = 0;
        int i;
  
-       for_each_cpu_mask_nr(i, group->cpumask) {
+       for_each_cpu(i, sched_group_cpus(group)) {
                unsigned long wl;
  
-               if (!cpu_isset(i, *cpus))
+               if (!cpumask_test_cpu(i, cpus))
                        continue;
  
                rq = cpu_rq(i);
   */
  static int load_balance(int this_cpu, struct rq *this_rq,
                        struct sched_domain *sd, enum cpu_idle_type idle,
-                       int *balance, cpumask_t *cpus)
+                       int *balance, struct cpumask *cpus)
  {
        int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
        struct sched_group *group;
        struct rq *busiest;
        unsigned long flags;
  
-       cpus_setall(*cpus);
+       cpumask_setall(cpus);
  
        /*
         * When power savings policy is enabled for the parent domain, idle
@@@ -3514,8 -3522,8 +3522,8 @@@ redo
  
                /* All tasks on this runqueue were pinned by CPU affinity */
                if (unlikely(all_pinned)) {
-                       cpu_clear(cpu_of(busiest), *cpus);
-                       if (!cpus_empty(*cpus))
+                       cpumask_clear_cpu(cpu_of(busiest), cpus);
+                       if (!cpumask_empty(cpus))
                                goto redo;
                        goto out_balanced;
                }
                        /* don't kick the migration_thread, if the curr
                         * task on busiest cpu can't be moved to this_cpu
                         */
-                       if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
+                       if (!cpumask_test_cpu(this_cpu,
+                                             &busiest->curr->cpus_allowed)) {
                                spin_unlock_irqrestore(&busiest->lock, flags);
                                all_pinned = 1;
                                goto out_one_pinned;
@@@ -3607,7 -3616,7 +3616,7 @@@ out
   */
  static int
  load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
-                       cpumask_t *cpus)
+                       struct cpumask *cpus)
  {
        struct sched_group *group;
        struct rq *busiest = NULL;
        int sd_idle = 0;
        int all_pinned = 0;
  
-       cpus_setall(*cpus);
+       cpumask_setall(cpus);
  
        /*
         * When power savings policy is enabled for the parent domain, idle
@@@ -3660,17 -3669,76 +3669,76 @@@ redo
                double_unlock_balance(this_rq, busiest);
  
                if (unlikely(all_pinned)) {
-                       cpu_clear(cpu_of(busiest), *cpus);
-                       if (!cpus_empty(*cpus))
+                       cpumask_clear_cpu(cpu_of(busiest), cpus);
+                       if (!cpumask_empty(cpus))
                                goto redo;
                }
        }
  
        if (!ld_moved) {
+               int active_balance = 0;
                schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
                if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
                    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
                        return -1;
+               if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
+                       return -1;
+               if (sd->nr_balance_failed++ < 2)
+                       return -1;
+               /*
+                * The only task running in a non-idle cpu can be moved to this
+                * cpu in an attempt to completely free up the other CPU
+                * package. The same method used to move tasks in load_balance()
+                * has been extended for load_balance_newidle() to speed up
+                * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2)
+                *
+                * The package power saving logic comes from
+                * find_busiest_group().  If there are no imbalance, then
+                * find_busiest_group().  If there is no imbalance, then
+                * f_b_g() will select a group from which a running task may be
+                * pulled to this cpu in order to make the other package idle.
+                * If there is no opportunity to make a package idle and if
+                * there is no imbalance, then f_b_g() will return NULL and no
+                * action will be taken in load_balance_newidle().
+                *
+                * Under normal task pull operation due to imbalance, there
+                * will be more than one task in the source run queue and
+                * move_tasks() will succeed.  ld_moved will be true and this
+                * active balance code will not be triggered.
+                */
+               /* Lock busiest in correct order while this_rq is held */
+               double_lock_balance(this_rq, busiest);
+               /*
+                * don't kick the migration_thread, if the curr
+                * task on busiest cpu can't be moved to this_cpu
+                */
+               if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
+                       double_unlock_balance(this_rq, busiest);
+                       all_pinned = 1;
+                       return ld_moved;
+               }
+               if (!busiest->active_balance) {
+                       busiest->active_balance = 1;
+                       busiest->push_cpu = this_cpu;
+                       active_balance = 1;
+               }
+               double_unlock_balance(this_rq, busiest);
+               /*
+                * Should not call ttwu while holding a rq->lock
+                */
+               spin_unlock(&this_rq->lock);
+               if (active_balance)
+                       wake_up_process(busiest->migration_thread);
+               spin_lock(&this_rq->lock);
        } else
                sd->nr_balance_failed = 0;
  
@@@ -3696,7 -3764,10 +3764,10 @@@ static void idle_balance(int this_cpu, 
        struct sched_domain *sd;
        int pulled_task = 0;
        unsigned long next_balance = jiffies + HZ;
-       cpumask_t tmpmask;
+       cpumask_var_t tmpmask;
+       if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC))
+               return;
  
        for_each_domain(this_cpu, sd) {
                unsigned long interval;
                if (sd->flags & SD_BALANCE_NEWIDLE)
                        /* If we've pulled tasks over stop searching: */
                        pulled_task = load_balance_newidle(this_cpu, this_rq,
-                                                          sd, &tmpmask);
+                                                          sd, tmpmask);
  
                interval = msecs_to_jiffies(sd->balance_interval);
                if (time_after(next_balance, sd->last_balance + interval))
                 */
                this_rq->next_balance = next_balance;
        }
+       free_cpumask_var(tmpmask);
  }
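idle_balance() above also shows the standard replacement for an on-stack cpumask_t: a cpumask_var_t that has to be allocated before use and freed afterwards. A minimal sketch of that pattern (hypothetical helper, assuming <linux/cpumask.h>; when CONFIG_CPUMASK_OFFSTACK is not set the alloc/free calls are no-ops and the mask really does live on the stack):

        static void cpumask_var_example(void)
        {
                cpumask_var_t mask;

                if (!alloc_cpumask_var(&mask, GFP_ATOMIC))
                        return;         /* the allocation can fail */

                cpumask_copy(mask, cpu_online_mask);
                /* ... use the cpumask_* helpers on 'mask' ... */

                free_cpumask_var(mask);
        }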
  
  /*
@@@ -3759,7 -3831,7 +3831,7 @@@ static void active_load_balance(struct 
        /* Search for an sd spanning us and the target CPU. */
        for_each_domain(target_cpu, sd) {
                if ((sd->flags & SD_LOAD_BALANCE) &&
-                   cpu_isset(busiest_cpu, sd->span))
+                   cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
                                break;
        }
  
  #ifdef CONFIG_NO_HZ
  static struct {
        atomic_t load_balancer;
-       cpumask_t cpu_mask;
+       cpumask_var_t cpu_mask;
  } nohz ____cacheline_aligned = {
        .load_balancer = ATOMIC_INIT(-1),
-       .cpu_mask = CPU_MASK_NONE,
  };
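With nohz.cpu_mask now a cpumask_var_t, the static CPU_MASK_NONE initializer is gone and the mask must be set up at boot before any of the nohz code runs. The allocation site is outside the hunks shown here; presumably something along these lines early in scheduler init:

        #ifdef CONFIG_NO_HZ
                alloc_bootmem_cpumask_var(&nohz.cpu_mask);  /* assumed placement, e.g. in sched_init() */
        #endif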
  
  /*
@@@ -3809,7 -3880,7 +3880,7 @@@ int select_nohz_load_balancer(int stop_
        int cpu = smp_processor_id();
  
        if (stop_tick) {
-               cpu_set(cpu, nohz.cpu_mask);
+               cpumask_set_cpu(cpu, nohz.cpu_mask);
                cpu_rq(cpu)->in_nohz_recently = 1;
  
                /*
                }
  
                /* time for ilb owner also to sleep */
-               if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
+               if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
                        if (atomic_read(&nohz.load_balancer) == cpu)
                                atomic_set(&nohz.load_balancer, -1);
                        return 0;
                } else if (atomic_read(&nohz.load_balancer) == cpu)
                        return 1;
        } else {
-               if (!cpu_isset(cpu, nohz.cpu_mask))
+               if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
                        return 0;
  
-               cpu_clear(cpu, nohz.cpu_mask);
+               cpumask_clear_cpu(cpu, nohz.cpu_mask);
  
                if (atomic_read(&nohz.load_balancer) == cpu)
                        if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
@@@ -3867,7 -3938,11 +3938,11 @@@ static void rebalance_domains(int cpu, 
        unsigned long next_balance = jiffies + 60*HZ;
        int update_next_balance = 0;
        int need_serialize;
-       cpumask_t tmp;
+       cpumask_var_t tmp;
+       /* Fails alloc?  Rebalancing probably not a priority right now. */
+       if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
+               return;
  
        for_each_domain(cpu, sd) {
                if (!(sd->flags & SD_LOAD_BALANCE))
                }
  
                if (time_after_eq(jiffies, sd->last_balance + interval)) {
-                       if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) {
+                       if (load_balance(cpu, rq, sd, idle, &balance, tmp)) {
                                /*
                                 * We've pulled tasks over so either we're no
                                 * longer idle, or one of our SMT siblings is
@@@ -3926,6 -4001,8 +4001,8 @@@ out
         */
        if (likely(update_next_balance))
                rq->next_balance = next_balance;
+       free_cpumask_var(tmp);
  }
  
  /*
@@@ -3950,12 -4027,13 +4027,13 @@@ static void run_rebalance_domains(struc
         */
        if (this_rq->idle_at_tick &&
            atomic_read(&nohz.load_balancer) == this_cpu) {
-               cpumask_t cpus = nohz.cpu_mask;
                struct rq *rq;
                int balance_cpu;
  
-               cpu_clear(this_cpu, cpus);
-               for_each_cpu_mask_nr(balance_cpu, cpus) {
+               for_each_cpu(balance_cpu, nohz.cpu_mask) {
+                       if (balance_cpu == this_cpu)
+                               continue;
                        /*
                         * If this cpu gets work to do, stop the load balancing
                         * work being done for other cpus. Next load
@@@ -3993,7 -4071,7 +4071,7 @@@ static inline void trigger_load_balance
                rq->in_nohz_recently = 0;
  
                if (atomic_read(&nohz.load_balancer) == cpu) {
-                       cpu_clear(cpu, nohz.cpu_mask);
+                       cpumask_clear_cpu(cpu, nohz.cpu_mask);
                        atomic_set(&nohz.load_balancer, -1);
                }
  
                         * TBD: Traverse the sched domains and nominate
                         * the nearest cpu in the nohz.cpu_mask.
                         */
-                       int ilb = first_cpu(nohz.cpu_mask);
+                       int ilb = cpumask_first(nohz.cpu_mask);
  
                        if (ilb < nr_cpu_ids)
                                resched_cpu(ilb);
         * cpus with ticks stopped, is it time for that to stop?
         */
        if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
-           cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
+           cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
                resched_cpu(cpu);
                return;
        }
         * someone else, then there is no need to raise the SCHED_SOFTIRQ
         */
        if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
-           cpu_isset(cpu, nohz.cpu_mask))
+           cpumask_test_cpu(cpu, nohz.cpu_mask))
                return;
  #endif
        if (time_after_eq(jiffies, rq->next_balance))
@@@ -4080,13 -4158,17 +4158,17 @@@ unsigned long long task_delta_exec(stru
   * Account user cpu time to a process.
   * @p: the process that the cpu time gets accounted to
   * @cputime: the cpu time spent in user space since the last update
+  * @cputime_scaled: cputime scaled by cpu frequency
   */
- void account_user_time(struct task_struct *p, cputime_t cputime)
+ void account_user_time(struct task_struct *p, cputime_t cputime,
+                      cputime_t cputime_scaled)
  {
        struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
        cputime64_t tmp;
  
+       /* Add user time to process. */
        p->utime = cputime_add(p->utime, cputime);
+       p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
        account_group_user_time(p, cputime);
  
        /* Add user time to cpustat. */
   * Account guest cpu time to a process.
   * @p: the process that the cpu time gets accounted to
   * @cputime: the cpu time spent in virtual machine since the last update
+  * @cputime_scaled: cputime scaled by cpu frequency
   */
- static void account_guest_time(struct task_struct *p, cputime_t cputime)
+ static void account_guest_time(struct task_struct *p, cputime_t cputime,
+                              cputime_t cputime_scaled)
  {
        cputime64_t tmp;
        struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
  
        tmp = cputime_to_cputime64(cputime);
  
+       /* Add guest time to process. */
        p->utime = cputime_add(p->utime, cputime);
+       p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
        account_group_user_time(p, cputime);
        p->gtime = cputime_add(p->gtime, cputime);
  
+       /* Add guest time to cpustat. */
        cpustat->user = cputime64_add(cpustat->user, tmp);
        cpustat->guest = cputime64_add(cpustat->guest, tmp);
  }
  
  /*
-  * Account scaled user cpu time to a process.
-  * @p: the process that the cpu time gets accounted to
-  * @cputime: the cpu time spent in user space since the last update
-  */
- void account_user_time_scaled(struct task_struct *p, cputime_t cputime)
- {
-       p->utimescaled = cputime_add(p->utimescaled, cputime);
- }
- /*
   * Account system cpu time to a process.
   * @p: the process that the cpu time gets accounted to
   * @hardirq_offset: the offset to subtract from hardirq_count()
   * @cputime: the cpu time spent in kernel space since the last update
+  * @cputime_scaled: cputime scaled by cpu frequency
   */
  void account_system_time(struct task_struct *p, int hardirq_offset,
-                        cputime_t cputime)
+                        cputime_t cputime, cputime_t cputime_scaled)
  {
        struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-       struct rq *rq = this_rq();
        cputime64_t tmp;
  
        if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
-               account_guest_time(p, cputime);
+               account_guest_time(p, cputime, cputime_scaled);
                return;
        }
  
+       /* Add system time to process. */
        p->stime = cputime_add(p->stime, cputime);
+       p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
        account_group_system_time(p, cputime);
  
        /* Add system time to cpustat. */
                cpustat->irq = cputime64_add(cpustat->irq, tmp);
        else if (softirq_count())
                cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
-       else if (p != rq->idle)
-               cpustat->system = cputime64_add(cpustat->system, tmp);
-       else if (atomic_read(&rq->nr_iowait) > 0)
-               cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
        else
-               cpustat->idle = cputime64_add(cpustat->idle, tmp);
+               cpustat->system = cputime64_add(cpustat->system, tmp);
        /* Account for system time used */
        acct_update_integrals(p);
  }
  
  /*
-  * Account scaled system cpu time to a process.
-  * @p: the process that the cpu time gets accounted to
-  * @hardirq_offset: the offset to subtract from hardirq_count()
-  * @cputime: the cpu time spent in kernel space since the last update
+  * Account for involuntary wait time.
+  * @cputime: the cpu time spent in involuntary wait
   */
- void account_system_time_scaled(struct task_struct *p, cputime_t cputime)
+ void account_steal_time(cputime_t cputime)
  {
-       p->stimescaled = cputime_add(p->stimescaled, cputime);
+       struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+       cputime64_t cputime64 = cputime_to_cputime64(cputime);
+       cpustat->steal = cputime64_add(cpustat->steal, cputime64);
  }
  
  /*
-  * Account for involuntary wait time.
-  * @p: the process from which the cpu time has been stolen
-  * @steal: the cpu time spent in involuntary wait
+  * Account for idle time.
+  * @cputime: the cpu time spent in idle wait
   */
- void account_steal_time(struct task_struct *p, cputime_t steal)
+ void account_idle_time(cputime_t cputime)
  {
        struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-       cputime64_t tmp = cputime_to_cputime64(steal);
+       cputime64_t cputime64 = cputime_to_cputime64(cputime);
        struct rq *rq = this_rq();
  
-       if (p == rq->idle) {
-               p->stime = cputime_add(p->stime, steal);
-               if (atomic_read(&rq->nr_iowait) > 0)
-                       cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
-               else
-                       cpustat->idle = cputime64_add(cpustat->idle, tmp);
-       } else
-               cpustat->steal = cputime64_add(cpustat->steal, tmp);
+       if (atomic_read(&rq->nr_iowait) > 0)
+               cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
+       else
+               cpustat->idle = cputime64_add(cpustat->idle, cputime64);
+ }
+ #ifndef CONFIG_VIRT_CPU_ACCOUNTING
+ /*
+  * Account a single tick of cpu time.
+  * @p: the process that the cpu time gets accounted to
+  * @user_tick: indicates if the tick is a user or a system tick
+  */
+ void account_process_tick(struct task_struct *p, int user_tick)
+ {
+       cputime_t one_jiffy = jiffies_to_cputime(1);
+       cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy);
+       struct rq *rq = this_rq();
+       if (user_tick)
+               account_user_time(p, one_jiffy, one_jiffy_scaled);
+       else if (p != rq->idle)
+               account_system_time(p, HARDIRQ_OFFSET, one_jiffy,
+                                   one_jiffy_scaled);
+       else
+               account_idle_time(one_jiffy);
  }
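account_process_tick() now decides in one place whether a tick is charged as user, system or idle time. A simplified sketch of the tick path that ends up here (the usual caller is update_process_times() in kernel/timer.c, which is outside this diff; unrelated work done there is elided):

        void update_process_times(int user_tick)
        {
                struct task_struct *p = current;

                /* hand one jiffy of cputime to the scheduler's accounting */
                account_process_tick(p, user_tick);

                /* ... run_local_timers(), scheduler_tick(), etc. ... */
        }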
  
  /*
+  * Account multiple ticks of steal time.
+  * @ticks: number of stolen ticks
+  */
+ void account_steal_ticks(unsigned long ticks)
+ {
+       account_steal_time(jiffies_to_cputime(ticks));
+ }
+ /*
+  * Account multiple ticks of idle time.
+  * @ticks: number of idle ticks
+  */
+ void account_idle_ticks(unsigned long ticks)
+ {
+       account_idle_time(jiffies_to_cputime(ticks));
+ }
+ #endif
+ /*
   * Use precise platform statistics if available:
   */
  #ifdef CONFIG_VIRT_CPU_ACCOUNTING
@@@ -5401,10 -5516,9 +5516,9 @@@ out_unlock
        return retval;
  }
  
- long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
+ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
  {
-       cpumask_t cpus_allowed;
-       cpumask_t new_mask = *in_mask;
+       cpumask_var_t cpus_allowed, new_mask;
        struct task_struct *p;
        int retval;
  
        get_task_struct(p);
        read_unlock(&tasklist_lock);
  
+       if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
+               retval = -ENOMEM;
+               goto out_put_task;
+       }
+       if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
+               retval = -ENOMEM;
+               goto out_free_cpus_allowed;
+       }
        retval = -EPERM;
        if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
                goto out_unlock;
        if (retval)
                goto out_unlock;
  
-       cpuset_cpus_allowed(p, &cpus_allowed);
-       cpus_and(new_mask, new_mask, cpus_allowed);
+       cpuset_cpus_allowed(p, cpus_allowed);
+       cpumask_and(new_mask, in_mask, cpus_allowed);
   again:
-       retval = set_cpus_allowed_ptr(p, &new_mask);
+       retval = set_cpus_allowed_ptr(p, new_mask);
  
        if (!retval) {
-               cpuset_cpus_allowed(p, &cpus_allowed);
-               if (!cpus_subset(new_mask, cpus_allowed)) {
+               cpuset_cpus_allowed(p, cpus_allowed);
+               if (!cpumask_subset(new_mask, cpus_allowed)) {
                        /*
                         * We must have raced with a concurrent cpuset
                         * update. Just reset the cpus_allowed to the
                         * cpuset's cpus_allowed
                         */
-                       new_mask = cpus_allowed;
+                       cpumask_copy(new_mask, cpus_allowed);
                        goto again;
                }
        }
  out_unlock:
+       free_cpumask_var(new_mask);
+ out_free_cpus_allowed:
+       free_cpumask_var(cpus_allowed);
+ out_put_task:
        put_task_struct(p);
        put_online_cpus();
        return retval;
  }
  
  static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
-                            cpumask_t *new_mask)
+                            struct cpumask *new_mask)
  {
-       if (len < sizeof(cpumask_t)) {
-               memset(new_mask, 0, sizeof(cpumask_t));
-       } else if (len > sizeof(cpumask_t)) {
-               len = sizeof(cpumask_t);
-       }
+       if (len < cpumask_size())
+               cpumask_clear(new_mask);
+       else if (len > cpumask_size())
+               len = cpumask_size();
        return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
  }
  
  asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
                                      unsigned long __user *user_mask_ptr)
  {
-       cpumask_t new_mask;
+       cpumask_var_t new_mask;
        int retval;
  
-       retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
-       if (retval)
-               return retval;
+       if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
+               return -ENOMEM;
  
-       return sched_setaffinity(pid, &new_mask);
+       retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
+       if (retval == 0)
+               retval = sched_setaffinity(pid, new_mask);
+       free_cpumask_var(new_mask);
+       return retval;
  }
  
- long sched_getaffinity(pid_t pid, cpumask_t *mask)
+ long sched_getaffinity(pid_t pid, struct cpumask *mask)
  {
        struct task_struct *p;
        int retval;
        if (retval)
                goto out_unlock;
  
-       cpus_and(*mask, p->cpus_allowed, cpu_online_map);
+       cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
  
  out_unlock:
        read_unlock(&tasklist_lock);
@@@ -5523,19 -5652,24 +5652,24 @@@ asmlinkage long sys_sched_getaffinity(p
                                      unsigned long __user *user_mask_ptr)
  {
        int ret;
-       cpumask_t mask;
+       cpumask_var_t mask;
  
-       if (len < sizeof(cpumask_t))
+       if (len < cpumask_size())
                return -EINVAL;
  
-       ret = sched_getaffinity(pid, &mask);
-       if (ret < 0)
-               return ret;
+       if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+               return -ENOMEM;
  
-       if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
-               return -EFAULT;
+       ret = sched_getaffinity(pid, mask);
+       if (ret == 0) {
+               if (copy_to_user(user_mask_ptr, mask, cpumask_size()))
+                       ret = -EFAULT;
+               else
+                       ret = cpumask_size();
+       }
+       free_cpumask_var(mask);
  
-       return sizeof(cpumask_t);
+       return ret;
  }
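From user space, the visible effect of the conversion is that the raw sched_getaffinity syscall now copies out cpumask_size() bytes and returns that size on success; the glibc wrapper still normalizes this to 0. A small illustrative user-space program (not part of this patch):

        #define _GNU_SOURCE
        #include <sched.h>
        #include <stdio.h>

        int main(void)
        {
                cpu_set_t set;

                if (sched_getaffinity(0, sizeof(set), &set) == 0)
                        printf("cpu 0 allowed: %s\n",
                               CPU_ISSET(0, &set) ? "yes" : "no");
                return 0;
        }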
  
  /**
@@@ -5805,7 -5939,12 +5939,7 @@@ void sched_show_task(struct task_struc
                printk(KERN_CONT " %016lx ", thread_saved_pc(p));
  #endif
  #ifdef CONFIG_DEBUG_STACK_USAGE
 -      {
 -              unsigned long *n = end_of_stack(p);
 -              while (!*n)
 -                      n++;
 -              free = (unsigned long)n - (unsigned long)end_of_stack(p);
 -      }
 +      free = stack_not_used(p);
  #endif
        printk(KERN_CONT "%5lu %5d %6d\n", free,
                task_pid_nr(p), task_pid_nr(p->real_parent));
@@@ -5872,7 -6011,7 +6006,7 @@@ void __cpuinit init_idle(struct task_st
        idle->se.exec_start = sched_clock();
  
        idle->prio = idle->normal_prio = MAX_PRIO;
-       idle->cpus_allowed = cpumask_of_cpu(cpu);
+       cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
        __set_task_cpu(idle, cpu);
  
        rq->curr = rq->idle = idle;
   * indicates which cpus entered this state. This is used
   * in the rcu update to wait only for active cpus. For systems
   * which do not switch off the HZ timer, nohz_cpu_mask should
-  * always be CPU_MASK_NONE.
+  * always be CPU_BITS_NONE.
   */
- cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
+ cpumask_var_t nohz_cpu_mask;
  
  /*
   * Increase the granularity value when there are more CPUs,
@@@ -5956,7 -6095,7 +6090,7 @@@ static inline void sched_init_granulari
   * task must not exit() & deallocate itself prematurely. The
   * call is not atomic; no spinlocks may be held.
   */
- int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
+ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
  {
        struct migration_req req;
        unsigned long flags;
        int ret = 0;
  
        rq = task_rq_lock(p, &flags);
-       if (!cpus_intersects(*new_mask, cpu_online_map)) {
+       if (!cpumask_intersects(new_mask, cpu_online_mask)) {
                ret = -EINVAL;
                goto out;
        }
  
        if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
-                    !cpus_equal(p->cpus_allowed, *new_mask))) {
+                    !cpumask_equal(&p->cpus_allowed, new_mask))) {
                ret = -EINVAL;
                goto out;
        }
        if (p->sched_class->set_cpus_allowed)
                p->sched_class->set_cpus_allowed(p, new_mask);
        else {
-               p->cpus_allowed = *new_mask;
-               p->rt.nr_cpus_allowed = cpus_weight(*new_mask);
+               cpumask_copy(&p->cpus_allowed, new_mask);
+               p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
        }
  
        /* Can the task run on the task's current CPU? If so, we're done */
-       if (cpu_isset(task_cpu(p), *new_mask))
+       if (cpumask_test_cpu(task_cpu(p), new_mask))
                goto out;
  
-       if (migrate_task(p, any_online_cpu(*new_mask), &req)) {
+       if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
                /* Need help from migration thread: drop lock and wait. */
                task_rq_unlock(rq, &flags);
                wake_up_process(rq->migration_thread);
@@@ -6028,7 -6167,7 +6162,7 @@@ static int __migrate_task(struct task_s
        if (task_cpu(p) != src_cpu)
                goto done;
        /* Affinity changed (again). */
-       if (!cpu_isset(dest_cpu, p->cpus_allowed))
+       if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
                goto fail;
  
        on_rq = p->se.on_rq;
@@@ -6125,50 -6264,41 +6259,41 @@@ static int __migrate_task_irq(struct ta
   */
  static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
  {
-       unsigned long flags;
-       cpumask_t mask;
-       struct rq *rq;
        int dest_cpu;
+       const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu));
  
-       do {
-               /* On same node? */
-               mask = node_to_cpumask(cpu_to_node(dead_cpu));
-               cpus_and(mask, mask, p->cpus_allowed);
-               dest_cpu = any_online_cpu(mask);
+ again:
+       /* Look for allowed, online CPU in same node. */
+       for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask)
+               if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
+                       goto move;
  
-               /* On any allowed CPU? */
-               if (dest_cpu >= nr_cpu_ids)
-                       dest_cpu = any_online_cpu(p->cpus_allowed);
+       /* Any allowed, online CPU? */
+       dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask);
+       if (dest_cpu < nr_cpu_ids)
+               goto move;
  
-               /* No more Mr. Nice Guy. */
-               if (dest_cpu >= nr_cpu_ids) {
-                       cpumask_t cpus_allowed;
+       /* No more Mr. Nice Guy. */
+       if (dest_cpu >= nr_cpu_ids) {
+               cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
+               dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed);
  
-                       cpuset_cpus_allowed_locked(p, &cpus_allowed);
-                       /*
-                        * Try to stay on the same cpuset, where the
-                        * current cpuset may be a subset of all cpus.
-                        * The cpuset_cpus_allowed_locked() variant of
-                        * cpuset_cpus_allowed() will not block. It must be
-                        * called within calls to cpuset_lock/cpuset_unlock.
-                        */
-                       rq = task_rq_lock(p, &flags);
-                       p->cpus_allowed = cpus_allowed;
-                       dest_cpu = any_online_cpu(p->cpus_allowed);
-                       task_rq_unlock(rq, &flags);
-                       /*
-                        * Don't tell them about moving exiting tasks or
-                        * kernel threads (both mm NULL), since they never
-                        * leave kernel.
-                        */
-                       if (p->mm && printk_ratelimit()) {
-                               printk(KERN_INFO "process %d (%s) no "
-                                      "longer affine to cpu%d\n",
-                                       task_pid_nr(p), p->comm, dead_cpu);
-                       }
+               /*
+                * Don't tell them about moving exiting tasks or
+                * kernel threads (both mm NULL), since they never
+                * leave kernel.
+                */
+               if (p->mm && printk_ratelimit()) {
+                       printk(KERN_INFO "process %d (%s) no "
+                              "longer affine to cpu%d\n",
+                              task_pid_nr(p), p->comm, dead_cpu);
                }
-       } while (!__migrate_task_irq(p, dead_cpu, dest_cpu));
+       }
+ move:
+       /* Its affinity may have changed while we were choosing. */
+       if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))
+               goto again;
  }
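The rewritten loop above relies on for_each_cpu_and(), which walks the intersection of two masks directly, so no temporary cpumask (and therefore no allocation) is needed when searching for an allowed, online CPU. Illustrative use, with made-up values:

        int cpu;

        /* visit every online CPU on node 0 */
        for_each_cpu_and(cpu, cpumask_of_node(0), cpu_online_mask)
                printk(KERN_DEBUG "cpu %d is online on node 0\n", cpu);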
  
  /*
   */
  static void migrate_nr_uninterruptible(struct rq *rq_src)
  {
-       struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR));
+       struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask));
        unsigned long flags;
  
        local_irq_save(flags);
@@@ -6470,7 -6600,7 +6595,7 @@@ static void set_rq_online(struct rq *rq
        if (!rq->online) {
                const struct sched_class *class;
  
-               cpu_set(rq->cpu, rq->rd->online);
+               cpumask_set_cpu(rq->cpu, rq->rd->online);
                rq->online = 1;
  
                for_each_class(class) {
@@@ -6490,7 -6620,7 +6615,7 @@@ static void set_rq_offline(struct rq *r
                                class->rq_offline(rq);
                }
  
-               cpu_clear(rq->cpu, rq->rd->online);
+               cpumask_clear_cpu(rq->cpu, rq->rd->online);
                rq->online = 0;
        }
  }
@@@ -6531,7 -6661,7 +6656,7 @@@ migration_call(struct notifier_block *n
                rq = cpu_rq(cpu);
                spin_lock_irqsave(&rq->lock, flags);
                if (rq->rd) {
-                       BUG_ON(!cpu_isset(cpu, rq->rd->span));
+                       BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
  
                        set_rq_online(rq);
                }
                        break;
                /* Unbind it from offline cpu so it can run. Fall thru. */
                kthread_bind(cpu_rq(cpu)->migration_thread,
-                            any_online_cpu(cpu_online_map));
+                            cpumask_any(cpu_online_mask));
                kthread_stop(cpu_rq(cpu)->migration_thread);
                cpu_rq(cpu)->migration_thread = NULL;
                break;
                rq = cpu_rq(cpu);
                spin_lock_irqsave(&rq->lock, flags);
                if (rq->rd) {
-                       BUG_ON(!cpu_isset(cpu, rq->rd->span));
+                       BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
                        set_rq_offline(rq);
                }
                spin_unlock_irqrestore(&rq->lock, flags);
@@@ -6634,13 -6764,13 +6759,13 @@@ early_initcall(migration_init)
  #ifdef CONFIG_SCHED_DEBUG
  
  static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
-                                 cpumask_t *groupmask)
+                                 struct cpumask *groupmask)
  {
        struct sched_group *group = sd->groups;
        char str[256];
  
-       cpulist_scnprintf(str, sizeof(str), sd->span);
-       cpus_clear(*groupmask);
+       cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
+       cpumask_clear(groupmask);
  
        printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
  
  
        printk(KERN_CONT "span %s level %s\n", str, sd->name);
  
-       if (!cpu_isset(cpu, sd->span)) {
+       if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
                printk(KERN_ERR "ERROR: domain->span does not contain "
                                "CPU%d\n", cpu);
        }
-       if (!cpu_isset(cpu, group->cpumask)) {
+       if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
                printk(KERN_ERR "ERROR: domain->groups does not contain"
                                " CPU%d\n", cpu);
        }
                        break;
                }
  
-               if (!cpus_weight(group->cpumask)) {
+               if (!cpumask_weight(sched_group_cpus(group))) {
                        printk(KERN_CONT "\n");
                        printk(KERN_ERR "ERROR: empty group\n");
                        break;
                }
  
-               if (cpus_intersects(*groupmask, group->cpumask)) {
+               if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
                        printk(KERN_CONT "\n");
                        printk(KERN_ERR "ERROR: repeated CPUs\n");
                        break;
                }
  
-               cpus_or(*groupmask, *groupmask, group->cpumask);
+               cpumask_or(groupmask, groupmask, sched_group_cpus(group));
  
-               cpulist_scnprintf(str, sizeof(str), group->cpumask);
+               cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
                printk(KERN_CONT " %s", str);
  
                group = group->next;
        } while (group != sd->groups);
        printk(KERN_CONT "\n");
  
-       if (!cpus_equal(sd->span, *groupmask))
+       if (!cpumask_equal(sched_domain_span(sd), groupmask))
                printk(KERN_ERR "ERROR: groups don't span domain->span\n");
  
-       if (sd->parent && !cpus_subset(*groupmask, sd->parent->span))
+       if (sd->parent &&
+           !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
                printk(KERN_ERR "ERROR: parent span is not a superset "
                        "of domain->span\n");
        return 0;
  
  static void sched_domain_debug(struct sched_domain *sd, int cpu)
  {
-       cpumask_t *groupmask;
+       cpumask_var_t groupmask;
        int level = 0;
  
        if (!sd) {
  
        printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
  
-       groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
-       if (!groupmask) {
+       if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
                printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
                return;
        }
                if (!sd)
                        break;
        }
-       kfree(groupmask);
+       free_cpumask_var(groupmask);
  }
  #else /* !CONFIG_SCHED_DEBUG */
  # define sched_domain_debug(sd, cpu) do { } while (0)
  
  static int sd_degenerate(struct sched_domain *sd)
  {
-       if (cpus_weight(sd->span) == 1)
+       if (cpumask_weight(sched_domain_span(sd)) == 1)
                return 1;
  
        /* Following flags need at least 2 groups */
@@@ -6773,7 -6903,7 +6898,7 @@@ sd_parent_degenerate(struct sched_domai
        if (sd_degenerate(parent))
                return 1;
  
-       if (!cpus_equal(sd->span, parent->span))
+       if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
                return 0;
  
        /* Does parent contain flags not in child? */
        return 1;
  }
  
+ static void free_rootdomain(struct root_domain *rd)
+ {
+       cpupri_cleanup(&rd->cpupri);
+       free_cpumask_var(rd->rto_mask);
+       free_cpumask_var(rd->online);
+       free_cpumask_var(rd->span);
+       kfree(rd);
+ }
  static void rq_attach_root(struct rq *rq, struct root_domain *rd)
  {
        unsigned long flags;
        if (rq->rd) {
                struct root_domain *old_rd = rq->rd;
  
-               if (cpu_isset(rq->cpu, old_rd->online))
+               if (cpumask_test_cpu(rq->cpu, old_rd->online))
                        set_rq_offline(rq);
  
-               cpu_clear(rq->cpu, old_rd->span);
+               cpumask_clear_cpu(rq->cpu, old_rd->span);
  
                if (atomic_dec_and_test(&old_rd->refcount))
-                       kfree(old_rd);
+                       free_rootdomain(old_rd);
        }
  
        atomic_inc(&rd->refcount);
        rq->rd = rd;
  
-       cpu_set(rq->cpu, rd->span);
-       if (cpu_isset(rq->cpu, cpu_online_map))
+       cpumask_set_cpu(rq->cpu, rd->span);
+       if (cpumask_test_cpu(rq->cpu, cpu_online_mask))
                set_rq_online(rq);
  
        spin_unlock_irqrestore(&rq->lock, flags);
  }
  
- static void init_rootdomain(struct root_domain *rd)
+ static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
  {
        memset(rd, 0, sizeof(*rd));
  
-       cpus_clear(rd->span);
-       cpus_clear(rd->online);
+       if (bootmem) {
+               alloc_bootmem_cpumask_var(&def_root_domain.span);
+               alloc_bootmem_cpumask_var(&def_root_domain.online);
+               alloc_bootmem_cpumask_var(&def_root_domain.rto_mask);
+               cpupri_init(&rd->cpupri, true);
+               return 0;
+       }
+       if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
+               goto out;
+       if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
+               goto free_span;
+       if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+               goto free_online;
+       if (cpupri_init(&rd->cpupri, false) != 0)
+               goto free_rto_mask;
+       return 0;
  
-       cpupri_init(&rd->cpupri);
+ free_rto_mask:
+       free_cpumask_var(rd->rto_mask);
+ free_online:
+       free_cpumask_var(rd->online);
+ free_span:
+       free_cpumask_var(rd->span);
+ out:
+       return -ENOMEM;
  }
  
  static void init_defrootdomain(void)
  {
-       init_rootdomain(&def_root_domain);
+       init_rootdomain(&def_root_domain, true);
        atomic_set(&def_root_domain.refcount, 1);
  }
  
@@@ -6849,7 -7013,10 +7008,10 @@@ static struct root_domain *alloc_rootdo
        if (!rd)
                return NULL;
  
-       init_rootdomain(rd);
+       if (init_rootdomain(rd, false) != 0) {
+               kfree(rd);
+               return NULL;
+       }
  
        return rd;
  }
@@@ -6891,19 -7058,12 +7053,12 @@@ cpu_attach_domain(struct sched_domain *
  }
  
  /* cpus with isolated domains */
- static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
+ static cpumask_var_t cpu_isolated_map;
  
  /* Setup the mask of cpus configured for isolated domains */
  static int __init isolated_cpu_setup(char *str)
  {
-       static int __initdata ints[NR_CPUS];
-       int i;
-       str = get_options(str, ARRAY_SIZE(ints), ints);
-       cpus_clear(cpu_isolated_map);
-       for (i = 1; i <= ints[0]; i++)
-               if (ints[i] < NR_CPUS)
-                       cpu_set(ints[i], cpu_isolated_map);
+       cpulist_parse(str, cpu_isolated_map);
        return 1;
  }
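isolated_cpu_setup() now hands its argument to cpulist_parse(), i.e. the standard cpu-list syntax used by other cpu-list parameters, so for example the kernel command line

        isolcpus=1,3-5

would mark CPUs 1, 3, 4 and 5 as isolated from the generic sched domains.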
  
@@@ -6912,42 -7072,43 +7067,43 @@@ __setup("isolcpus=", isolated_cpu_setup
  /*
   * init_sched_build_groups takes the cpumask we wish to span, and a pointer
   * to a function which identifies what group (along with sched group) a CPU
-  * belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS
-  * (due to the fact that we keep track of groups covered with a cpumask_t).
+  * belongs to. The return value of group_fn must be >= 0 and < nr_cpu_ids
+  * (because we keep track of the groups covered with a struct cpumask).
   *
   * init_sched_build_groups will build a circular linked list of the groups
   * covered by the given span, and will set each group's ->cpumask correctly,
   * and ->cpu_power to 0.
   */
  static void
- init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
-                       int (*group_fn)(int cpu, const cpumask_t *cpu_map,
+ init_sched_build_groups(const struct cpumask *span,
+                       const struct cpumask *cpu_map,
+                       int (*group_fn)(int cpu, const struct cpumask *cpu_map,
                                        struct sched_group **sg,
-                                       cpumask_t *tmpmask),
-                       cpumask_t *covered, cpumask_t *tmpmask)
+                                       struct cpumask *tmpmask),
+                       struct cpumask *covered, struct cpumask *tmpmask)
  {
        struct sched_group *first = NULL, *last = NULL;
        int i;
  
-       cpus_clear(*covered);
+       cpumask_clear(covered);
  
-       for_each_cpu_mask_nr(i, *span) {
+       for_each_cpu(i, span) {
                struct sched_group *sg;
                int group = group_fn(i, cpu_map, &sg, tmpmask);
                int j;
  
-               if (cpu_isset(i, *covered))
+               if (cpumask_test_cpu(i, covered))
                        continue;
  
-               cpus_clear(sg->cpumask);
+               cpumask_clear(sched_group_cpus(sg));
                sg->__cpu_power = 0;
  
-               for_each_cpu_mask_nr(j, *span) {
+               for_each_cpu(j, span) {
                        if (group_fn(j, cpu_map, NULL, tmpmask) != group)
                                continue;
  
-                       cpu_set(j, *covered);
-                       cpu_set(j, sg->cpumask);
+                       cpumask_set_cpu(j, covered);
+                       cpumask_set_cpu(j, sched_group_cpus(sg));
                }
                if (!first)
                        first = sg;
@@@ -7011,23 -7172,21 +7167,21 @@@ static int find_next_best_node(int node
   * should be one that prevents unnecessary balancing, but also spreads tasks
   * out optimally.
   */
- static void sched_domain_node_span(int node, cpumask_t *span)
+ static void sched_domain_node_span(int node, struct cpumask *span)
  {
        nodemask_t used_nodes;
-       node_to_cpumask_ptr(nodemask, node);
        int i;
  
-       cpus_clear(*span);
+       cpumask_clear(span);
        nodes_clear(used_nodes);
  
-       cpus_or(*span, *span, *nodemask);
+       cpumask_or(span, span, cpumask_of_node(node));
        node_set(node, used_nodes);
  
        for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
                int next_node = find_next_best_node(node, &used_nodes);
  
-               node_to_cpumask_ptr_next(nodemask, next_node);
-               cpus_or(*span, *span, *nodemask);
+               cpumask_or(span, span, cpumask_of_node(next_node));
        }
  }
  #endif /* CONFIG_NUMA */
  int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
  
  /*
+  * The cpus mask in sched_group and sched_domain hangs off the end.
+  * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
+  * for nr_cpu_ids < CONFIG_NR_CPUS.
+  */
+ struct static_sched_group {
+       struct sched_group sg;
+       DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
+ };
+ struct static_sched_domain {
+       struct sched_domain sd;
+       DECLARE_BITMAP(span, CONFIG_NR_CPUS);
+ };
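The wrapper structs above reserve CONFIG_NR_CPUS bits of storage immediately after the sched_group/sched_domain for the statically allocated per-cpu instances. Dynamically allocated node groups later in this patch get the same layout by over-allocating, e.g.:

        sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
                          GFP_KERNEL, i);

so that sched_group_cpus(sg) can hand back a pointer into that trailing storage instead of requiring a separate cpumask allocation.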
+ /*
   * SMT sched-domains:
   */
  #ifdef CONFIG_SCHED_SMT
- static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
- static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
+ static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
+ static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus);
  
  static int
- cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
-                cpumask_t *unused)
+ cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
+                struct sched_group **sg, struct cpumask *unused)
  {
        if (sg)
-               *sg = &per_cpu(sched_group_cpus, cpu);
+               *sg = &per_cpu(sched_group_cpus, cpu).sg;
        return cpu;
  }
  #endif /* CONFIG_SCHED_SMT */
   * multi-core sched-domains:
   */
  #ifdef CONFIG_SCHED_MC
- static DEFINE_PER_CPU(struct sched_domain, core_domains);
- static DEFINE_PER_CPU(struct sched_group, sched_group_core);
+ static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
+ static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
  #endif /* CONFIG_SCHED_MC */
  
  #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
  static int
- cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
-                 cpumask_t *mask)
+ cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
+                 struct sched_group **sg, struct cpumask *mask)
  {
        int group;
  
-       *mask = per_cpu(cpu_sibling_map, cpu);
-       cpus_and(*mask, *mask, *cpu_map);
-       group = first_cpu(*mask);
+       cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
+       group = cpumask_first(mask);
        if (sg)
-               *sg = &per_cpu(sched_group_core, group);
+               *sg = &per_cpu(sched_group_core, group).sg;
        return group;
  }
  #elif defined(CONFIG_SCHED_MC)
  static int
- cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
-                 cpumask_t *unused)
+ cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
+                 struct sched_group **sg, struct cpumask *unused)
  {
        if (sg)
-               *sg = &per_cpu(sched_group_core, cpu);
+               *sg = &per_cpu(sched_group_core, cpu).sg;
        return cpu;
  }
  #endif
  
- static DEFINE_PER_CPU(struct sched_domain, phys_domains);
- static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
+ static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
+ static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
  
  static int
- cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
-                 cpumask_t *mask)
+ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
+                 struct sched_group **sg, struct cpumask *mask)
  {
        int group;
  #ifdef CONFIG_SCHED_MC
-       *mask = cpu_coregroup_map(cpu);
-       cpus_and(*mask, *mask, *cpu_map);
-       group = first_cpu(*mask);
+       cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
+       group = cpumask_first(mask);
  #elif defined(CONFIG_SCHED_SMT)
-       *mask = per_cpu(cpu_sibling_map, cpu);
-       cpus_and(*mask, *mask, *cpu_map);
-       group = first_cpu(*mask);
+       cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
+       group = cpumask_first(mask);
  #else
        group = cpu;
  #endif
        if (sg)
-               *sg = &per_cpu(sched_group_phys, group);
+               *sg = &per_cpu(sched_group_phys, group).sg;
        return group;
  }
  
   * groups, so roll our own. Now each node has its own list of groups which
   * gets dynamically allocated.
   */
- static DEFINE_PER_CPU(struct sched_domain, node_domains);
+ static DEFINE_PER_CPU(struct static_sched_domain, node_domains);
  static struct sched_group ***sched_group_nodes_bycpu;
  
- static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
- static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
+ static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
+ static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
  
- static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
-                                struct sched_group **sg, cpumask_t *nodemask)
+ static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
+                                struct sched_group **sg,
+                                struct cpumask *nodemask)
  {
        int group;
  
-       *nodemask = node_to_cpumask(cpu_to_node(cpu));
-       cpus_and(*nodemask, *nodemask, *cpu_map);
-       group = first_cpu(*nodemask);
+       cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
+       group = cpumask_first(nodemask);
  
        if (sg)
-               *sg = &per_cpu(sched_group_allnodes, group);
+               *sg = &per_cpu(sched_group_allnodes, group).sg;
        return group;
  }
  
@@@ -7142,11 -7313,11 +7308,11 @@@ static void init_numa_sched_groups_powe
        if (!sg)
                return;
        do {
-               for_each_cpu_mask_nr(j, sg->cpumask) {
+               for_each_cpu(j, sched_group_cpus(sg)) {
                        struct sched_domain *sd;
  
-                       sd = &per_cpu(phys_domains, j);
-                       if (j != first_cpu(sd->groups->cpumask)) {
+                       sd = &per_cpu(phys_domains, j).sd;
+                       if (j != cpumask_first(sched_group_cpus(sd->groups))) {
                                /*
                                 * Only add "power" once for each
                                 * physical package.
  
  #ifdef CONFIG_NUMA
  /* Free memory allocated for various sched_group structures */
- static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
+ static void free_sched_groups(const struct cpumask *cpu_map,
+                             struct cpumask *nodemask)
  {
        int cpu, i;
  
-       for_each_cpu_mask_nr(cpu, *cpu_map) {
+       for_each_cpu(cpu, cpu_map) {
                struct sched_group **sched_group_nodes
                        = sched_group_nodes_bycpu[cpu];
  
                for (i = 0; i < nr_node_ids; i++) {
                        struct sched_group *oldsg, *sg = sched_group_nodes[i];
  
-                       *nodemask = node_to_cpumask(i);
-                       cpus_and(*nodemask, *nodemask, *cpu_map);
-                       if (cpus_empty(*nodemask))
+                       cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
+                       if (cpumask_empty(nodemask))
                                continue;
  
                        if (sg == NULL)
@@@ -7197,7 -7368,8 +7363,8 @@@ next_sg
        }
  }
  #else /* !CONFIG_NUMA */
- static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
+ static void free_sched_groups(const struct cpumask *cpu_map,
+                             struct cpumask *nodemask)
  {
  }
  #endif /* CONFIG_NUMA */
@@@ -7223,7 -7395,7 +7390,7 @@@ static void init_sched_groups_power(in
  
        WARN_ON(!sd || !sd->groups);
  
-       if (cpu != first_cpu(sd->groups->cpumask))
+       if (cpu != cpumask_first(sched_group_cpus(sd->groups)))
                return;
  
        child = sd->child;
@@@ -7288,48 -7460,6 +7455,6 @@@ SD_INIT_FUNC(CPU
   SD_INIT_FUNC(MC)
  #endif
  
- /*
-  * To minimize stack usage kmalloc room for cpumasks and share the
-  * space as the usage in build_sched_domains() dictates.  Used only
-  * if the amount of space is significant.
-  */
- struct allmasks {
-       cpumask_t tmpmask;                      /* make this one first */
-       union {
-               cpumask_t nodemask;
-               cpumask_t this_sibling_map;
-               cpumask_t this_core_map;
-       };
-       cpumask_t send_covered;
- #ifdef CONFIG_NUMA
-       cpumask_t domainspan;
-       cpumask_t covered;
-       cpumask_t notcovered;
- #endif
- };
- #if   NR_CPUS > 128
- #define SCHED_CPUMASK_DECLARE(v)      struct allmasks *v
- static inline void sched_cpumask_alloc(struct allmasks **masks)
- {
-       *masks = kmalloc(sizeof(**masks), GFP_KERNEL);
- }
- static inline void sched_cpumask_free(struct allmasks *masks)
- {
-       kfree(masks);
- }
- #else
- #define SCHED_CPUMASK_DECLARE(v)      struct allmasks _v, *v = &_v
- static inline void sched_cpumask_alloc(struct allmasks **masks)
- { }
- static inline void sched_cpumask_free(struct allmasks *masks)
- { }
- #endif
- #define       SCHED_CPUMASK_VAR(v, a)         cpumask_t *v = (cpumask_t *) \
-                       ((unsigned long)(a) + offsetof(struct allmasks, v))
  static int default_relax_domain_level = -1;
  
  static int __init setup_relax_domain_level(char *str)
@@@ -7369,17 -7499,38 +7494,38 @@@ static void set_domain_attribute(struc
   * Build sched domains for a given set of cpus and attach the sched domains
   * to the individual cpus
   */
- static int __build_sched_domains(const cpumask_t *cpu_map,
+ static int __build_sched_domains(const struct cpumask *cpu_map,
                                 struct sched_domain_attr *attr)
  {
-       int i;
+       int i, err = -ENOMEM;
        struct root_domain *rd;
-       SCHED_CPUMASK_DECLARE(allmasks);
-       cpumask_t *tmpmask;
+       cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered,
+               tmpmask;
  #ifdef CONFIG_NUMA
+       cpumask_var_t domainspan, covered, notcovered;
        struct sched_group **sched_group_nodes = NULL;
        int sd_allnodes = 0;
  
+       if (!alloc_cpumask_var(&domainspan, GFP_KERNEL))
+               goto out;
+       if (!alloc_cpumask_var(&covered, GFP_KERNEL))
+               goto free_domainspan;
+       if (!alloc_cpumask_var(&notcovered, GFP_KERNEL))
+               goto free_covered;
+ #endif
+       if (!alloc_cpumask_var(&nodemask, GFP_KERNEL))
+               goto free_notcovered;
+       if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL))
+               goto free_nodemask;
+       if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL))
+               goto free_this_sibling_map;
+       if (!alloc_cpumask_var(&send_covered, GFP_KERNEL))
+               goto free_this_core_map;
+       if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
+               goto free_send_covered;
+ #ifdef CONFIG_NUMA
        /*
         * Allocate the per-node list of sched groups
         */
                                    GFP_KERNEL);
        if (!sched_group_nodes) {
                printk(KERN_WARNING "Can not alloc sched group node list\n");
-               return -ENOMEM;
+               goto free_tmpmask;
        }
  #endif
  
        rd = alloc_rootdomain();
        if (!rd) {
                printk(KERN_WARNING "Cannot alloc root domain\n");
- #ifdef CONFIG_NUMA
-               kfree(sched_group_nodes);
- #endif
-               return -ENOMEM;
-       }
-       /* get space for all scratch cpumask variables */
-       sched_cpumask_alloc(&allmasks);
-       if (!allmasks) {
-               printk(KERN_WARNING "Cannot alloc cpumask array\n");
-               kfree(rd);
- #ifdef CONFIG_NUMA
-               kfree(sched_group_nodes);
- #endif
-               return -ENOMEM;
+               goto free_sched_groups;
        }
  
-       tmpmask = (cpumask_t *)allmasks;
  #ifdef CONFIG_NUMA
-       sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
+       sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes;
  #endif
  
        /*
         * Set up domains for cpus specified by the cpu_map.
         */
-       for_each_cpu_mask_nr(i, *cpu_map) {
+       for_each_cpu(i, cpu_map) {
                struct sched_domain *sd = NULL, *p;
-               SCHED_CPUMASK_VAR(nodemask, allmasks);
  
-               *nodemask = node_to_cpumask(cpu_to_node(i));
-               cpus_and(*nodemask, *nodemask, *cpu_map);
+               cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map);
  
  #ifdef CONFIG_NUMA
-               if (cpus_weight(*cpu_map) >
-                               SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) {
-                       sd = &per_cpu(allnodes_domains, i);
+               if (cpumask_weight(cpu_map) >
+                               SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {
+                       sd = &per_cpu(allnodes_domains, i).sd;
                        SD_INIT(sd, ALLNODES);
                        set_domain_attribute(sd, attr);
-                       sd->span = *cpu_map;
+                       cpumask_copy(sched_domain_span(sd), cpu_map);
                        cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
                        p = sd;
                        sd_allnodes = 1;
                } else
                        p = NULL;
  
-               sd = &per_cpu(node_domains, i);
+               sd = &per_cpu(node_domains, i).sd;
                SD_INIT(sd, NODE);
                set_domain_attribute(sd, attr);
-               sched_domain_node_span(cpu_to_node(i), &sd->span);
+               sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
                sd->parent = p;
                if (p)
                        p->child = sd;
-               cpus_and(sd->span, sd->span, *cpu_map);
+               cpumask_and(sched_domain_span(sd),
+                           sched_domain_span(sd), cpu_map);
  #endif
  
                p = sd;
-               sd = &per_cpu(phys_domains, i);
+               sd = &per_cpu(phys_domains, i).sd;
                SD_INIT(sd, CPU);
                set_domain_attribute(sd, attr);
-               sd->span = *nodemask;
+               cpumask_copy(sched_domain_span(sd), nodemask);
                sd->parent = p;
                if (p)
                        p->child = sd;
  
  #ifdef CONFIG_SCHED_MC
                p = sd;
-               sd = &per_cpu(core_domains, i);
+               sd = &per_cpu(core_domains, i).sd;
                SD_INIT(sd, MC);
                set_domain_attribute(sd, attr);
-               sd->span = cpu_coregroup_map(i);
-               cpus_and(sd->span, sd->span, *cpu_map);
+               cpumask_and(sched_domain_span(sd), cpu_map,
+                                                  cpu_coregroup_mask(i));
                sd->parent = p;
                p->child = sd;
                cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
  
  #ifdef CONFIG_SCHED_SMT
                p = sd;
-               sd = &per_cpu(cpu_domains, i);
+               sd = &per_cpu(cpu_domains, i).sd;
                SD_INIT(sd, SIBLING);
                set_domain_attribute(sd, attr);
-               sd->span = per_cpu(cpu_sibling_map, i);
-               cpus_and(sd->span, sd->span, *cpu_map);
+               cpumask_and(sched_domain_span(sd),
+                           &per_cpu(cpu_sibling_map, i), cpu_map);
                sd->parent = p;
                p->child = sd;
                cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
  
  #ifdef CONFIG_SCHED_SMT
        /* Set up CPU (sibling) groups */
-       for_each_cpu_mask_nr(i, *cpu_map) {
-               SCHED_CPUMASK_VAR(this_sibling_map, allmasks);
-               SCHED_CPUMASK_VAR(send_covered, allmasks);
-               *this_sibling_map = per_cpu(cpu_sibling_map, i);
-               cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map);
-               if (i != first_cpu(*this_sibling_map))
+       for_each_cpu(i, cpu_map) {
+               cpumask_and(this_sibling_map,
+                           &per_cpu(cpu_sibling_map, i), cpu_map);
+               if (i != cpumask_first(this_sibling_map))
                        continue;
  
                init_sched_build_groups(this_sibling_map, cpu_map,
  
  #ifdef CONFIG_SCHED_MC
        /* Set up multi-core groups */
-       for_each_cpu_mask_nr(i, *cpu_map) {
-               SCHED_CPUMASK_VAR(this_core_map, allmasks);
-               SCHED_CPUMASK_VAR(send_covered, allmasks);
-               *this_core_map = cpu_coregroup_map(i);
-               cpus_and(*this_core_map, *this_core_map, *cpu_map);
-               if (i != first_cpu(*this_core_map))
+       for_each_cpu(i, cpu_map) {
+               cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map);
+               if (i != cpumask_first(this_core_map))
                        continue;
  
                init_sched_build_groups(this_core_map, cpu_map,
  
        /* Set up physical groups */
        for (i = 0; i < nr_node_ids; i++) {
-               SCHED_CPUMASK_VAR(nodemask, allmasks);
-               SCHED_CPUMASK_VAR(send_covered, allmasks);
-               *nodemask = node_to_cpumask(i);
-               cpus_and(*nodemask, *nodemask, *cpu_map);
-               if (cpus_empty(*nodemask))
+               cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
+               if (cpumask_empty(nodemask))
                        continue;
  
                init_sched_build_groups(nodemask, cpu_map,
  #ifdef CONFIG_NUMA
        /* Set up node groups */
        if (sd_allnodes) {
-               SCHED_CPUMASK_VAR(send_covered, allmasks);
                init_sched_build_groups(cpu_map, cpu_map,
                                        &cpu_to_allnodes_group,
                                        send_covered, tmpmask);
        for (i = 0; i < nr_node_ids; i++) {
                /* Set up node groups */
                struct sched_group *sg, *prev;
-               SCHED_CPUMASK_VAR(nodemask, allmasks);
-               SCHED_CPUMASK_VAR(domainspan, allmasks);
-               SCHED_CPUMASK_VAR(covered, allmasks);
                int j;
  
-               *nodemask = node_to_cpumask(i);
-               cpus_clear(*covered);
-               cpus_and(*nodemask, *nodemask, *cpu_map);
-               if (cpus_empty(*nodemask)) {
+               cpumask_clear(covered);
+               cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
+               if (cpumask_empty(nodemask)) {
                        sched_group_nodes[i] = NULL;
                        continue;
                }
  
                sched_domain_node_span(i, domainspan);
-               cpus_and(*domainspan, *domainspan, *cpu_map);
+               cpumask_and(domainspan, domainspan, cpu_map);
  
-               sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
+               sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
+                                 GFP_KERNEL, i);
                if (!sg) {
                        printk(KERN_WARNING "Can not alloc domain group for "
                                "node %d\n", i);
                        goto error;
                }
                sched_group_nodes[i] = sg;
-               for_each_cpu_mask_nr(j, *nodemask) {
+               for_each_cpu(j, nodemask) {
                        struct sched_domain *sd;
  
-                       sd = &per_cpu(node_domains, j);
+                       sd = &per_cpu(node_domains, j).sd;
                        sd->groups = sg;
                }
                sg->__cpu_power = 0;
-               sg->cpumask = *nodemask;
+               cpumask_copy(sched_group_cpus(sg), nodemask);
                sg->next = sg;
-               cpus_or(*covered, *covered, *nodemask);
+               cpumask_or(covered, covered, nodemask);
                prev = sg;
  
                for (j = 0; j < nr_node_ids; j++) {
-                       SCHED_CPUMASK_VAR(notcovered, allmasks);
                        int n = (i + j) % nr_node_ids;
-                       node_to_cpumask_ptr(pnodemask, n);
  
-                       cpus_complement(*notcovered, *covered);
-                       cpus_and(*tmpmask, *notcovered, *cpu_map);
-                       cpus_and(*tmpmask, *tmpmask, *domainspan);
-                       if (cpus_empty(*tmpmask))
+                       cpumask_complement(notcovered, covered);
+                       cpumask_and(tmpmask, notcovered, cpu_map);
+                       cpumask_and(tmpmask, tmpmask, domainspan);
+                       if (cpumask_empty(tmpmask))
                                break;
  
-                       cpus_and(*tmpmask, *tmpmask, *pnodemask);
-                       if (cpus_empty(*tmpmask))
+                       cpumask_and(tmpmask, tmpmask, cpumask_of_node(n));
+                       if (cpumask_empty(tmpmask))
                                continue;
  
-                       sg = kmalloc_node(sizeof(struct sched_group),
+                       sg = kmalloc_node(sizeof(struct sched_group) +
+                                         cpumask_size(),
                                          GFP_KERNEL, i);
                        if (!sg) {
                                printk(KERN_WARNING
                                goto error;
                        }
                        sg->__cpu_power = 0;
-                       sg->cpumask = *tmpmask;
+                       cpumask_copy(sched_group_cpus(sg), tmpmask);
                        sg->next = prev->next;
-                       cpus_or(*covered, *covered, *tmpmask);
+                       cpumask_or(covered, covered, tmpmask);
                        prev->next = sg;
                        prev = sg;
                }
  
        /* Calculate CPU power for physical packages and nodes */
  #ifdef CONFIG_SCHED_SMT
-       for_each_cpu_mask_nr(i, *cpu_map) {
-               struct sched_domain *sd = &per_cpu(cpu_domains, i);
+       for_each_cpu(i, cpu_map) {
+               struct sched_domain *sd = &per_cpu(cpu_domains, i).sd;
  
                init_sched_groups_power(i, sd);
        }
  #endif
  #ifdef CONFIG_SCHED_MC
-       for_each_cpu_mask_nr(i, *cpu_map) {
-               struct sched_domain *sd = &per_cpu(core_domains, i);
+       for_each_cpu(i, cpu_map) {
+               struct sched_domain *sd = &per_cpu(core_domains, i).sd;
  
                init_sched_groups_power(i, sd);
        }
  #endif
  
-       for_each_cpu_mask_nr(i, *cpu_map) {
-               struct sched_domain *sd = &per_cpu(phys_domains, i);
+       for_each_cpu(i, cpu_map) {
+               struct sched_domain *sd = &per_cpu(phys_domains, i).sd;
  
                init_sched_groups_power(i, sd);
        }
        if (sd_allnodes) {
                struct sched_group *sg;
  
-               cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg,
+               cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
                                                                tmpmask);
                init_numa_sched_groups_power(sg);
        }
  #endif
  
        /* Attach the domains */
-       for_each_cpu_mask_nr(i, *cpu_map) {
+       for_each_cpu(i, cpu_map) {
                struct sched_domain *sd;
  #ifdef CONFIG_SCHED_SMT
-               sd = &per_cpu(cpu_domains, i);
+               sd = &per_cpu(cpu_domains, i).sd;
  #elif defined(CONFIG_SCHED_MC)
-               sd = &per_cpu(core_domains, i);
+               sd = &per_cpu(core_domains, i).sd;
  #else
-               sd = &per_cpu(phys_domains, i);
+               sd = &per_cpu(phys_domains, i).sd;
  #endif
                cpu_attach_domain(sd, rd, i);
        }
  
-       sched_cpumask_free(allmasks);
-       return 0;
+       err = 0;
+ free_tmpmask:
+       free_cpumask_var(tmpmask);
+ free_send_covered:
+       free_cpumask_var(send_covered);
+ free_this_core_map:
+       free_cpumask_var(this_core_map);
+ free_this_sibling_map:
+       free_cpumask_var(this_sibling_map);
+ free_nodemask:
+       free_cpumask_var(nodemask);
+ free_notcovered:
+ #ifdef CONFIG_NUMA
+       free_cpumask_var(notcovered);
+ free_covered:
+       free_cpumask_var(covered);
+ free_domainspan:
+       free_cpumask_var(domainspan);
+ out:
+ #endif
+       return err;
+ free_sched_groups:
+ #ifdef CONFIG_NUMA
+       kfree(sched_group_nodes);
+ #endif
+       goto free_tmpmask;
  
  #ifdef CONFIG_NUMA
  error:
        free_sched_groups(cpu_map, tmpmask);
-       sched_cpumask_free(allmasks);
-       kfree(rd);
-       return -ENOMEM;
+       free_rootdomain(rd);
+       goto free_tmpmask;
  #endif
  }
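
The error handling above follows the cpumask_var_t idiom: every scratch mask is allocated up front with alloc_cpumask_var(), and both the failure paths and the success path fall through a ladder of labels that free the masks in reverse order. A minimal sketch of the same pattern, with illustrative names only (alloc_cpumask_var() and free_cpumask_var() come from <linux/cpumask.h>):

	static int example_build(void)
	{
		cpumask_var_t a, b;
		int err = -ENOMEM;

		if (!alloc_cpumask_var(&a, GFP_KERNEL))
			goto out;
		if (!alloc_cpumask_var(&b, GFP_KERNEL))
			goto free_a;

		/* ... build domains using a and b ... */

		err = 0;		/* success also falls through the frees */
		free_cpumask_var(b);
	free_a:
		free_cpumask_var(a);
	out:
		return err;
	}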
  
- static int build_sched_domains(const cpumask_t *cpu_map)
+ static int build_sched_domains(const struct cpumask *cpu_map)
  {
        return __build_sched_domains(cpu_map, NULL);
  }
  
- static cpumask_t *doms_cur;   /* current sched domains */
+ static struct cpumask *doms_cur;      /* current sched domains */
  static int ndoms_cur;         /* number of sched domains in 'doms_cur' */
  static struct sched_domain_attr *dattr_cur;
                                /* attributes of custom domains in 'doms_cur' */
  
  /*
   * Special case: If a kmalloc of a doms_cur partition (array of
-  * cpumask_t) fails, then fallback to a single sched domain,
-  * as determined by the single cpumask_t fallback_doms.
+  * cpumask) fails, then fall back to a single sched domain,
+  * as determined by the single cpumask fallback_doms.
   */
- static cpumask_t fallback_doms;
+ static cpumask_var_t fallback_doms;
  
  /*
   * arch_update_cpu_topology lets virtualized architectures update the
@@@ -7708,16 -7848,16 +7843,16 @@@ int __attribute__((weak)) arch_update_c
   * For now this just excludes isolated cpus, but could be used to
   * exclude other special cases in the future.
   */
- static int arch_init_sched_domains(const cpumask_t *cpu_map)
+ static int arch_init_sched_domains(const struct cpumask *cpu_map)
  {
        int err;
  
        arch_update_cpu_topology();
        ndoms_cur = 1;
-       doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
+       doms_cur = kmalloc(cpumask_size(), GFP_KERNEL);
        if (!doms_cur)
-               doms_cur = &fallback_doms;
-       cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map);
+               doms_cur = fallback_doms;
+       cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map);
        dattr_cur = NULL;
        err = build_sched_domains(doms_cur);
        register_sched_domain_sysctl();
        return err;
  }
  
- static void arch_destroy_sched_domains(const cpumask_t *cpu_map,
-                                      cpumask_t *tmpmask)
+ static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
+                                      struct cpumask *tmpmask)
  {
        free_sched_groups(cpu_map, tmpmask);
  }
   * Detach sched domains from a group of cpus specified in cpu_map
   * These cpus will now be attached to the NULL domain
   */
- static void detach_destroy_domains(const cpumask_t *cpu_map)
+ static void detach_destroy_domains(const struct cpumask *cpu_map)
  {
-       cpumask_t tmpmask;
+       /* Safe because the hotplug lock is held. */
+       static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
        int i;
  
-       for_each_cpu_mask_nr(i, *cpu_map)
+       for_each_cpu(i, cpu_map)
                cpu_attach_domain(NULL, &def_root_domain, i);
        synchronize_sched();
-       arch_destroy_sched_domains(cpu_map, &tmpmask);
+       arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
  }
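
detach_destroy_domains() now takes its scratch mask from a static bitmap rather than the stack; to_cpumask() gives a struct cpumask view of such a bitmap, which is only safe here because the hotplug lock serializes callers. A small sketch of that pattern, with illustrative names (not from the patch):

	/* static scratch mask: only safe if callers are serialized */
	static DECLARE_BITMAP(scratch_bits, NR_CPUS);

	static void example_use_scratch(const struct cpumask *cpu_map)
	{
		struct cpumask *scratch = to_cpumask(scratch_bits);

		cpumask_copy(scratch, cpu_map);
		cpumask_clear_cpu(0, scratch);	/* e.g. drop CPU 0 */
	}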
  
  /* handle null as "default" */
@@@ -7768,7 -7909,7 +7904,7 @@@ static int dattrs_equal(struct sched_do
   * doms_new[] to the current sched domain partitioning, doms_cur[].
   * It destroys each deleted domain and builds each new domain.
   *
-  * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'.
+  * 'doms_new' is an array of cpumask's of length 'ndoms_new'.
   * The masks don't intersect (don't overlap). We should set up one
   * sched domain for each mask. CPUs not in any of the cpumasks will
   * not be load balanced. If the same cpumask appears both in the
   * the single partition 'fallback_doms', it also forces the domains
   * to be rebuilt.
   *
-  * If doms_new == NULL it will be replaced with cpu_online_map.
+  * If doms_new == NULL it will be replaced with cpu_online_mask.
   * ndoms_new == 0 is a special case for destroying existing domains,
   * and it will not create the default domain.
   *
   * Call with hotplug lock held
   */
- void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
+ /* FIXME: Change to struct cpumask *doms_new[] */
+ void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
                             struct sched_domain_attr *dattr_new)
  {
        int i, j, n;
        /* Destroy deleted domains */
        for (i = 0; i < ndoms_cur; i++) {
                for (j = 0; j < n && !new_topology; j++) {
-                       if (cpus_equal(doms_cur[i], doms_new[j])
+                       if (cpumask_equal(&doms_cur[i], &doms_new[j])
                            && dattrs_equal(dattr_cur, i, dattr_new, j))
                                goto match1;
                }
@@@ -7819,15 -7961,15 +7956,15 @@@ match1
  
        if (doms_new == NULL) {
                ndoms_cur = 0;
-               doms_new = &fallback_doms;
-               cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
+               doms_new = fallback_doms;
+               cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map);
                WARN_ON_ONCE(dattr_new);
        }
  
        /* Build new domains */
        for (i = 0; i < ndoms_new; i++) {
                for (j = 0; j < ndoms_cur && !new_topology; j++) {
-                       if (cpus_equal(doms_new[i], doms_cur[j])
+                       if (cpumask_equal(&doms_new[i], &doms_cur[j])
                            && dattrs_equal(dattr_new, i, dattr_cur, j))
                                goto match2;
                }
@@@ -7839,7 -7981,7 +7976,7 @@@ match2
        }
  
        /* Remember the new sched domains */
-       if (doms_cur != &fallback_doms)
+       if (doms_cur != fallback_doms)
                kfree(doms_cur);
        kfree(dattr_cur);       /* kfree(NULL) is safe */
        doms_cur = doms_new;
  }
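
As the block comment above says, partition_sched_domains() takes an array of non-overlapping cpumasks (plus optional attributes) and keeps it as doms_cur until the next repartition, so the caller must not free it. A hedged sketch of a caller, assuming it already holds the hotplug lock as required; 'doms' and the single-domain choice are illustrative, not from the patch:

	struct cpumask *doms = kmalloc(cpumask_size(), GFP_KERNEL);

	if (doms) {
		/* one domain covering every online CPU */
		cpumask_copy(doms, cpu_online_mask);
		partition_sched_domains(1, doms, NULL);
		/* the scheduler now owns 'doms'; do not kfree() it here */
	}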
  
  #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
- int arch_reinit_sched_domains(void)
+ static void arch_reinit_sched_domains(void)
  {
        get_online_cpus();
  
  
        rebuild_sched_domains();
        put_online_cpus();
-       return 0;
  }
  
  static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
  {
-       int ret;
+       unsigned int level = 0;
+       if (sscanf(buf, "%u", &level) != 1)
+               return -EINVAL;
+       /*
+        * level is always positive, so don't check for
+        * level < POWERSAVINGS_BALANCE_NONE, which is 0.
+        * What happens on a 0 or 1 byte write?  Do we
+        * need to check count as well?
+        */
  
-       if (buf[0] != '0' && buf[0] != '1')
+       if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
                return -EINVAL;
  
        if (smt)
-               sched_smt_power_savings = (buf[0] == '1');
+               sched_smt_power_savings = level;
        else
-               sched_mc_power_savings = (buf[0] == '1');
+               sched_mc_power_savings = level;
  
-       ret = arch_reinit_sched_domains();
+       arch_reinit_sched_domains();
  
-       return ret ? ret : count;
+       return count;
  }
  
  #ifdef CONFIG_SCHED_MC
@@@ -7914,7 -8064,7 +8059,7 @@@ static SYSDEV_CLASS_ATTR(sched_smt_powe
                   sched_smt_power_savings_store);
  #endif
  
- int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
+ int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
  {
        int err = 0;
  
@@@ -7979,7 -8129,9 +8124,9 @@@ static int update_runtime(struct notifi
  
  void __init sched_init_smp(void)
  {
-       cpumask_t non_isolated_cpus;
+       cpumask_var_t non_isolated_cpus;
+       alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
  
  #if defined(CONFIG_NUMA)
        sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
  #endif
        get_online_cpus();
        mutex_lock(&sched_domains_mutex);
-       arch_init_sched_domains(&cpu_online_map);
-       cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
-       if (cpus_empty(non_isolated_cpus))
-               cpu_set(smp_processor_id(), non_isolated_cpus);
+       arch_init_sched_domains(cpu_online_mask);
+       cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
+       if (cpumask_empty(non_isolated_cpus))
+               cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
        mutex_unlock(&sched_domains_mutex);
        put_online_cpus();
  
        init_hrtick();
  
        /* Move init over to a non-isolated CPU */
-       if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0)
+       if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
                BUG();
        sched_init_granularity();
+       free_cpumask_var(non_isolated_cpus);
+       alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
+       init_sched_rt_class();
  }
  #else
  void __init sched_init_smp(void)
@@@ -8323,6 -8479,15 +8474,15 @@@ void __init sched_init(void
         */
        current->sched_class = &fair_sched_class;
  
+       /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
+       alloc_bootmem_cpumask_var(&nohz_cpu_mask);
+ #ifdef CONFIG_SMP
+ #ifdef CONFIG_NO_HZ
+       alloc_bootmem_cpumask_var(&nohz.cpu_mask);
+ #endif
+       alloc_bootmem_cpumask_var(&cpu_isolated_map);
+ #endif /* SMP */
        scheduler_running = 1;
  }