Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu
author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 15 Sep 2009 16:39:44 +0000 (09:39 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 15 Sep 2009 16:39:44 +0000 (09:39 -0700)
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu: (46 commits)
  powerpc64: convert to dynamic percpu allocator
  sparc64: use embedding percpu first chunk allocator
  percpu: kill lpage first chunk allocator
  x86,percpu: use embedding for 64bit NUMA and page for 32bit NUMA
  percpu: update embedding first chunk allocator to handle sparse units
  percpu: use group information to allocate vmap areas sparsely
  vmalloc: implement pcpu_get_vm_areas()
  vmalloc: separate out insert_vmalloc_vm()
  percpu: add chunk->base_addr
  percpu: add pcpu_unit_offsets[]
  percpu: introduce pcpu_alloc_info and pcpu_group_info
  percpu: move pcpu_lpage_build_unit_map() and pcpul_lpage_dump_cfg() upward
  percpu: add @align to pcpu_fc_alloc_fn_t
  percpu: make @dyn_size mandatory for pcpu_setup_first_chunk()
  percpu: drop @static_size from first chunk allocators
  percpu: generalize first chunk allocator selection
  percpu: build first chunk allocators selectively
  percpu: rename 4k first chunk allocator to page
  percpu: improve boot messages
  percpu: fix pcpu_reclaim() locking
  ...

Fix trivial conflict as per Tejun Heo in kernel/sched.c

27 files changed:
Documentation/kernel-parameters.txt
Makefile
arch/ia64/Kconfig
arch/powerpc/mm/stab.c
arch/s390/kernel/vmlinux.lds.S
arch/sparc/Kconfig
arch/x86/Kconfig
arch/x86/include/asm/percpu.h
arch/x86/kernel/cpu/mcheck/mce.c
arch/x86/kernel/cpu/mcheck/mce_amd.c
arch/x86/kernel/cpu/perf_counter.c
arch/x86/kernel/vmlinux.lds.S
arch/x86/mm/pageattr.c
block/cfq-iosched.c
include/linux/percpu-defs.h
init/main.c
kernel/module.c
kernel/perf_counter.c
kernel/sched.c
kernel/trace/trace_events.c
lib/Kconfig.debug
mm/Makefile
mm/page-writeback.c
mm/slub.c
net/rds/ib_stats.c
net/rds/iw_stats.c
net/rds/page.c

@@@ -57,7 -57,6 +57,7 @@@ parameter is applicable
        ISAPNP  ISA PnP code is enabled.
        ISDN    Appropriate ISDN support is enabled.
        JOY     Appropriate joystick support is enabled.
 +      KVM     Kernel Virtual Machine support is enabled.
        LIBATA  Libata driver is enabled
        LP      Printer support is enabled.
        LOOP    Loopback device support is enabled.
@@@ -1099,44 -1098,6 +1099,44 @@@ and is between 256 and 4096 characters
        kstack=N        [X86] Print N words from the kernel stack
                        in oops dumps.
  
 +      kvm.ignore_msrs=[KVM] Ignore guest accesses to unhandled MSRs.
 +                      Default is 0 (don't ignore, but inject #GP)
 +
 +      kvm.oos_shadow= [KVM] Disable out-of-sync shadow paging.
 +                      Default is 1 (enabled)
 +
 +      kvm-amd.nested= [KVM,AMD] Allow nested virtualization in KVM/SVM.
 +                      Default is 0 (off)
 +
 +      kvm-amd.npt=    [KVM,AMD] Disable nested paging (virtualized MMU)
 +                      for all guests.
 +                      Default is 1 (enabled) if in 64bit or 32bit-PAE mode
 +
 +      kvm-intel.bypass_guest_pf=
 +                      [KVM,Intel] Disables bypassing of guest page faults
 +                      on Intel chips. Default is 1 (enabled)
 +
 +      kvm-intel.ept=  [KVM,Intel] Disable extended page tables
 +                      (virtualized MMU) support on capable Intel chips.
 +                      Default is 1 (enabled)
 +
 +      kvm-intel.emulate_invalid_guest_state=
 +                      [KVM,Intel] Enable emulation of invalid guest states
 +                      Default is 0 (disabled)
 +
 +      kvm-intel.flexpriority=
 +                      [KVM,Intel] Disable FlexPriority feature (TPR shadow).
 +                      Default is 1 (enabled)
 +
 +      kvm-intel.unrestricted_guest=
 +                      [KVM,Intel] Disable unrestricted guest feature
 +                      (virtualized real and unpaged mode) on capable
 +                      Intel chips. Default is 1 (enabled)
 +
 +      kvm-intel.vpid= [KVM,Intel] Disable Virtual Processor Identification
 +                      feature (tagged TLBs) on capable Intel chips.
 +                      Default is 1 (enabled)
 +
        l2cr=           [PPC]
  
        l3cr=           [PPC]
                        [NFS] set the TCP port on which the NFSv4 callback
                        channel should listen.
  
 +      nfs.cache_getent=
 +                      [NFS] sets the pathname to the program which is used
 +                      to update the NFS client cache entries.
 +
 +      nfs.cache_getent_timeout=
 +                      [NFS] sets the timeout after which an attempt to
 +                      update a cache entry is deemed to have failed.
 +
        nfs.idmap_cache_timeout=
                        [NFS] set the maximum lifetime for idmapper cache
                        entries.
                        symbolic names: lapic and ioapic
                        Example: nmi_watchdog=2 or nmi_watchdog=panic,lapic
  
 +      netpoll.carrier_timeout=
 +                      [NET] Specifies amount of time (in seconds) that
 +                      netpoll should wait for a carrier. By default netpoll
 +                      waits 4 seconds.
 +
        no387           [BUGS=X86-32] Tells the kernel to use the 387 maths
                        emulation library even if a 387 maths coprocessor
                        is present.
                        Format: { 0 | 1 }
                        See arch/parisc/kernel/pdc_chassis.c
  
-       percpu_alloc=   [X86] Select which percpu first chunk allocator to use.
-                       Allowed values are one of "lpage", "embed" and "4k".
-                       See comments in arch/x86/kernel/setup_percpu.c for
-                       details on each allocator.  This parameter is primarily
-                       for debugging and performance comparison.
+       percpu_alloc=   Select which percpu first chunk allocator to use.
+                       Currently supported values are "embed" and "page".
+                       Archs may support subset or none of the selections.
+                       See comments in mm/percpu.c for details on each
+                       allocator.  This parameter is primarily for debugging
+                       and performance comparison.
  
        pf.             [PARIDE]
                        See Documentation/blockdev/paride.txt.
        stifb=          [HW]
                        Format: bpp:<bpp1>[:<bpp2>[:<bpp3>...]]
  
 +      sunrpc.min_resvport=
 +      sunrpc.max_resvport=
 +                      [NFS,SUNRPC]
 +                      SunRPC servers often require that client requests
 +                      originate from a privileged port (i.e. a port in the
 +                      range 0 < portnr < 1024).
 +                      An administrator who wishes to reserve some of these
 +                      ports for other uses may adjust the range that the
 +                      kernel's sunrpc client considers to be privileged
 +                      using these two parameters to set the minimum and
 +                      maximum port values.
 +
        sunrpc.pool_mode=
                        [NFS]
                        Control how the NFS server code allocates CPUs to
                        pernode     one pool for each NUMA node (equivalent
                                    to global on non-NUMA machines)
  
 +      sunrpc.tcp_slot_table_entries=
 +      sunrpc.udp_slot_table_entries=
 +                      [NFS,SUNRPC]
 +                      Sets the upper limit on the number of simultaneous
 +                      RPC calls that can be sent from the client to a
 +                      server. Increasing these values may allow you to
 +                      improve throughput, but will also increase the
 +                      amount of memory reserved for use by the client.
 +
        swiotlb=        [IA-64] Number of I/O TLB slabs
  
        switches=       [HW,M68k]
        trace_buf_size=nn[KMG]
                        [FTRACE] will set tracing buffer size.
  
 +      trace_event=[event-list]
 +                      [FTRACE] Set and start specified trace events in order
 +                      to facilitate early boot debugging.
 +                      See also Documentation/trace/events.txt
 +
        trix=           [HW,OSS] MediaTrix AudioTrix Pro
                        Format:
                        <io>,<irq>,<dma>,<dma2>,<sb_io>,<sb_irq>,<sb_dma>,<mpu_io>,<mpu_irq>
diff --combined Makefile
+++ b/Makefile
@@@ -1,7 -1,7 +1,7 @@@
  VERSION = 2
  PATCHLEVEL = 6
  SUBLEVEL = 31
 -EXTRAVERSION = -rc6
 +EXTRAVERSION =
  NAME = Man-Eating Seals of Antiquity
  
  # *DOCUMENTATION*
@@@ -325,7 -325,7 +325,7 @@@ CHECKFLAGS     := -D__linux__ -Dlinux -
  MODFLAGS      = -DMODULE
  CFLAGS_MODULE   = $(MODFLAGS)
  AFLAGS_MODULE   = $(MODFLAGS)
- LDFLAGS_MODULE  =
+ LDFLAGS_MODULE  = -T $(srctree)/scripts/module-common.lds
  CFLAGS_KERNEL =
  AFLAGS_KERNEL =
  CFLAGS_GCOV   = -fprofile-arcs -ftest-coverage
diff --combined arch/ia64/Kconfig
@@@ -89,6 -89,9 +89,9 @@@ config GENERIC_TIME_VSYSCAL
        bool
        default y
  
+ config HAVE_LEGACY_PER_CPU_AREA
+       def_bool y
  config HAVE_SETUP_PER_CPU_AREA
        def_bool y
  
@@@ -112,10 -115,6 +115,10 @@@ config IA64_UNCACHED_ALLOCATO
        bool
        select GENERIC_ALLOCATOR
  
 +config ARCH_USES_PG_UNCACHED
 +      def_bool y
 +      depends on IA64_UNCACHED_ALLOCATOR
 +
  config AUDIT_ARCH
        bool
        default y
diff --combined arch/powerpc/mm/stab.c
@@@ -31,7 -31,7 +31,7 @@@ struct stab_entry 
  
  #define NR_STAB_CACHE_ENTRIES 8
  static DEFINE_PER_CPU(long, stab_cache_ptr);
- static DEFINE_PER_CPU(long, stab_cache[NR_STAB_CACHE_ENTRIES]);
+ static DEFINE_PER_CPU(long [NR_STAB_CACHE_ENTRIES], stab_cache);
  
  /*
   * Create a segment table entry for the given esid/vsid pair.
@@@ -164,7 -164,7 +164,7 @@@ void switch_stab(struct task_struct *ts
  {
        struct stab_entry *stab = (struct stab_entry *) get_paca()->stab_addr;
        struct stab_entry *ste;
 -      unsigned long offset = __get_cpu_var(stab_cache_ptr);
 +      unsigned long offset;
        unsigned long pc = KSTK_EIP(tsk);
        unsigned long stack = KSTK_ESP(tsk);
        unsigned long unmapped_base;
        /* Force previous translations to complete. DRENG */
        asm volatile("isync" : : : "memory");
  
 +      /*
 +       * We need interrupts hard-disabled here, not just soft-disabled,
 +       * so that a PMU interrupt can't occur, which might try to access
 +       * user memory (to get a stack trace) and possibly cause an STAB miss
 +       * which would update the stab_cache/stab_cache_ptr per-cpu variables.
 +       */
 +      hard_irq_disable();
 +
 +      offset = __get_cpu_var(stab_cache_ptr);
        if (offset <= NR_STAB_CACHE_ENTRIES) {
                int i;
  
@@@ -52,18 -52,55 +52,18 @@@ SECTION
        . = ALIGN(PAGE_SIZE);
        _eshared = .;           /* End of shareable data */
  
 -      . = ALIGN(16);          /* Exception table */
 -      __ex_table : {
 -              __start___ex_table = .;
 -              *(__ex_table)
 -              __stop___ex_table = .;
 -      } :data
 -
 -      .data : {               /* Data */
 -              DATA_DATA
 -              CONSTRUCTORS
 -      }
 -
 -      . = ALIGN(PAGE_SIZE);
 -      .data_nosave : {
 -      __nosave_begin = .;
 -              *(.data.nosave)
 -      }
 -      . = ALIGN(PAGE_SIZE);
 -      __nosave_end = .;
 -
 -      . = ALIGN(PAGE_SIZE);
 -      .data.page_aligned : {
 -              *(.data.idt)
 -      }
 +      EXCEPTION_TABLE(16) :data
  
 -      . = ALIGN(0x100);
 -      .data.cacheline_aligned : {
 -              *(.data.cacheline_aligned)
 -      }
 +      RW_DATA_SECTION(0x100, PAGE_SIZE, THREAD_SIZE)
  
 -      . = ALIGN(0x100);
 -      .data.read_mostly : {
 -              *(.data.read_mostly)
 -      }
        _edata = .;             /* End of data section */
  
 -      . = ALIGN(THREAD_SIZE); /* init_task */
 -      .data.init_task : {
 -              *(.data.init_task)
 -      }
 -
        /* will be freed after init */
        . = ALIGN(PAGE_SIZE);   /* Init code and data */
        __init_begin = .;
 -      .init.text : {
 -              _sinittext = .;
 -              INIT_TEXT
 -              _einittext = .;
 -      }
 +
 +      INIT_TEXT_SECTION(PAGE_SIZE)
 +
        /*
         * .exit.text is discarded at runtime, not link time,
         * to deal with references from __bug_table
  
        /* early.c uses stsi, which requires page aligned data. */
        . = ALIGN(PAGE_SIZE);
 -      .init.data : {
 -              INIT_DATA
 -      }
 -      . = ALIGN(0x100);
 -      .init.setup : {
 -              __setup_start = .;
 -              *(.init.setup)
 -              __setup_end = .;
 -      }
 -      .initcall.init : {
 -              __initcall_start = .;
 -              INITCALLS
 -              __initcall_end = .;
 -      }
 -
 -      .con_initcall.init : {
 -              __con_initcall_start = .;
 -              *(.con_initcall.init)
 -              __con_initcall_end = .;
 -      }
 -      SECURITY_INIT
 -
 -#ifdef CONFIG_BLK_DEV_INITRD
 -      . = ALIGN(0x100);
 -      .init.ramfs : {
 -              __initramfs_start = .;
 -              *(.init.ramfs)
 -              . = ALIGN(2);
 -              __initramfs_end = .;
 -      }
 -#endif
 +      INIT_DATA_SECTION(0x100)
  
        PERCPU(PAGE_SIZE)
        . = ALIGN(PAGE_SIZE);
        __init_end = .;         /* freed after init ends here */
  
 -      /* BSS */
 -      .bss : {
 -              __bss_start = .;
 -              *(.bss)
 -              . = ALIGN(2);
 -              __bss_stop = .;
 -      }
 +      BSS_SECTION(0, 2, 0)
  
        _end = . ;
  
-       /* Sections to be discarded */
-       /DISCARD/ : {
-               EXIT_DATA
-               *(.exitcall.exit)
-       }
        /* Debugging sections.  */
        STABS_DEBUG
        DWARF_DEBUG
+       /* Sections to be discarded */
+       DISCARDS
  }
diff --combined arch/sparc/Kconfig
@@@ -25,9 -25,6 +25,9 @@@ config SPAR
        select ARCH_WANT_OPTIONAL_GPIOLIB
        select RTC_CLASS
        select RTC_DRV_M48T59
 +      select HAVE_PERF_COUNTERS
 +      select HAVE_DMA_ATTRS
 +      select HAVE_DMA_API_DEBUG
  
  config SPARC32
        def_bool !64BIT
@@@ -47,7 -44,6 +47,7 @@@ config SPARC6
        select RTC_DRV_BQ4802
        select RTC_DRV_SUN4V
        select RTC_DRV_STARFIRE
 +      select HAVE_PERF_COUNTERS
  
  config ARCH_DEFCONFIG
        string
@@@ -99,7 -95,7 +99,7 @@@ config AUDIT_ARC
  config HAVE_SETUP_PER_CPU_AREA
        def_bool y if SPARC64
  
- config HAVE_DYNAMIC_PER_CPU_AREA
+ config NEED_PER_CPU_EMBED_FIRST_CHUNK
        def_bool y if SPARC64
  
  config GENERIC_HARDIRQS_NO__DO_IRQ
@@@ -441,17 -437,6 +441,17 @@@ config SERIAL_CONSOL
  
          If unsure, say N.
  
 +config SPARC_LEON
 +      bool "Sparc Leon processor family"
 +      depends on SPARC32
 +      ---help---
 +        Say Y here if you are running on a SPARC-LEON processor.
 +        The LEON processor is a synthesizable VHDL model of the
 +        SPARC-v8 standard. LEON is  part of the GRLIB collection of
 +        IP cores that are distributed under GPL. GRLIB can be downloaded
 +        from www.gaisler.com. You can download a sparc-linux cross-compilation
 +        toolchain at www.gaisler.com.
 +
  endmenu
  
  menu "Bus options (PCI etc.)"
diff --combined arch/x86/Kconfig
@@@ -38,7 -38,7 +38,7 @@@ config X8
        select HAVE_FUNCTION_GRAPH_FP_TEST
        select HAVE_FUNCTION_TRACE_MCOUNT_TEST
        select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE
 -      select HAVE_FTRACE_SYSCALLS
 +      select HAVE_SYSCALL_TRACEPOINTS
        select HAVE_KVM
        select HAVE_ARCH_KGDB
        select HAVE_ARCH_TRACEHOOK
@@@ -150,7 -150,10 +150,10 @@@ config ARCH_HAS_CACHE_LINE_SIZ
  config HAVE_SETUP_PER_CPU_AREA
        def_bool y
  
- config HAVE_DYNAMIC_PER_CPU_AREA
+ config NEED_PER_CPU_EMBED_FIRST_CHUNK
+       def_bool y
+ config NEED_PER_CPU_PAGE_FIRST_CHUNK
        def_bool y
  
  config HAVE_CPUMASK_OF_CPU_MAP
@@@ -179,10 -182,6 +182,10 @@@ config ARCH_SUPPORTS_OPTIMIZED_INLININ
  config ARCH_SUPPORTS_DEBUG_PAGEALLOC
        def_bool y
  
 +config HAVE_INTEL_TXT
 +      def_bool y
 +      depends on EXPERIMENTAL && DMAR && ACPI
 +
  # Use the generic interrupt handling code in kernel/irq/:
  config GENERIC_HARDIRQS
        bool
@@@ -590,6 -589,7 +593,6 @@@ config GART_IOMM
        bool "GART IOMMU support" if EMBEDDED
        default y
        select SWIOTLB
 -      select AGP
        depends on X86_64 && PCI
        ---help---
          Support for full DMA access of devices with 32bit memory access only
@@@ -1417,10 -1417,6 +1420,10 @@@ config X86_PA
  
          If unsure, say Y.
  
 +config ARCH_USES_PG_UNCACHED
 +      def_bool y
 +      depends on X86_PAT
 +
  config EFI
        bool "EFI runtime service support"
        depends on ACPI
@@@ -49,7 -49,7 +49,7 @@@
  #define __percpu_arg(x)               "%%"__stringify(__percpu_seg)":%P" #x
  #define __my_cpu_offset               percpu_read(this_cpu_off)
  #else
 -#define __percpu_arg(x)               "%" #x
 +#define __percpu_arg(x)               "%P" #x
  #endif
  
  /*
@@@ -104,48 -104,36 +104,48 @@@ do {                                                    
        }                                               \
  } while (0)
  
 -#define percpu_from_op(op, var)                               \
 +#define percpu_from_op(op, var, constraint)           \
  ({                                                    \
        typeof(var) ret__;                              \
        switch (sizeof(var)) {                          \
        case 1:                                         \
                asm(op "b "__percpu_arg(1)",%0"         \
                    : "=q" (ret__)                      \
 -                  : "m" (var));                       \
 +                  : constraint);                      \
                break;                                  \
        case 2:                                         \
                asm(op "w "__percpu_arg(1)",%0"         \
                    : "=r" (ret__)                      \
 -                  : "m" (var));                       \
 +                  : constraint);                      \
                break;                                  \
        case 4:                                         \
                asm(op "l "__percpu_arg(1)",%0"         \
                    : "=r" (ret__)                      \
 -                  : "m" (var));                       \
 +                  : constraint);                      \
                break;                                  \
        case 8:                                         \
                asm(op "q "__percpu_arg(1)",%0"         \
                    : "=r" (ret__)                      \
 -                  : "m" (var));                       \
 +                  : constraint);                      \
                break;                                  \
        default: __bad_percpu_size();                   \
        }                                               \
        ret__;                                          \
  })
  
 -#define percpu_read(var)      percpu_from_op("mov", per_cpu__##var)
 +/*
 + * percpu_read() makes gcc load the percpu variable every time it is
 + * accessed while percpu_read_stable() allows the value to be cached.
 + * percpu_read_stable() is more efficient and can be used if its value
 + * is guaranteed to be valid across cpus.  The current users include
 + * get_current() and get_thread_info() both of which are actually
 + * per-thread variables implemented as per-cpu variables and thus
 + * stable for the duration of the respective task.
 + */
 +#define percpu_read(var)      percpu_from_op("mov", per_cpu__##var,   \
 +                                             "m" (per_cpu__##var))
 +#define percpu_read_stable(var)       percpu_from_op("mov", per_cpu__##var,   \
 +                                             "p" (&per_cpu__##var))
  #define percpu_write(var, val)        percpu_to_op("mov", per_cpu__##var, val)
  #define percpu_add(var, val)  percpu_to_op("add", per_cpu__##var, val)
  #define percpu_sub(var, val)  percpu_to_op("sub", per_cpu__##var, val)
  /* We can use this directly for local CPU (faster). */
  DECLARE_PER_CPU(unsigned long, this_cpu_off);
  
- #ifdef CONFIG_NEED_MULTIPLE_NODES
- void *pcpu_lpage_remapped(void *kaddr);
- #else
- static inline void *pcpu_lpage_remapped(void *kaddr)
- {
-       return NULL;
- }
- #endif
  #endif /* !__ASSEMBLY__ */
  
  #ifdef CONFIG_SMP
@@@ -183,11 -183,6 +183,11 @@@ void mce_log(struct mce *mce
        set_bit(0, &mce_need_notify);
  }
  
 +void __weak decode_mce(struct mce *m)
 +{
 +      return;
 +}
 +
  static void print_mce(struct mce *m)
  {
        printk(KERN_EMERG
        printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
                        m->cpuvendor, m->cpuid, m->time, m->socketid,
                        m->apicid);
 +
 +      decode_mce(m);
  }
  
  static void print_mce_head(void)
  static void print_mce_tail(void)
  {
        printk(KERN_EMERG "This is not a software problem!\n"
 -             "Run through mcelog --ascii to decode and contact your hardware vendor\n");
 +#if (!defined(CONFIG_EDAC) || !defined(CONFIG_CPU_SUP_AMD))
 +             "Run through mcelog --ascii to decode and contact your hardware vendor\n"
 +#endif
 +             );
  }
  
  #define PANIC_TIMEOUT 5 /* 5 seconds */
@@@ -1101,7 -1091,7 +1101,7 @@@ void mce_log_therm_throt_event(__u64 st
   */
  static int check_interval = 5 * 60; /* 5 minutes */
  
- static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
+ static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
  static DEFINE_PER_CPU(struct timer_list, mce_timer);
  
  static void mcheck_timer(unsigned long data)
         * Alert userspace if needed.  If we logged an MCE, reduce the
         * polling interval, otherwise increase the polling interval.
         */
-       n = &__get_cpu_var(next_interval);
+       n = &__get_cpu_var(mce_next_interval);
        if (mce_notify_irq())
                *n = max(*n/2, HZ/100);
        else
@@@ -1236,13 -1226,8 +1236,13 @@@ static void mce_init(void
  }
  
  /* Add per CPU specific workarounds here */
 -static void mce_cpu_quirks(struct cpuinfo_x86 *c)
 +static int mce_cpu_quirks(struct cpuinfo_x86 *c)
  {
 +      if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
 +              pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
 +              return -EOPNOTSUPP;
 +      }
 +
        /* This should be disabled by the BIOS, but isn't always */
        if (c->x86_vendor == X86_VENDOR_AMD) {
                if (c->x86 == 15 && banks > 4) {
                if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
                        monarch_timeout < 0)
                        monarch_timeout = USEC_PER_SEC;
 +
 +              /*
 +               * There are also broken BIOSes on some Pentium M and
 +               * earlier systems:
 +               */
 +              if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0)
 +                      mce_bootlog = 0;
        }
        if (monarch_timeout < 0)
                monarch_timeout = 0;
        if (mce_bootlog != 0)
                mce_panic_timeout = 30;
 +
 +      return 0;
  }
  
  static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
@@@ -1335,7 -1311,7 +1335,7 @@@ static void mce_cpu_features(struct cpu
  static void mce_init_timer(void)
  {
        struct timer_list *t = &__get_cpu_var(mce_timer);
-       int *n = &__get_cpu_var(next_interval);
+       int *n = &__get_cpu_var(mce_next_interval);
  
        if (mce_ignore_ce)
                return;
@@@ -1362,10 -1338,11 +1362,10 @@@ void __cpuinit mcheck_init(struct cpuin
        if (!mce_available(c))
                return;
  
 -      if (mce_cap_init() < 0) {
 +      if (mce_cap_init() < 0 || mce_cpu_quirks(c) < 0) {
                mce_disabled = 1;
                return;
        }
 -      mce_cpu_quirks(c);
  
        machine_check_vector = do_machine_check;
  
@@@ -1935,7 -1912,7 +1935,7 @@@ mce_cpu_callback(struct notifier_block 
        case CPU_DOWN_FAILED:
        case CPU_DOWN_FAILED_FROZEN:
                t->expires = round_jiffies(jiffies +
-                                               __get_cpu_var(next_interval));
+                                          __get_cpu_var(mce_next_interval));
                add_timer_on(t, cpu);
                smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
                break;
@@@ -69,7 -69,7 +69,7 @@@ struct threshold_bank 
        struct threshold_block  *blocks;
        cpumask_var_t           cpus;
  };
- static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]);
+ static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks);
  
  #ifdef CONFIG_SMP
  static unsigned char shared_bank[NR_BANKS] = {
@@@ -489,14 -489,12 +489,14 @@@ static __cpuinit int threshold_create_b
        int i, err = 0;
        struct threshold_bank *b = NULL;
        char name[32];
 +      struct cpuinfo_x86 *c = &cpu_data(cpu);
 +
  
        sprintf(name, "threshold_bank%i", bank);
  
  #ifdef CONFIG_SMP
        if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) {   /* symlink */
 -              i = cpumask_first(cpu_core_mask(cpu));
 +              i = cpumask_first(c->llc_shared_map);
  
                /* first core not up yet */
                if (cpu_data(i).cpu_core_id)
                if (err)
                        goto out;
  
 -              cpumask_copy(b->cpus, cpu_core_mask(cpu));
 +              cpumask_copy(b->cpus, c->llc_shared_map);
                per_cpu(threshold_banks, cpu)[bank] = b;
  
                goto out;
  #ifndef CONFIG_SMP
        cpumask_setall(b->cpus);
  #else
 -      cpumask_copy(b->cpus, cpu_core_mask(cpu));
 +      cpumask_copy(b->cpus, c->llc_shared_map);
  #endif
  
        per_cpu(threshold_banks, cpu)[bank] = b;
@@@ -6,7 -6,6 +6,7 @@@
   *  Copyright (C) 2009 Jaswinder Singh Rajput
   *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
   *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 + *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
   *
   *  For licencing details see kernel-base/COPYING
   */
@@@ -21,7 -20,6 +21,7 @@@
  #include <linux/sched.h>
  #include <linux/uaccess.h>
  #include <linux/highmem.h>
 +#include <linux/cpu.h>
  
  #include <asm/apic.h>
  #include <asm/stacktrace.h>
  
  static u64 perf_counter_mask __read_mostly;
  
 +/* The maximal number of PEBS counters: */
 +#define MAX_PEBS_COUNTERS     4
 +
 +/* The size of a BTS record in bytes: */
 +#define BTS_RECORD_SIZE               24
 +
 +/* The size of a per-cpu BTS buffer in bytes: */
 +#define BTS_BUFFER_SIZE               (BTS_RECORD_SIZE * 1024)
 +
 +/* The BTS overflow threshold in bytes from the end of the buffer: */
 +#define BTS_OVFL_TH           (BTS_RECORD_SIZE * 64)
 +
 +
 +/*
 + * Bits in the debugctlmsr controlling branch tracing.
 + */
 +#define X86_DEBUGCTL_TR                       (1 << 6)
 +#define X86_DEBUGCTL_BTS              (1 << 7)
 +#define X86_DEBUGCTL_BTINT            (1 << 8)
 +#define X86_DEBUGCTL_BTS_OFF_OS               (1 << 9)
 +#define X86_DEBUGCTL_BTS_OFF_USR      (1 << 10)
 +
 +/*
 + * A debug store configuration.
 + *
 + * We only support architectures that use 64bit fields.
 + */
 +struct debug_store {
 +      u64     bts_buffer_base;
 +      u64     bts_index;
 +      u64     bts_absolute_maximum;
 +      u64     bts_interrupt_threshold;
 +      u64     pebs_buffer_base;
 +      u64     pebs_index;
 +      u64     pebs_absolute_maximum;
 +      u64     pebs_interrupt_threshold;
 +      u64     pebs_counter_reset[MAX_PEBS_COUNTERS];
 +};
 +
  struct cpu_hw_counters {
        struct perf_counter     *counters[X86_PMC_IDX_MAX];
        unsigned long           used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
        unsigned long           active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
        unsigned long           interrupts;
        int                     enabled;
 +      struct debug_store      *ds;
  };
  
  /*
@@@ -100,8 -58,6 +100,8 @@@ struct x86_pmu 
        int             apic;
        u64             max_period;
        u64             intel_ctrl;
 +      void            (*enable_bts)(u64 config);
 +      void            (*disable_bts)(void);
  };
  
  static struct x86_pmu x86_pmu __read_mostly;
@@@ -621,9 -577,6 +621,9 @@@ x86_perf_counter_update(struct perf_cou
        u64 prev_raw_count, new_raw_count;
        s64 delta;
  
 +      if (idx == X86_PMC_IDX_FIXED_BTS)
 +              return 0;
 +
        /*
         * Careful: an NMI might modify the previous counter value.
         *
@@@ -713,110 -666,10 +713,110 @@@ static void release_pmc_hardware(void
  #endif
  }
  
 +static inline bool bts_available(void)
 +{
 +      return x86_pmu.enable_bts != NULL;
 +}
 +
 +static inline void init_debug_store_on_cpu(int cpu)
 +{
 +      struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds;
 +
 +      if (!ds)
 +              return;
 +
 +      wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
 +                   (u32)((u64)(unsigned long)ds),
 +                   (u32)((u64)(unsigned long)ds >> 32));
 +}
 +
 +static inline void fini_debug_store_on_cpu(int cpu)
 +{
 +      if (!per_cpu(cpu_hw_counters, cpu).ds)
 +              return;
 +
 +      wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
 +}
 +
 +static void release_bts_hardware(void)
 +{
 +      int cpu;
 +
 +      if (!bts_available())
 +              return;
 +
 +      get_online_cpus();
 +
 +      for_each_online_cpu(cpu)
 +              fini_debug_store_on_cpu(cpu);
 +
 +      for_each_possible_cpu(cpu) {
 +              struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds;
 +
 +              if (!ds)
 +                      continue;
 +
 +              per_cpu(cpu_hw_counters, cpu).ds = NULL;
 +
 +              kfree((void *)(unsigned long)ds->bts_buffer_base);
 +              kfree(ds);
 +      }
 +
 +      put_online_cpus();
 +}
 +
 +static int reserve_bts_hardware(void)
 +{
 +      int cpu, err = 0;
 +
 +      if (!bts_available())
 +              return 0;
 +
 +      get_online_cpus();
 +
 +      for_each_possible_cpu(cpu) {
 +              struct debug_store *ds;
 +              void *buffer;
 +
 +              err = -ENOMEM;
 +              buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
 +              if (unlikely(!buffer))
 +                      break;
 +
 +              ds = kzalloc(sizeof(*ds), GFP_KERNEL);
 +              if (unlikely(!ds)) {
 +                      kfree(buffer);
 +                      break;
 +              }
 +
 +              ds->bts_buffer_base = (u64)(unsigned long)buffer;
 +              ds->bts_index = ds->bts_buffer_base;
 +              ds->bts_absolute_maximum =
 +                      ds->bts_buffer_base + BTS_BUFFER_SIZE;
 +              ds->bts_interrupt_threshold =
 +                      ds->bts_absolute_maximum - BTS_OVFL_TH;
 +
 +              per_cpu(cpu_hw_counters, cpu).ds = ds;
 +              err = 0;
 +      }
 +
 +      if (err)
 +              release_bts_hardware();
 +      else {
 +              for_each_online_cpu(cpu)
 +                      init_debug_store_on_cpu(cpu);
 +      }
 +
 +      put_online_cpus();
 +
 +      return err;
 +}
 +
  static void hw_perf_counter_destroy(struct perf_counter *counter)
  {
        if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) {
                release_pmc_hardware();
 +              release_bts_hardware();
                mutex_unlock(&pmc_reserve_mutex);
        }
  }
@@@ -859,42 -712,6 +859,42 @@@ set_ext_hw_attr(struct hw_perf_counter 
        return 0;
  }
  
 +static void intel_pmu_enable_bts(u64 config)
 +{
 +      unsigned long debugctlmsr;
 +
 +      debugctlmsr = get_debugctlmsr();
 +
 +      debugctlmsr |= X86_DEBUGCTL_TR;
 +      debugctlmsr |= X86_DEBUGCTL_BTS;
 +      debugctlmsr |= X86_DEBUGCTL_BTINT;
 +
 +      if (!(config & ARCH_PERFMON_EVENTSEL_OS))
 +              debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
 +
 +      if (!(config & ARCH_PERFMON_EVENTSEL_USR))
 +              debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
 +
 +      update_debugctlmsr(debugctlmsr);
 +}
 +
 +static void intel_pmu_disable_bts(void)
 +{
 +      struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 +      unsigned long debugctlmsr;
 +
 +      if (!cpuc->ds)
 +              return;
 +
 +      debugctlmsr = get_debugctlmsr();
 +
 +      debugctlmsr &=
 +              ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
 +                X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
 +
 +      update_debugctlmsr(debugctlmsr);
 +}
 +
  /*
   * Setup the hardware configuration for a given attr_type
   */
@@@ -911,13 -728,9 +911,13 @@@ static int __hw_perf_counter_init(struc
        err = 0;
        if (!atomic_inc_not_zero(&active_counters)) {
                mutex_lock(&pmc_reserve_mutex);
 -              if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware())
 -                      err = -EBUSY;
 -              else
 +              if (atomic_read(&active_counters) == 0) {
 +                      if (!reserve_pmc_hardware())
 +                              err = -EBUSY;
 +                      else
 +                              err = reserve_bts_hardware();
 +              }
 +              if (!err)
                        atomic_inc(&active_counters);
                mutex_unlock(&pmc_reserve_mutex);
        }
        if (config == -1LL)
                return -EINVAL;
  
 +      /*
 +       * Branch tracing:
 +       */
 +      if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
 +          (hwc->sample_period == 1)) {
 +              /* BTS is not supported by this architecture. */
 +              if (!bts_available())
 +                      return -EOPNOTSUPP;
 +
 +              /* BTS is currently only allowed for user-mode. */
 +              if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
 +                      return -EOPNOTSUPP;
 +      }
 +
        hwc->config |= config;
  
        return 0;
@@@ -1018,18 -817,7 +1018,18 @@@ static void p6_pmu_disable_all(void
  
  static void intel_pmu_disable_all(void)
  {
 +      struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 +
 +      if (!cpuc->enabled)
 +              return;
 +
 +      cpuc->enabled = 0;
 +      barrier();
 +
        wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
 +
 +      if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
 +              intel_pmu_disable_bts();
  }
  
  static void amd_pmu_disable_all(void)
@@@ -1087,25 -875,7 +1087,25 @@@ static void p6_pmu_enable_all(void
  
  static void intel_pmu_enable_all(void)
  {
 +      struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 +
 +      if (cpuc->enabled)
 +              return;
 +
 +      cpuc->enabled = 1;
 +      barrier();
 +
        wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
 +
 +      if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
 +              struct perf_counter *counter =
 +                      cpuc->counters[X86_PMC_IDX_FIXED_BTS];
 +
 +              if (WARN_ON_ONCE(!counter))
 +                      return;
 +
 +              intel_pmu_enable_bts(counter->hw.config);
 +      }
  }
  
  static void amd_pmu_enable_all(void)
@@@ -1192,11 -962,6 +1192,11 @@@ p6_pmu_disable_counter(struct hw_perf_c
  static inline void
  intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
  {
 +      if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
 +              intel_pmu_disable_bts();
 +              return;
 +      }
 +
        if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
                intel_pmu_disable_fixed(hwc, idx);
                return;
@@@ -1211,7 -976,7 +1211,7 @@@ amd_pmu_disable_counter(struct hw_perf_
        x86_pmu_disable_counter(hwc, idx);
  }
  
- static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
+ static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
  
  /*
   * Set the next IRQ period, based on the hwc->period_left value.
@@@ -1225,9 -990,6 +1225,9 @@@ x86_perf_counter_set_period(struct perf
        s64 period = hwc->sample_period;
        int err, ret = 0;
  
 +      if (idx == X86_PMC_IDX_FIXED_BTS)
 +              return 0;
 +
        /*
         * If we are way outside a reasonable range then just skip forward:
         */
        if (left > x86_pmu.max_period)
                left = x86_pmu.max_period;
  
-       per_cpu(prev_left[idx], smp_processor_id()) = left;
+       per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
  
        /*
         * The hw counter starts counting from this counter offset,
@@@ -1310,14 -1072,6 +1310,14 @@@ static void p6_pmu_enable_counter(struc
  
  static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
  {
 +      if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
 +              if (!__get_cpu_var(cpu_hw_counters).enabled)
 +                      return;
 +
 +              intel_pmu_enable_bts(hwc->config);
 +              return;
 +      }
 +
        if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
                intel_pmu_enable_fixed(hwc, idx);
                return;
@@@ -1339,16 -1093,11 +1339,16 @@@ fixed_mode_idx(struct perf_counter *cou
  {
        unsigned int event;
  
 +      event = hwc->config & ARCH_PERFMON_EVENT_MASK;
 +
 +      if (unlikely((event ==
 +                    x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) &&
 +                   (hwc->sample_period == 1)))
 +              return X86_PMC_IDX_FIXED_BTS;
 +
        if (!x86_pmu.num_counters_fixed)
                return -1;
  
 -      event = hwc->config & ARCH_PERFMON_EVENT_MASK;
 -
        if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
                return X86_PMC_IDX_FIXED_INSTRUCTIONS;
        if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
@@@ -1369,15 -1118,7 +1369,15 @@@ static int x86_pmu_enable(struct perf_c
        int idx;
  
        idx = fixed_mode_idx(counter, hwc);
 -      if (idx >= 0) {
 +      if (idx == X86_PMC_IDX_FIXED_BTS) {
 +              /* BTS is already occupied. */
 +              if (test_and_set_bit(idx, cpuc->used_mask))
 +                      return -EAGAIN;
 +
 +              hwc->config_base        = 0;
 +              hwc->counter_base       = 0;
 +              hwc->idx                = idx;
 +      } else if (idx >= 0) {
                /*
                 * Try to get the fixed counter, if that is already taken
                 * then try to get a generic counter:
@@@ -1470,7 -1211,7 +1470,7 @@@ void perf_counter_print_debug(void
                rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
                rdmsrl(x86_pmu.perfctr  + idx, pmc_count);
  
-               prev_left = per_cpu(prev_left[idx], cpu);
+               prev_left = per_cpu(pmc_prev_left[idx], cpu);
  
                pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
                        cpu, idx, pmc_ctrl);
        local_irq_restore(flags);
  }
  
 +static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc,
 +                                     struct perf_sample_data *data)
 +{
 +      struct debug_store *ds = cpuc->ds;
 +      struct bts_record {
 +              u64     from;
 +              u64     to;
 +              u64     flags;
 +      };
 +      struct perf_counter *counter = cpuc->counters[X86_PMC_IDX_FIXED_BTS];
 +      unsigned long orig_ip = data->regs->ip;
 +      struct bts_record *at, *top;
 +
 +      if (!counter)
 +              return;
 +
 +      if (!ds)
 +              return;
 +
 +      at  = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
 +      top = (struct bts_record *)(unsigned long)ds->bts_index;
 +
 +      ds->bts_index = ds->bts_buffer_base;
 +
 +      for (; at < top; at++) {
 +              data->regs->ip  = at->from;
 +              data->addr      = at->to;
 +
 +              perf_counter_output(counter, 1, data);
 +      }
 +
 +      data->regs->ip  = orig_ip;
 +      data->addr      = 0;
 +
 +      /* There's new data available. */
 +      counter->pending_kill = POLL_IN;
 +}
 +
  static void x86_pmu_disable(struct perf_counter *counter)
  {
        struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
         * that we are disabling:
         */
        x86_perf_counter_update(counter, hwc, idx);
 +
 +      /* Drain the remaining BTS records. */
 +      if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
 +              struct perf_sample_data data;
 +              struct pt_regs regs;
 +
 +              data.regs = &regs;
 +              intel_pmu_drain_bts_buffer(cpuc, &data);
 +      }
        cpuc->counters[idx] = NULL;
        clear_bit(idx, cpuc->used_mask);
  
@@@ -1586,7 -1280,6 +1586,7 @@@ static int intel_pmu_save_and_restart(s
  
  static void intel_pmu_reset(void)
  {
 +      struct debug_store *ds = __get_cpu_var(cpu_hw_counters).ds;
        unsigned long flags;
        int idx;
  
        for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
                checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
        }
 +      if (ds)
 +              ds->bts_index = ds->bts_buffer_base;
  
        local_irq_restore(flags);
  }
@@@ -1671,7 -1362,6 +1671,7 @@@ static int intel_pmu_handle_irq(struct 
        cpuc = &__get_cpu_var(cpu_hw_counters);
  
        perf_disable();
 +      intel_pmu_drain_bts_buffer(cpuc, &data);
        status = intel_pmu_get_status();
        if (!status) {
                perf_enable();
@@@ -1881,8 -1571,6 +1881,8 @@@ static struct x86_pmu intel_pmu = 
         * the generic counter period:
         */
        .max_period             = (1ULL << 31) - 1,
 +      .enable_bts             = intel_pmu_enable_bts,
 +      .disable_bts            = intel_pmu_disable_bts,
  };
  
  static struct x86_pmu amd_pmu = {
@@@ -2110,8 -1798,8 +2110,8 @@@ void callchain_store(struct perf_callch
                entry->ip[entry->nr++] = ip;
  }
  
- static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
- static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);
+ static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
+ static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
  static DEFINE_PER_CPU(int, in_nmi_frame);
  
  
@@@ -2264,9 -1952,9 +2264,9 @@@ struct perf_callchain_entry *perf_callc
        struct perf_callchain_entry *entry;
  
        if (in_nmi())
-               entry = &__get_cpu_var(nmi_entry);
+               entry = &__get_cpu_var(pmc_nmi_entry);
        else
-               entry = &__get_cpu_var(irq_entry);
+               entry = &__get_cpu_var(pmc_irq_entry);
  
        entry->nr = 0;
  
  
        return entry;
  }
 +
 +void hw_perf_counter_setup_online(int cpu)
 +{
 +      init_debug_store_on_cpu(cpu);
 +}
@@@ -46,10 -46,11 +46,10 @@@ PHDRS 
        data PT_LOAD FLAGS(7);          /* RWE */
  #ifdef CONFIG_X86_64
        user PT_LOAD FLAGS(7);          /* RWE */
 -      data.init PT_LOAD FLAGS(7);     /* RWE */
  #ifdef CONFIG_SMP
        percpu PT_LOAD FLAGS(7);        /* RWE */
  #endif
 -      data.init2 PT_LOAD FLAGS(7);    /* RWE */
 +      init PT_LOAD FLAGS(7);          /* RWE */
  #endif
        note PT_NOTE FLAGS(0);          /* ___ */
  }
@@@ -102,43 -103,65 +102,43 @@@ SECTION
                __stop___ex_table = .;
        } :text = 0x9090
  
 -      RODATA
 +      RO_DATA(PAGE_SIZE)
  
        /* Data */
 -      . = ALIGN(PAGE_SIZE);
        .data : AT(ADDR(.data) - LOAD_OFFSET) {
                /* Start of data section */
                _sdata = .;
 -              DATA_DATA
 -              CONSTRUCTORS
 -      } :data
 +
 +              /* init_task */
 +              INIT_TASK_DATA(THREAD_SIZE)
  
  #ifdef CONFIG_X86_32
 -      /* 32 bit has nosave before _edata */
 -      . = ALIGN(PAGE_SIZE);
 -      .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
 -              __nosave_begin = .;
 -              *(.data.nosave)
 -              . = ALIGN(PAGE_SIZE);
 -              __nosave_end = .;
 -      }
 +              /* 32 bit has nosave before _edata */
 +              NOSAVE_DATA
  #endif
  
 -      . = ALIGN(PAGE_SIZE);
 -      .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
 -              *(.data.page_aligned)
 +              PAGE_ALIGNED_DATA(PAGE_SIZE)
                *(.data.idt)
 -      }
  
 -#ifdef CONFIG_X86_32
 -      . = ALIGN(32);
 -#else
 -      . = ALIGN(PAGE_SIZE);
 -      . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
 -#endif
 -      .data.cacheline_aligned :
 -              AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
 -              *(.data.cacheline_aligned)
 -      }
 +              CACHELINE_ALIGNED_DATA(CONFIG_X86_L1_CACHE_BYTES)
  
 -      /* rarely changed data like cpu maps */
 -#ifdef CONFIG_X86_32
 -      . = ALIGN(32);
 -#else
 -      . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
 -#endif
 -      .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
 -              *(.data.read_mostly)
 +              DATA_DATA
 +              CONSTRUCTORS
 +
 +              /* rarely changed data like cpu maps */
 +              READ_MOSTLY_DATA(CONFIG_X86_INTERNODE_CACHE_BYTES)
  
                /* End of data section */
                _edata = .;
 -      }
 +      } :data
  
  #ifdef CONFIG_X86_64
  
  #define VSYSCALL_ADDR (-10*1024*1024)
 -#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \
 -                            SIZEOF(.data.read_mostly) + 4095) & ~(4095))
 -#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + \
 -                            SIZEOF(.data.read_mostly) + 4095) & ~(4095))
 +#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data) + SIZEOF(.data) + \
 +                            PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
 +#define VSYSCALL_VIRT_ADDR ((ADDR(.data) + SIZEOF(.data) + \
 +                            PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
  
  #define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
  #define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
  
  #endif /* CONFIG_X86_64 */
  
 -      /* init_task */
 -      . = ALIGN(THREAD_SIZE);
 -      .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
 -              *(.data.init_task)
 +      /* Init code and data - will be freed after init */
 +      . = ALIGN(PAGE_SIZE);
 +      .init.begin : AT(ADDR(.init.begin) - LOAD_OFFSET) {
 +              __init_begin = .; /* paired with __init_end */
        }
 -#ifdef CONFIG_X86_64
 -       :data.init
 -#endif
  
 +#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
        /*
 -       * smp_locks might be freed after init
 -       * start/end must be page aligned
 +       * percpu offsets are zero-based on SMP.  PERCPU_VADDR() changes the
 +       * output PHDR, so the next output section - .init.text - should
 +       * start another segment - init.
         */
 -      . = ALIGN(PAGE_SIZE);
 -      .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
 -              __smp_locks = .;
 -              *(.smp_locks)
 -              __smp_locks_end = .;
 -              . = ALIGN(PAGE_SIZE);
 -      }
 +      PERCPU_VADDR(0, :percpu)
 +#endif
  
 -      /* Init code and data - will be freed after init */
 -      . = ALIGN(PAGE_SIZE);
        .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
 -              __init_begin = .; /* paired with __init_end */
                _sinittext = .;
                INIT_TEXT
                _einittext = .;
        }
 +#ifdef CONFIG_X86_64
 +      :init
 +#endif
  
        .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
                INIT_DATA
        }
  #endif
  
 -#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
 -      /*
 -       * percpu offsets are zero-based on SMP.  PERCPU_VADDR() changes the
 -       * output PHDR, so the next output section - __data_nosave - should
 -       * start another section data.init2.  Also, pda should be at the head of
 -       * percpu area.  Preallocate it and define the percpu offset symbol
 -       * so that it can be accessed as a percpu variable.
 -       */
 -      . = ALIGN(PAGE_SIZE);
 -      PERCPU_VADDR(0, :percpu)
 -#else
 +#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP)
        PERCPU(PAGE_SIZE)
  #endif
  
                __init_end = .;
        }
  
 +      /*
 +       * smp_locks might be freed after init
 +       * start/end must be page aligned
 +       */
 +      . = ALIGN(PAGE_SIZE);
 +      .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
 +              __smp_locks = .;
 +              *(.smp_locks)
 +              __smp_locks_end = .;
 +              . = ALIGN(PAGE_SIZE);
 +      }
 +
  #ifdef CONFIG_X86_64
        .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
 -              . = ALIGN(PAGE_SIZE);
 -              __nosave_begin = .;
 -              *(.data.nosave)
 -              . = ALIGN(PAGE_SIZE);
 -              __nosave_end = .;
 -      } :data.init2
 -      /* use another section data.init2, see PERCPU_VADDR() above */
 +              NOSAVE_DATA
 +      }
  #endif
  
        /* BSS */
                _end = .;
        }
  
-       /* Sections to be discarded */
-       /DISCARD/ : {
-               *(.exitcall.exit)
-               *(.eh_frame)
-               *(.discard)
-       }
          STABS_DEBUG
          DWARF_DEBUG
+       /* Sections to be discarded */
+       DISCARDS
+       /DISCARD/ : { *(.eh_frame) }
  }
  
  
diff --combined arch/x86/mm/pageattr.c
@@@ -12,6 -12,7 +12,7 @@@
  #include <linux/seq_file.h>
  #include <linux/debugfs.h>
  #include <linux/pfn.h>
+ #include <linux/percpu.h>
  
  #include <asm/e820.h>
  #include <asm/processor.h>
@@@ -686,7 -687,7 +687,7 @@@ static int cpa_process_alias(struct cpa
  {
        struct cpa_data alias_cpa;
        unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT);
-       unsigned long vaddr, remapped;
+       unsigned long vaddr;
        int ret;
  
        if (cpa->pfn >= max_pfn_mapped)
        }
  #endif
  
-       /*
-        * If the PMD page was partially used for per-cpu remapping,
-        * the recycled area needs to be split and modified.  Because
-        * the area is always proper subset of a PMD page
-        * cpa->numpages is guaranteed to be 1 for these areas, so
-        * there's no need to loop over and check for further remaps.
-        */
-       remapped = (unsigned long)pcpu_lpage_remapped((void *)laddr);
-       if (remapped) {
-               WARN_ON(cpa->numpages > 1);
-               alias_cpa = *cpa;
-               alias_cpa.vaddr = &remapped;
-               alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
-               ret = __change_page_attr_set_clr(&alias_cpa, 0);
-               if (ret)
-                       return ret;
-       }
        return 0;
  }
  
@@@ -822,7 -805,6 +805,7 @@@ static int change_page_attr_set_clr(uns
  {
        struct cpa_data cpa;
        int ret, cache, checkalias;
 +      unsigned long baddr = 0;
  
        /*
         * Check, if we are requested to change a not supported
                         */
                        WARN_ON_ONCE(1);
                }
 +              /*
 +               * Save address for cache flush. *addr is modified in the call
 +               * to __change_page_attr_set_clr() below.
 +               */
 +              baddr = *addr;
        }
  
        /* Must avoid aliasing mappings in the highmem code */
                        cpa_flush_array(addr, numpages, cache,
                                        cpa.flags, pages);
                } else
 -                      cpa_flush_range(*addr, numpages, cache);
 +                      cpa_flush_range(baddr, numpages, cache);
        } else
                cpa_flush_all(cache);
  
diff --combined block/cfq-iosched.c
@@@ -48,7 -48,7 +48,7 @@@ static int cfq_slice_idle = HZ / 125
  static struct kmem_cache *cfq_pool;
  static struct kmem_cache *cfq_ioc_pool;
  
- static DEFINE_PER_CPU(unsigned long, ioc_count);
+ static DEFINE_PER_CPU(unsigned long, cfq_ioc_count);
  static struct completion *ioc_gone;
  static DEFINE_SPINLOCK(ioc_gone_lock);
  
@@@ -134,8 -134,13 +134,8 @@@ struct cfq_data 
        struct rb_root prio_trees[CFQ_PRIO_LISTS];
  
        unsigned int busy_queues;
 -      /*
 -       * Used to track any pending rt requests so we can pre-empt current
 -       * non-RT cfqq in service when this value is non-zero.
 -       */
 -      unsigned int busy_rt_queues;
  
 -      int rq_in_driver;
 +      int rq_in_driver[2];
        int sync_flight;
  
        /*
@@@ -186,6 -191,7 +186,6 @@@ enum cfqq_state_flags 
        CFQ_CFQQ_FLAG_on_rr = 0,        /* on round-robin busy list */
        CFQ_CFQQ_FLAG_wait_request,     /* waiting for a request */
        CFQ_CFQQ_FLAG_must_dispatch,    /* must be allowed a dispatch */
 -      CFQ_CFQQ_FLAG_must_alloc,       /* must be allowed rq alloc */
        CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */
        CFQ_CFQQ_FLAG_fifo_expire,      /* FIFO checked in this slice */
        CFQ_CFQQ_FLAG_idle_window,      /* slice idling enabled */
@@@ -212,6 -218,7 +212,6 @@@ static inline int cfq_cfqq_##name(cons
  CFQ_CFQQ_FNS(on_rr);
  CFQ_CFQQ_FNS(wait_request);
  CFQ_CFQQ_FNS(must_dispatch);
 -CFQ_CFQQ_FNS(must_alloc);
  CFQ_CFQQ_FNS(must_alloc_slice);
  CFQ_CFQQ_FNS(fifo_expire);
  CFQ_CFQQ_FNS(idle_window);
@@@ -232,11 -239,6 +232,11 @@@ static struct cfq_queue *cfq_get_queue(
  static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *,
                                                struct io_context *);
  
 +static inline int rq_in_driver(struct cfq_data *cfqd)
 +{
 +      return cfqd->rq_in_driver[0] + cfqd->rq_in_driver[1];
 +}
 +
  static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic,
                                            int is_sync)
  {
@@@ -255,7 -257,7 +255,7 @@@ static inline void cic_set_cfqq(struct 
   */
  static inline int cfq_bio_sync(struct bio *bio)
  {
 -      if (bio_data_dir(bio) == READ || bio_sync(bio))
 +      if (bio_data_dir(bio) == READ || bio_rw_flagged(bio, BIO_RW_SYNCIO))
                return 1;
  
        return 0;
@@@ -646,6 -648,8 +646,6 @@@ static void cfq_add_cfqq_rr(struct cfq_
        BUG_ON(cfq_cfqq_on_rr(cfqq));
        cfq_mark_cfqq_on_rr(cfqq);
        cfqd->busy_queues++;
 -      if (cfq_class_rt(cfqq))
 -              cfqd->busy_rt_queues++;
  
        cfq_resort_rr_list(cfqd, cfqq);
  }
@@@ -669,6 -673,8 +669,6 @@@ static void cfq_del_cfqq_rr(struct cfq_
  
        BUG_ON(!cfqd->busy_queues);
        cfqd->busy_queues--;
 -      if (cfq_class_rt(cfqq))
 -              cfqd->busy_rt_queues--;
  }
  
  /*
@@@ -754,9 -760,9 +754,9 @@@ static void cfq_activate_request(struc
  {
        struct cfq_data *cfqd = q->elevator->elevator_data;
  
 -      cfqd->rq_in_driver++;
 +      cfqd->rq_in_driver[rq_is_sync(rq)]++;
        cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
 -                                              cfqd->rq_in_driver);
 +                                              rq_in_driver(cfqd));
  
        cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
  }
  static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
  {
        struct cfq_data *cfqd = q->elevator->elevator_data;
 +      const int sync = rq_is_sync(rq);
  
 -      WARN_ON(!cfqd->rq_in_driver);
 -      cfqd->rq_in_driver--;
 +      WARN_ON(!cfqd->rq_in_driver[sync]);
 +      cfqd->rq_in_driver[sync]--;
        cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
 -                                              cfqd->rq_in_driver);
 +                                              rq_in_driver(cfqd));
  }
  
  static void cfq_remove_request(struct request *rq)
@@@ -1075,7 -1080,7 +1075,7 @@@ static void cfq_arm_slice_timer(struct 
        /*
         * still requests with the driver, don't idle
         */
 -      if (cfqd->rq_in_driver)
 +      if (rq_in_driver(cfqd))
                return;
  
        /*
@@@ -1110,7 -1115,6 +1110,7 @@@ static void cfq_dispatch_insert(struct 
  
        cfq_log_cfqq(cfqd, cfqq, "dispatch_insert");
  
 +      cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);
        cfq_remove_request(rq);
        cfqq->dispatched++;
        elv_dispatch_sort(q, rq);
@@@ -1175,6 -1179,20 +1175,6 @@@ static struct cfq_queue *cfq_select_que
                goto expire;
  
        /*
 -       * If we have a RT cfqq waiting, then we pre-empt the current non-rt
 -       * cfqq.
 -       */
 -      if (!cfq_class_rt(cfqq) && cfqd->busy_rt_queues) {
 -              /*
 -               * We simulate this as cfqq timed out so that it gets to bank
 -               * the remaining of its time slice.
 -               */
 -              cfq_log_cfqq(cfqd, cfqq, "preempt");
 -              cfq_slice_expired(cfqd, 1);
 -              goto new_queue;
 -      }
 -
 -      /*
         * The active queue has requests and isn't expired, allow it to
         * dispatch.
         */
@@@ -1294,12 -1312,6 +1294,12 @@@ static int cfq_dispatch_requests(struc
                return 0;
  
        /*
 +       * Drain async requests before we start sync IO
 +       */
 +      if (cfq_cfqq_idle_window(cfqq) && cfqd->rq_in_driver[BLK_RW_ASYNC])
 +              return 0;
 +
 +      /*
         * If this is an async queue and we have sync IO in flight, let it wait
         */
        if (cfqd->sync_flight && !cfq_cfqq_sync(cfqq))
                cfq_slice_expired(cfqd, 0);
        }
  
 -      cfq_log(cfqd, "dispatched a request");
 +      cfq_log_cfqq(cfqd, cfqq, "dispatched a request");
        return 1;
  }
  
@@@ -1415,7 -1427,7 +1415,7 @@@ static void cfq_cic_free_rcu(struct rcu
        cic = container_of(head, struct cfq_io_context, rcu_head);
  
        kmem_cache_free(cfq_ioc_pool, cic);
-       elv_ioc_count_dec(ioc_count);
+       elv_ioc_count_dec(cfq_ioc_count);
  
        if (ioc_gone) {
                /*
                 * complete ioc_gone and set it back to NULL
                 */
                spin_lock(&ioc_gone_lock);
-               if (ioc_gone && !elv_ioc_count_read(ioc_count)) {
+               if (ioc_gone && !elv_ioc_count_read(cfq_ioc_count)) {
                        complete(ioc_gone);
                        ioc_gone = NULL;
                }
@@@ -1550,7 -1562,7 +1550,7 @@@ cfq_alloc_io_context(struct cfq_data *c
                INIT_HLIST_NODE(&cic->cic_list);
                cic->dtor = cfq_free_io_context;
                cic->exit = cfq_exit_io_context;
-               elv_ioc_count_inc(ioc_count);
+               elv_ioc_count_inc(cfq_ioc_count);
        }
  
        return cic;
@@@ -2118,11 -2130,11 +2118,11 @@@ static void cfq_insert_request(struct r
   */
  static void cfq_update_hw_tag(struct cfq_data *cfqd)
  {
 -      if (cfqd->rq_in_driver > cfqd->rq_in_driver_peak)
 -              cfqd->rq_in_driver_peak = cfqd->rq_in_driver;
 +      if (rq_in_driver(cfqd) > cfqd->rq_in_driver_peak)
 +              cfqd->rq_in_driver_peak = rq_in_driver(cfqd);
  
        if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
 -          cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)
 +          rq_in_driver(cfqd) <= CFQ_HW_QUEUE_MIN)
                return;
  
        if (cfqd->hw_tag_samples++ < 50)
@@@ -2149,9 -2161,9 +2149,9 @@@ static void cfq_completed_request(struc
  
        cfq_update_hw_tag(cfqd);
  
 -      WARN_ON(!cfqd->rq_in_driver);
 +      WARN_ON(!cfqd->rq_in_driver[sync]);
        WARN_ON(!cfqq->dispatched);
 -      cfqd->rq_in_driver--;
 +      cfqd->rq_in_driver[sync]--;
        cfqq->dispatched--;
  
        if (cfq_cfqq_sync(cfqq))
                        cfq_arm_slice_timer(cfqd);
        }
  
 -      if (!cfqd->rq_in_driver)
 +      if (!rq_in_driver(cfqd))
                cfq_schedule_dispatch(cfqd);
  }
  
@@@ -2217,7 -2229,8 +2217,7 @@@ static void cfq_prio_boost(struct cfq_q
  
  static inline int __cfq_may_queue(struct cfq_queue *cfqq)
  {
 -      if ((cfq_cfqq_wait_request(cfqq) || cfq_cfqq_must_alloc(cfqq)) &&
 -          !cfq_cfqq_must_alloc_slice(cfqq)) {
 +      if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) {
                cfq_mark_cfqq_must_alloc_slice(cfqq);
                return ELV_MQUEUE_MUST;
        }
@@@ -2304,6 -2317,7 +2304,6 @@@ cfq_set_request(struct request_queue *q
        }
  
        cfqq->allocated[rw]++;
 -      cfq_clear_cfqq_must_alloc(cfqq);
        atomic_inc(&cfqq->ref);
  
        spin_unlock_irqrestore(q->queue_lock, flags);
@@@ -2654,7 -2668,7 +2654,7 @@@ static void __exit cfq_exit(void
         * this also protects us from entering cfq_slab_kill() with
         * pending RCU callbacks
         */
-       if (elv_ioc_count_read(ioc_count))
+       if (elv_ioc_count_read(cfq_ioc_count))
                wait_for_completion(&all_gone);
        cfq_slab_kill();
  }
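
The cfq-iosched.c changes above split the single rq_in_driver counter into a
two-element array indexed by the request's sync flag, so the dispatcher can
look at the async count on its own ("Drain async requests before we start
sync IO") while rq_in_driver() still reports the combined driver depth. The
standalone sketch below illustrates only that accounting pattern; it is not
kernel code, and the BLK_RW_ASYNC/BLK_RW_SYNC values of 0 and 1 are assumed
from the block layer headers.

	#include <assert.h>

	enum { RW_ASYNC = 0, RW_SYNC = 1 };	/* assumed BLK_RW_* values */

	struct drv { int rq_in_driver[2]; };	/* stand-in, not struct cfq_data */

	static int rq_in_driver_total(const struct drv *d)
	{
		return d->rq_in_driver[RW_ASYNC] + d->rq_in_driver[RW_SYNC];
	}

	int main(void)
	{
		struct drv d = { { 0, 0 } };

		d.rq_in_driver[RW_SYNC]++;	/* activate a sync request  */
		d.rq_in_driver[RW_ASYNC]++;	/* activate an async request */

		/* the "drain async first" test looks only at the async slot */
		assert(d.rq_in_driver[RW_ASYNC] == 1);
		/* idling and hw_tag heuristics look at the combined depth */
		assert(rq_in_driver_total(&d) == 2);

		d.rq_in_driver[RW_ASYNC]--;	/* complete the async request */
		d.rq_in_driver[RW_SYNC]--;	/* complete the sync request  */
		return rq_in_driver_total(&d);	/* 0 */
	}
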
  /*
   * Base implementations of per-CPU variable declarations and definitions, where
   * the section in which the variable is to be placed is provided by the
-  * 'section' argument.  This may be used to affect the parameters governing the
+  * 'sec' argument.  This may be used to affect the parameters governing the
   * variable's storage.
   *
   * NOTE!  The sections for the DECLARE and for the DEFINE must match, lest
   * linkage errors occur due to the compiler generating the wrong code to access
   * that section.
   */
- #define DECLARE_PER_CPU_SECTION(type, name, section)                  \
-       extern                                                          \
-       __attribute__((__section__(PER_CPU_BASE_SECTION section)))      \
-       PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name
- #define DEFINE_PER_CPU_SECTION(type, name, section)                   \
-       __attribute__((__section__(PER_CPU_BASE_SECTION section)))      \
-       PER_CPU_ATTRIBUTES PER_CPU_DEF_ATTRIBUTES                       \
+ #define __PCPU_ATTRS(sec)                                             \
+       __attribute__((section(PER_CPU_BASE_SECTION sec)))              \
+       PER_CPU_ATTRIBUTES
+ #define __PCPU_DUMMY_ATTRS                                            \
+       __attribute__((section(".discard"), unused))
+ /*
+  * s390 and alpha modules require percpu variables to be defined as
+  * weak to force the compiler to generate GOT based external
+  * references for them.  This is necessary because percpu sections
+  * will be located outside of the usually addressable area.
+  *
+  * This definition imposes the following two extra restrictions on
+  * percpu variable definitions.
+  *
+  * 1. The symbol must be globally unique, even for static variables.
+  * 2. Static percpu variables cannot be defined inside a function.
+  *
+  * Archs which need weak percpu definitions should define
+  * ARCH_NEEDS_WEAK_PER_CPU in asm/percpu.h when necessary.
+  *
+  * To ensure that the generic code observes the above two
+  * restrictions, the weak definition is used for all cases when
+  * CONFIG_DEBUG_FORCE_WEAK_PER_CPU is set.
+  */
+ #if defined(ARCH_NEEDS_WEAK_PER_CPU) || defined(CONFIG_DEBUG_FORCE_WEAK_PER_CPU)
+ /*
+  * __pcpu_scope_* dummy variable is used to enforce scope.  It
+  * receives the static modifier when it's used in front of
+  * DEFINE_PER_CPU() and will trigger build failure if
+  * DECLARE_PER_CPU() is used for the same variable.
+  *
+  * __pcpu_unique_* dummy variable is used to enforce symbol uniqueness
+  * such that hidden weak symbol collisions, which would cause unrelated
+  * variables to share the same address, can be detected during build.
+  */
+ #define DECLARE_PER_CPU_SECTION(type, name, sec)                      \
+       extern __PCPU_DUMMY_ATTRS char __pcpu_scope_##name;             \
+       extern __PCPU_ATTRS(sec) __typeof__(type) per_cpu__##name
+ #define DEFINE_PER_CPU_SECTION(type, name, sec)                               \
+       __PCPU_DUMMY_ATTRS char __pcpu_scope_##name;                    \
+       __PCPU_DUMMY_ATTRS char __pcpu_unique_##name;                   \
+       __PCPU_ATTRS(sec) PER_CPU_DEF_ATTRIBUTES __weak                 \
+       __typeof__(type) per_cpu__##name
+ #else
+ /*
+  * Normal declaration and definition macros.
+  */
+ #define DECLARE_PER_CPU_SECTION(type, name, sec)                      \
+       extern __PCPU_ATTRS(sec) __typeof__(type) per_cpu__##name
+ #define DEFINE_PER_CPU_SECTION(type, name, sec)                               \
+       __PCPU_ATTRS(sec) PER_CPU_DEF_ATTRIBUTES                        \
        __typeof__(type) per_cpu__##name
+ #endif
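
To see why the strong __pcpu_unique_* dummies matter: weak symbols with the
same name are silently merged by the linker, so two unrelated per-cpu
variables that happened to share a name would end up at the same address.
Pairing each weak definition with a strong dummy of a derived name turns that
silent merge into a hard link failure. The two-file sketch below demonstrates
the idea with plain GCC attributes; it is a simplification of the macros
above (no .discard section, none of the PER_CPU_* attributes) and the symbol
names are illustrative only.

	/* a.c */
	char __pcpu_unique_foo;			/* strong dummy, one per definition */
	int per_cpu__foo __attribute__((weak));	/* the real, weak per-cpu variable  */

	/* b.c -- an unrelated translation unit reusing the same name */
	char __pcpu_unique_foo;			/* duplicate strong symbol          */
	int per_cpu__foo __attribute__((weak));	/* would otherwise merge silently   */

	/* linking a.o and b.o together now fails with
	 * "multiple definition of '__pcpu_unique_foo'" instead of silently
	 * aliasing the two per_cpu__foo variables. */
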
  
  /*
   * Variant on the per-CPU variable declaration/definition theme used for
        DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \
        ____cacheline_aligned_in_smp
  
 +#define DECLARE_PER_CPU_ALIGNED(type, name)                           \
 +      DECLARE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION)    \
 +      ____cacheline_aligned
 +
 +#define DEFINE_PER_CPU_ALIGNED(type, name)                            \
 +      DEFINE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION)     \
 +      ____cacheline_aligned
 +
  /*
   * Declaration/definition used for per-CPU variables that must be page aligned.
   */
 -#define DECLARE_PER_CPU_PAGE_ALIGNED(type, name)                              \
 -      DECLARE_PER_CPU_SECTION(type, name, ".page_aligned")
 +#define DECLARE_PER_CPU_PAGE_ALIGNED(type, name)                      \
 +      DECLARE_PER_CPU_SECTION(type, name, ".page_aligned")            \
 +      __aligned(PAGE_SIZE)
  
  #define DEFINE_PER_CPU_PAGE_ALIGNED(type, name)                               \
 -      DEFINE_PER_CPU_SECTION(type, name, ".page_aligned")
 +      DEFINE_PER_CPU_SECTION(type, name, ".page_aligned")             \
 +      __aligned(PAGE_SIZE)
  
  /*
   * Intermodule exports for per-CPU variables.
diff --combined init/main.c
@@@ -353,7 -353,6 +353,6 @@@ static void __init smp_init(void
  #define smp_init()    do { } while (0)
  #endif
  
- static inline void setup_per_cpu_areas(void) { }
  static inline void setup_nr_cpu_ids(void) { }
  static inline void smp_prepare_cpus(unsigned int maxcpus) { }
  
@@@ -374,29 -373,6 +373,6 @@@ static void __init setup_nr_cpu_ids(voi
        nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1;
  }
  
- #ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
- unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
- EXPORT_SYMBOL(__per_cpu_offset);
- static void __init setup_per_cpu_areas(void)
- {
-       unsigned long size, i;
-       char *ptr;
-       unsigned long nr_possible_cpus = num_possible_cpus();
-       /* Copy section for each CPU (we discard the original) */
-       size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
-       ptr = alloc_bootmem_pages(size * nr_possible_cpus);
-       for_each_possible_cpu(i) {
-               __per_cpu_offset[i] = ptr - __per_cpu_start;
-               memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
-               ptr += size;
-       }
- }
- #endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
  /* Called by boot processor to activate the rest. */
  static void __init smp_init(void)
  {
@@@ -451,7 -427,6 +427,7 @@@ static noinline void __init_refok rest_
  {
        int pid;
  
 +      rcu_scheduler_starting();
        kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND);
        numa_default_policy();
        pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
         * at least once to get things moving:
         */
        init_idle_bootup_task(current);
 -      rcu_scheduler_starting();
        preempt_enable_no_resched();
        schedule();
        preempt_disable();
@@@ -631,6 -607,7 +607,6 @@@ asmlinkage void __init start_kernel(voi
        softirq_init();
        timekeeping_init();
        time_init();
 -      sched_clock_init();
        profile_init();
        if (!irqs_disabled())
                printk(KERN_CRIT "start_kernel(): bug: interrupts were "
        numa_policy_init();
        if (late_time_init)
                late_time_init();
 +      sched_clock_init();
        calibrate_delay();
        pidmap_init();
        anon_vma_init();
@@@ -733,14 -709,13 +709,14 @@@ static void __init do_ctors(void
  int initcall_debug;
  core_param(initcall_debug, initcall_debug, bool, 0644);
  
 +static char msgbuf[64];
 +static struct boot_trace_call call;
 +static struct boot_trace_ret ret;
 +
  int do_one_initcall(initcall_t fn)
  {
        int count = preempt_count();
        ktime_t calltime, delta, rettime;
 -      char msgbuf[64];
 -      struct boot_trace_call call;
 -      struct boot_trace_ret ret;
  
        if (initcall_debug) {
                call.caller = task_pid_nr(current);
diff --combined kernel/module.c
  #include <linux/percpu.h>
  #include <linux/kmemleak.h>
  
 +#define CREATE_TRACE_POINTS
 +#include <trace/events/module.h>
 +
 +EXPORT_TRACEPOINT_SYMBOL(module_get);
 +
  #if 0
  #define DEBUGP printk
  #else
@@@ -369,7 -364,7 +369,7 @@@ EXPORT_SYMBOL_GPL(find_module)
  
  #ifdef CONFIG_SMP
  
- #ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
+ #ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
  
  static void *percpu_modalloc(unsigned long size, unsigned long align,
                             const char *name)
@@@ -394,7 -389,7 +394,7 @@@ static void percpu_modfree(void *freeme
        free_percpu(freeme);
  }
  
- #else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
+ #else /* ... CONFIG_HAVE_LEGACY_PER_CPU_AREA */
  
  /* Number of blocks used and allocated. */
  static unsigned int pcpu_num_used, pcpu_num_allocated;
@@@ -540,7 -535,7 +540,7 @@@ static int percpu_modinit(void
  }
  __initcall(percpu_modinit);
  
- #endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
+ #endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */
  
  static unsigned int find_pcpusec(Elf_Ehdr *hdr,
                                 Elf_Shdr *sechdrs,
@@@ -914,18 -909,16 +914,18 @@@ void __symbol_put(const char *symbol
  }
  EXPORT_SYMBOL(__symbol_put);
  
 +/* Note this assumes addr is a function, which it currently always is. */
  void symbol_put_addr(void *addr)
  {
        struct module *modaddr;
 +      unsigned long a = (unsigned long)dereference_function_descriptor(addr);
  
 -      if (core_kernel_text((unsigned long)addr))
 +      if (core_kernel_text(a))
                return;
  
        /* module_text_address is safe here: we're supposed to have reference
         * to module from symbol_get, so it can't go away. */
 -      modaddr = __module_text_address((unsigned long)addr);
 +      modaddr = __module_text_address(a);
        BUG_ON(!modaddr);
        module_put(modaddr);
  }
@@@ -947,8 -940,6 +947,8 @@@ void module_put(struct module *module
        if (module) {
                unsigned int cpu = get_cpu();
                local_dec(__module_ref_addr(module, cpu));
 +              trace_module_put(module, _RET_IP_,
 +                               local_read(__module_ref_addr(module, cpu)));
                /* Maybe they're waiting for us to drop reference? */
                if (unlikely(!module_is_live(module)))
                        wake_up_process(module->waiter);
@@@ -1281,10 -1272,6 +1281,10 @@@ static void add_notes_attrs(struct modu
        struct module_notes_attrs *notes_attrs;
        struct bin_attribute *nattr;
  
 +      /* failed to create section attributes, so can't create notes */
 +      if (!mod->sect_attrs)
 +              return;
 +
        /* Count notes sections and allocate structures.  */
        notes = 0;
        for (i = 0; i < nsect; i++)
@@@ -1504,8 -1491,6 +1504,8 @@@ static int __unlink_module(void *_mod
  /* Free a module, remove from lists, etc (must hold module_mutex). */
  static void free_module(struct module *mod)
  {
 +      trace_module_free(mod);
 +
        /* Delete from various lists */
        stop_machine(__unlink_module, mod, NULL);
        remove_notes_attrs(mod);
@@@ -2373,8 -2358,6 +2373,8 @@@ static noinline struct module *load_mod
        /* Get rid of temporary copy */
        vfree(hdr);
  
 +      trace_module_load(mod);
 +
        /* Done! */
        return mod;
  
diff --combined kernel/perf_counter.c
@@@ -46,17 -46,11 +46,17 @@@ static atomic_t nr_task_counters __read
  
  /*
   * perf counter paranoia level:
 - *  0 - not paranoid
 - *  1 - disallow cpu counters to unpriv
 - *  2 - disallow kernel profiling to unpriv
 + *  -1 - not paranoid at all
 + *   0 - disallow raw tracepoint access for unpriv
 + *   1 - disallow cpu counters for unpriv
 + *   2 - disallow kernel profiling for unpriv
   */
 -int sysctl_perf_counter_paranoid __read_mostly;
 +int sysctl_perf_counter_paranoid __read_mostly = 1;
 +
 +static inline bool perf_paranoid_tracepoint_raw(void)
 +{
 +      return sysctl_perf_counter_paranoid > -1;
 +}
  
  static inline bool perf_paranoid_cpu(void)
  {
@@@ -106,16 -100,16 +106,16 @@@ hw_perf_group_sched_in(struct perf_coun
  
  void __weak perf_counter_print_debug(void)    { }
  
- static DEFINE_PER_CPU(int, disable_count);
+ static DEFINE_PER_CPU(int, perf_disable_count);
  
  void __perf_disable(void)
  {
-       __get_cpu_var(disable_count)++;
+       __get_cpu_var(perf_disable_count)++;
  }
  
  bool __perf_enable(void)
  {
-       return !--__get_cpu_var(disable_count);
+       return !--__get_cpu_var(perf_disable_count);
  }
  
  void perf_disable(void)
@@@ -475,8 -469,7 +475,8 @@@ static void update_counter_times(struc
        struct perf_counter_context *ctx = counter->ctx;
        u64 run_end;
  
 -      if (counter->state < PERF_COUNTER_STATE_INACTIVE)
 +      if (counter->state < PERF_COUNTER_STATE_INACTIVE ||
 +          counter->group_leader->state < PERF_COUNTER_STATE_INACTIVE)
                return;
  
        counter->total_time_enabled = ctx->time - counter->tstamp_enabled;
@@@ -525,7 -518,7 +525,7 @@@ static void __perf_counter_disable(voi
         */
        if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
                update_context_time(ctx);
 -              update_counter_times(counter);
 +              update_group_times(counter);
                if (counter == counter->group_leader)
                        group_sched_out(counter, cpuctx, ctx);
                else
@@@ -580,7 -573,7 +580,7 @@@ static void perf_counter_disable(struc
         * in, so we can change the state safely.
         */
        if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
 -              update_counter_times(counter);
 +              update_group_times(counter);
                counter->state = PERF_COUNTER_STATE_OFF;
        }
  
@@@ -858,27 -851,6 +858,27 @@@ retry
  }
  
  /*
 + * Put a counter into inactive state and update time fields.
 + * Enabling the leader of a group effectively enables all
 + * the group members that aren't explicitly disabled, so we
 + * have to update their ->tstamp_enabled also.
 + * Note: this works for group members as well as group leaders
 + * since the non-leader members' sibling_lists will be empty.
 + */
 +static void __perf_counter_mark_enabled(struct perf_counter *counter,
 +                                      struct perf_counter_context *ctx)
 +{
 +      struct perf_counter *sub;
 +
 +      counter->state = PERF_COUNTER_STATE_INACTIVE;
 +      counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
 +      list_for_each_entry(sub, &counter->sibling_list, list_entry)
 +              if (sub->state >= PERF_COUNTER_STATE_INACTIVE)
 +                      sub->tstamp_enabled =
 +                              ctx->time - sub->total_time_enabled;
 +}
 +
 +/*
   * Cross CPU call to enable a performance counter
   */
  static void __perf_counter_enable(void *info)
  
        if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
                goto unlock;
 -      counter->state = PERF_COUNTER_STATE_INACTIVE;
 -      counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
 +      __perf_counter_mark_enabled(counter, ctx);
  
        /*
         * If the counter is in a group and isn't the group leader,
@@@ -998,9 -971,11 +998,9 @@@ static void perf_counter_enable(struct 
         * Since we have the lock this context can't be scheduled
         * in, so we can change the state safely.
         */
 -      if (counter->state == PERF_COUNTER_STATE_OFF) {
 -              counter->state = PERF_COUNTER_STATE_INACTIVE;
 -              counter->tstamp_enabled =
 -                      ctx->time - counter->total_time_enabled;
 -      }
 +      if (counter->state == PERF_COUNTER_STATE_OFF)
 +              __perf_counter_mark_enabled(counter, ctx);
 +
   out:
        spin_unlock_irq(&ctx->lock);
  }
@@@ -1504,7 -1479,9 +1504,7 @@@ static void perf_counter_enable_on_exec
                counter->attr.enable_on_exec = 0;
                if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
                        continue;
 -              counter->state = PERF_COUNTER_STATE_INACTIVE;
 -              counter->tstamp_enabled =
 -                      ctx->time - counter->total_time_enabled;
 +              __perf_counter_mark_enabled(counter, ctx);
                enabled = 1;
        }
  
   */
  static void __perf_counter_read(void *info)
  {
 +      struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
        struct perf_counter *counter = info;
        struct perf_counter_context *ctx = counter->ctx;
        unsigned long flags;
  
 +      /*
 +       * If this is a task context, we need to check whether it is
 +       * the current task context of this cpu.  If not, it has been
 +       * scheduled out before the smp call arrived.  In that case
 +       * counter->count would have been updated to a recent sample
 +       * when the counter was scheduled out.
 +       */
 +      if (ctx->task && cpuctx->task_ctx != ctx)
 +              return;
 +
        local_irq_save(flags);
        if (ctx->is_active)
                update_context_time(ctx);
@@@ -1698,11 -1664,6 +1698,11 @@@ static void free_counter(struct perf_co
                        atomic_dec(&nr_task_counters);
        }
  
 +      if (counter->output) {
 +              fput(counter->output->filp);
 +              counter->output = NULL;
 +      }
 +
        if (counter->destroy)
                counter->destroy(counter);
  
@@@ -1819,7 -1780,7 +1819,7 @@@ static int perf_counter_read_group(stru
        size += err;
  
        list_for_each_entry(sub, &leader->sibling_list, list_entry) {
 -              err = perf_counter_read_entry(counter, read_format,
 +              err = perf_counter_read_entry(sub, read_format,
                                buf + size);
                if (err < 0)
                        return err;
@@@ -1988,8 -1949,6 +1988,8 @@@ unlock
        return ret;
  }
  
 +int perf_counter_set_output(struct perf_counter *counter, int output_fd);
 +
  static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
  {
        struct perf_counter *counter = file->private_data;
        case PERF_COUNTER_IOC_PERIOD:
                return perf_counter_period(counter, (u64 __user *)arg);
  
 +      case PERF_COUNTER_IOC_SET_OUTPUT:
 +              return perf_counter_set_output(counter, arg);
 +
        default:
                return -ENOTTY;
        }
@@@ -2052,10 -2008,6 +2052,10 @@@ int perf_counter_task_disable(void
        return 0;
  }
  
 +#ifndef PERF_COUNTER_INDEX_OFFSET
 +# define PERF_COUNTER_INDEX_OFFSET 0
 +#endif
 +
  static int perf_counter_index(struct perf_counter *counter)
  {
        if (counter->state != PERF_COUNTER_STATE_ACTIVE)
@@@ -2286,11 -2238,6 +2286,11 @@@ static int perf_mmap(struct file *file
  
        WARN_ON_ONCE(counter->ctx->parent_ctx);
        mutex_lock(&counter->mmap_mutex);
 +      if (counter->output) {
 +              ret = -EINVAL;
 +              goto unlock;
 +      }
 +
        if (atomic_inc_not_zero(&counter->mmap_count)) {
                if (nr_pages != counter->data->nr_pages)
                        ret = -EINVAL;
@@@ -2676,7 -2623,6 +2676,7 @@@ static int perf_output_begin(struct per
                             struct perf_counter *counter, unsigned int size,
                             int nmi, int sample)
  {
 +      struct perf_counter *output_counter;
        struct perf_mmap_data *data;
        unsigned int offset, head;
        int have_lost;
                u64                      lost;
        } lost_event;
  
 +      rcu_read_lock();
        /*
         * For inherited counters we send all the output towards the parent.
         */
        if (counter->parent)
                counter = counter->parent;
  
 -      rcu_read_lock();
 +      output_counter = rcu_dereference(counter->output);
 +      if (output_counter)
 +              counter = output_counter;
 +
        data = rcu_dereference(counter->data);
        if (!data)
                goto out;
@@@ -3977,7 -3919,6 +3977,7 @@@ static const struct pmu *tp_perf_counte
         * have these.
         */
        if ((counter->attr.sample_type & PERF_SAMPLE_RAW) &&
 +                      perf_paranoid_tracepoint_raw() &&
                        !capable(CAP_SYS_ADMIN))
                return ERR_PTR(-EPERM);
  
@@@ -4110,7 -4051,6 +4110,7 @@@ perf_counter_alloc(struct perf_counter_
        hwc->sample_period = attr->sample_period;
        if (attr->freq && attr->sample_freq)
                hwc->sample_period = 1;
 +      hwc->last_period = hwc->sample_period;
  
        atomic64_set(&hwc->period_left, hwc->sample_period);
  
@@@ -4215,7 -4155,6 +4215,7 @@@ static int perf_copy_attr(struct perf_c
                        if (val)
                                goto err_size;
                }
 +              size = sizeof(*attr);
        }
  
        ret = copy_from_user(attr, uattr, size);
@@@ -4247,57 -4186,6 +4247,57 @@@ err_size
        goto out;
  }
  
 +int perf_counter_set_output(struct perf_counter *counter, int output_fd)
 +{
 +      struct perf_counter *output_counter = NULL;
 +      struct file *output_file = NULL;
 +      struct perf_counter *old_output;
 +      int fput_needed = 0;
 +      int ret = -EINVAL;
 +
 +      if (!output_fd)
 +              goto set;
 +
 +      output_file = fget_light(output_fd, &fput_needed);
 +      if (!output_file)
 +              return -EBADF;
 +
 +      if (output_file->f_op != &perf_fops)
 +              goto out;
 +
 +      output_counter = output_file->private_data;
 +
 +      /* Don't chain output fds */
 +      if (output_counter->output)
 +              goto out;
 +
 +      /* Don't set an output fd when we already have an output channel */
 +      if (counter->data)
 +              goto out;
 +
 +      atomic_long_inc(&output_file->f_count);
 +
 +set:
 +      mutex_lock(&counter->mmap_mutex);
 +      old_output = counter->output;
 +      rcu_assign_pointer(counter->output, output_counter);
 +      mutex_unlock(&counter->mmap_mutex);
 +
 +      if (old_output) {
 +              /*
 +               * we need to make sure no existing perf_output_*()
 +               * is still referencing this counter.
 +               */
 +              synchronize_rcu();
 +              fput(old_output->filp);
 +      }
 +
 +      ret = 0;
 +out:
 +      fput_light(output_file, fput_needed);
 +      return ret;
 +}
 +
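
perf_counter_set_output() above lets one counter route its records into
another counter's ring buffer, either through the new
PERF_COUNTER_IOC_SET_OUTPUT ioctl or by passing PERF_FLAG_FD_OUTPUT to
sys_perf_counter_open() with the target fd in group_fd; chained redirection
is refused, and a counter that already owns an mmap()ed buffer cannot be
redirected. A rough userspace sketch follows. It assumes the two fds were
obtained from sys_perf_counter_open() (not shown) and that
PERF_COUNTER_IOC_SET_OUTPUT is visible via <linux/perf_counter.h>; treat it
as an illustration, not reference usage.

	#include <stddef.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <unistd.h>
	#include <linux/perf_counter.h>	/* PERF_COUNTER_IOC_SET_OUTPUT (assumed) */

	/* 'target' and 'source' are counter fds from sys_perf_counter_open() */
	static void *redirect_and_map(int target, int source, size_t data_pages)
	{
		long psz = sysconf(_SC_PAGESIZE);

		/* route source's output into target's buffer; 0 detaches again */
		if (ioctl(source, PERF_COUNTER_IOC_SET_OUTPUT, target) < 0)
			return MAP_FAILED;

		/* only the target may be mapped now; mmap() on 'source' yields EINVAL */
		return mmap(NULL, (data_pages + 1) * psz,
			    PROT_READ | PROT_WRITE, MAP_SHARED, target, 0);
	}
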
  /**
   * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
   *
@@@ -4317,15 -4205,15 +4317,15 @@@ SYSCALL_DEFINE5(perf_counter_open
        struct file *group_file = NULL;
        int fput_needed = 0;
        int fput_needed2 = 0;
 -      int ret;
 +      int err;
  
        /* for future expandability... */
 -      if (flags)
 +      if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
                return -EINVAL;
  
 -      ret = perf_copy_attr(attr_uptr, &attr);
 -      if (ret)
 -              return ret;
 +      err = perf_copy_attr(attr_uptr, &attr);
 +      if (err)
 +              return err;
  
        if (!attr.exclude_kernel) {
                if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
         * Look up the group leader (we will attach this counter to it):
         */
        group_leader = NULL;
 -      if (group_fd != -1) {
 -              ret = -EINVAL;
 +      if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
 +              err = -EINVAL;
                group_file = fget_light(group_fd, &fput_needed);
                if (!group_file)
                        goto err_put_context;
  
        counter = perf_counter_alloc(&attr, cpu, ctx, group_leader,
                                     NULL, GFP_KERNEL);
 -      ret = PTR_ERR(counter);
 +      err = PTR_ERR(counter);
        if (IS_ERR(counter))
                goto err_put_context;
  
 -      ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
 -      if (ret < 0)
 +      err = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
 +      if (err < 0)
                goto err_free_put_context;
  
 -      counter_file = fget_light(ret, &fput_needed2);
 +      counter_file = fget_light(err, &fput_needed2);
        if (!counter_file)
                goto err_free_put_context;
  
 +      if (flags & PERF_FLAG_FD_OUTPUT) {
 +              err = perf_counter_set_output(counter, group_fd);
 +              if (err)
 +                      goto err_fput_free_put_context;
 +      }
 +
        counter->filp = counter_file;
        WARN_ON_ONCE(ctx->parent_ctx);
        mutex_lock(&ctx->mutex);
        list_add_tail(&counter->owner_entry, &current->perf_counter_list);
        mutex_unlock(&current->perf_counter_mutex);
  
 +err_fput_free_put_context:
        fput_light(counter_file, fput_needed2);
  
 -out_fput:
 -      fput_light(group_file, fput_needed);
 -
 -      return ret;
 -
  err_free_put_context:
 -      kfree(counter);
 +      if (err < 0)
 +              kfree(counter);
  
  err_put_context:
 -      put_ctx(ctx);
 +      if (err < 0)
 +              put_ctx(ctx);
 +
 +      fput_light(group_file, fput_needed);
  
 -      goto out_fput;
 +      return err;
  }
  
  /*
diff --combined kernel/sched.c
@@@ -64,6 -64,7 +64,6 @@@
  #include <linux/tsacct_kern.h>
  #include <linux/kprobes.h>
  #include <linux/delayacct.h>
 -#include <linux/reciprocal_div.h>
  #include <linux/unistd.h>
  #include <linux/pagemap.h>
  #include <linux/hrtimer.h>
   */
  #define RUNTIME_INF   ((u64)~0ULL)
  
 -#ifdef CONFIG_SMP
 -
  static void double_rq_lock(struct rq *rq1, struct rq *rq2);
  
 -/*
 - * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
 - * Since cpu_power is a 'constant', we can use a reciprocal divide.
 - */
 -static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
 -{
 -      return reciprocal_divide(load, sg->reciprocal_cpu_power);
 -}
 -
 -/*
 - * Each time a sched group cpu_power is changed,
 - * we must compute its reciprocal value
 - */
 -static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
 -{
 -      sg->__cpu_power += val;
 -      sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
 -}
 -#endif
 -
  static inline int rt_policy(int policy)
  {
        if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
@@@ -286,8 -309,8 +286,8 @@@ void set_tg_uid(struct user_struct *use
  
  /*
   * Root task group.
 - *    Every UID task group (including init_task_group aka UID-0) will
 - *    be a child to this group.
 + *    Every UID task group (including init_task_group aka UID-0) will
 + *    be a child to this group.
   */
  struct task_group root_task_group;
  
  /* Default task group's sched entity on each cpu */
  static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
  /* Default task group's cfs_rq on each cpu */
- static DEFINE_PER_CPU(struct cfs_rq, init_tg_cfs_rq) ____cacheline_aligned_in_smp;
 -static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_cfs_rq);
++static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
  #endif /* CONFIG_FAIR_GROUP_SCHED */
  
  #ifdef CONFIG_RT_GROUP_SCHED
  static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
- static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
+ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq);
  #endif /* CONFIG_RT_GROUP_SCHED */
  #else /* !CONFIG_USER_SCHED */
  #define root_task_group init_task_group
@@@ -593,7 -616,6 +593,7 @@@ struct rq 
  
        unsigned char idle_at_tick;
        /* For active balancing */
 +      int post_schedule;
        int active_balance;
        int push_cpu;
        /* cpu of this runqueue: */
  
        struct task_struct *migration_thread;
        struct list_head migration_queue;
 +
 +      u64 rt_avg;
 +      u64 age_stamp;
  #endif
  
        /* calc_load related fields */
@@@ -674,7 -693,6 +674,7 @@@ static inline int cpu_of(struct rq *rq
  #define this_rq()             (&__get_cpu_var(runqueues))
  #define task_rq(p)            cpu_rq(task_cpu(p))
  #define cpu_curr(cpu)         (cpu_rq(cpu)->curr)
 +#define raw_rq()              (&__raw_get_cpu_var(runqueues))
  
  inline void update_rq_clock(struct rq *rq)
  {
@@@ -843,14 -861,6 +843,14 @@@ unsigned int sysctl_sched_shares_rateli
  unsigned int sysctl_sched_shares_thresh = 4;
  
  /*
 + * period over which we average the RT time consumption, measured
 + * in ms.
 + *
 + * default: 1s
 + */
 +const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
 +
 +/*
   * period over which we measure -rt task cpu usage in us.
   * default: 1s
   */
@@@ -1268,37 -1278,12 +1268,37 @@@ void wake_up_idle_cpu(int cpu
  }
  #endif /* CONFIG_NO_HZ */
  
 +static u64 sched_avg_period(void)
 +{
 +      return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
 +}
 +
 +static void sched_avg_update(struct rq *rq)
 +{
 +      s64 period = sched_avg_period();
 +
 +      while ((s64)(rq->clock - rq->age_stamp) > period) {
 +              rq->age_stamp += period;
 +              rq->rt_avg /= 2;
 +      }
 +}
 +
 +static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
 +{
 +      rq->rt_avg += rt_delta;
 +      sched_avg_update(rq);
 +}
 +
  #else /* !CONFIG_SMP */
  static void resched_task(struct task_struct *p)
  {
        assert_spin_locked(&task_rq(p)->lock);
        set_tsk_need_resched(p);
  }
 +
 +static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
 +{
 +}
  #endif /* CONFIG_SMP */
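
sched_rt_avg_update() above accumulates RT runtime into rq->rt_avg, and
sched_avg_update() halves that sum once per sched_avg_period() -- half of
sysctl_sched_time_avg, i.e. 500 ms at the default -- so rt_avg behaves as a
geometrically decaying average that scale_rt_power() can later turn into a
capacity factor. A small standalone sketch of the aging arithmetic, with
nanosecond units and the 500 ms half-period assumed from the default above:

	#include <stdint.h>
	#include <stdio.h>

	#define PERIOD_NS (500ULL * 1000 * 1000)	/* assumed sched_avg_period() */

	struct fake_rq { uint64_t clock, age_stamp, rt_avg; };

	static void avg_update(struct fake_rq *rq)
	{
		while ((int64_t)(rq->clock - rq->age_stamp) > (int64_t)PERIOD_NS) {
			rq->age_stamp += PERIOD_NS;
			rq->rt_avg /= 2;	/* halve once per elapsed period */
		}
	}

	int main(void)
	{
		struct fake_rq rq = { 0, 0, 0 };

		rq.rt_avg += 400ULL * 1000 * 1000;	/* 400 ms of RT execution seen */
		rq.clock = 2 * PERIOD_NS + 1;		/* look again one second later */
		avg_update(&rq);
		/* halved twice: prints 100000000 */
		printf("rt_avg now %llu ns\n", (unsigned long long)rq.rt_avg);
		return 0;
	}
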
  
  #if BITS_PER_LONG == 32
@@@ -1528,35 -1513,28 +1528,35 @@@ static unsigned long cpu_avg_load_per_t
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
  
 +struct update_shares_data {
 +      unsigned long rq_weight[NR_CPUS];
 +};
 +
 +static DEFINE_PER_CPU(struct update_shares_data, update_shares_data);
 +
  static void __set_se_shares(struct sched_entity *se, unsigned long shares);
  
  /*
   * Calculate and set the cpu's group shares.
   */
 -static void
 -update_group_shares_cpu(struct task_group *tg, int cpu,
 -                      unsigned long sd_shares, unsigned long sd_rq_weight)
 +static void update_group_shares_cpu(struct task_group *tg, int cpu,
 +                                  unsigned long sd_shares,
 +                                  unsigned long sd_rq_weight,
 +                                  struct update_shares_data *usd)
  {
 -      unsigned long shares;
 -      unsigned long rq_weight;
 -
 -      if (!tg->se[cpu])
 -              return;
 +      unsigned long shares, rq_weight;
 +      int boost = 0;
  
 -      rq_weight = tg->cfs_rq[cpu]->rq_weight;
 +      rq_weight = usd->rq_weight[cpu];
 +      if (!rq_weight) {
 +              boost = 1;
 +              rq_weight = NICE_0_LOAD;
 +      }
  
        /*
 -       *           \Sum shares * rq_weight
 -       * shares =  -----------------------
 -       *               \Sum rq_weight
 -       *
 +       *             \Sum_j shares_j * rq_weight_i
 +       * shares_i =  -----------------------------
 +       *                  \Sum_j rq_weight_j
         */
        shares = (sd_shares * rq_weight) / sd_rq_weight;
        shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
                unsigned long flags;
  
                spin_lock_irqsave(&rq->lock, flags);
 -              tg->cfs_rq[cpu]->shares = shares;
 -
 +              tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
 +              tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
                __set_se_shares(tg->se[cpu], shares);
                spin_unlock_irqrestore(&rq->lock, flags);
        }
   */
  static int tg_shares_up(struct task_group *tg, void *data)
  {
 -      unsigned long weight, rq_weight = 0;
 -      unsigned long shares = 0;
 +      unsigned long weight, rq_weight = 0, shares = 0;
 +      struct update_shares_data *usd;
        struct sched_domain *sd = data;
 +      unsigned long flags;
        int i;
  
 +      if (!tg->se[0])
 +              return 0;
 +
 +      local_irq_save(flags);
 +      usd = &__get_cpu_var(update_shares_data);
 +
        for_each_cpu(i, sched_domain_span(sd)) {
 +              weight = tg->cfs_rq[i]->load.weight;
 +              usd->rq_weight[i] = weight;
 +
                /*
                 * If there are currently no tasks on the cpu pretend there
                 * is one of average load so that when a new task gets to
                 * run here it will not get delayed by group starvation.
                 */
 -              weight = tg->cfs_rq[i]->load.weight;
                if (!weight)
                        weight = NICE_0_LOAD;
  
 -              tg->cfs_rq[i]->rq_weight = weight;
                rq_weight += weight;
                shares += tg->cfs_rq[i]->shares;
        }
                shares = tg->shares;
  
        for_each_cpu(i, sched_domain_span(sd))
 -              update_group_shares_cpu(tg, i, shares, rq_weight);
 +              update_group_shares_cpu(tg, i, shares, rq_weight, usd);
 +
 +      local_irq_restore(flags);
  
        return 0;
  }
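
The redistribution in update_group_shares_cpu()/tg_shares_up() above hands
each cpu a slice of the group's shares proportional to that cpu's runqueue
weight, shares_i = sd_shares * rq_weight_i / sd_rq_weight, which is then
clamped to the MIN_SHARES..MAX_SHARES range. A worked example (clamping
omitted, numbers chosen purely for illustration):

	#include <stdio.h>

	int main(void)
	{
		unsigned long sd_shares = 1024;			/* group's total shares    */
		unsigned long rq_weight[2] = { 3072, 1024 };	/* per-cpu runqueue weight */
		unsigned long sd_rq_weight = rq_weight[0] + rq_weight[1];
		int i;

		for (i = 0; i < 2; i++)
			printf("cpu%d shares = %lu\n", i,
			       sd_shares * rq_weight[i] / sd_rq_weight);
		/* prints 768 and 256: the cpu carrying 3/4 of the weight
		 * receives 3/4 of the group's shares */
		return 0;
	}
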
@@@ -1648,14 -1616,8 +1648,14 @@@ static int tg_load_down(struct task_gro
  
  static void update_shares(struct sched_domain *sd)
  {
 -      u64 now = cpu_clock(raw_smp_processor_id());
 -      s64 elapsed = now - sd->last_update;
 +      s64 elapsed;
 +      u64 now;
 +
 +      if (root_task_group_empty())
 +              return;
 +
 +      now = cpu_clock(raw_smp_processor_id());
 +      elapsed = now - sd->last_update;
  
        if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
                sd->last_update = now;
  
  static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
  {
 +      if (root_task_group_empty())
 +              return;
 +
        spin_unlock(&rq->lock);
        update_shares(sd);
        spin_lock(&rq->lock);
  
  static void update_h_load(long cpu)
  {
 +      if (root_task_group_empty())
 +              return;
 +
        walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
  }
  
@@@ -2312,7 -2268,8 +2312,7 @@@ find_idlest_group(struct sched_domain *
                }
  
                /* Adjust by relative CPU power of the group */
 -              avg_load = sg_div_cpu_power(group,
 -                              avg_load * SCHED_LOAD_SCALE);
 +              avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
  
                if (local_group) {
                        this_load = avg_load;
@@@ -2680,32 -2637,9 +2680,32 @@@ void sched_fork(struct task_struct *p, 
        set_task_cpu(p, cpu);
  
        /*
 -       * Make sure we do not leak PI boosting priority to the child:
 +       * Make sure we do not leak PI boosting priority to the child.
         */
        p->prio = current->normal_prio;
 +
 +      /*
 +       * Revert to default priority/policy on fork if requested.
 +       */
 +      if (unlikely(p->sched_reset_on_fork)) {
 +              if (p->policy == SCHED_FIFO || p->policy == SCHED_RR)
 +                      p->policy = SCHED_NORMAL;
 +
 +              if (p->normal_prio < DEFAULT_PRIO)
 +                      p->prio = DEFAULT_PRIO;
 +
 +              if (PRIO_TO_NICE(p->static_prio) < 0) {
 +                      p->static_prio = NICE_TO_PRIO(0);
 +                      set_load_weight(p);
 +              }
 +
 +              /*
 +               * We don't need the reset flag anymore after the fork. It has
 +               * fulfilled its duty:
 +               */
 +              p->sched_reset_on_fork = 0;
 +      }
 +
        if (!rt_prio(p->prio))
                p->sched_class = &fair_sched_class;
  
@@@ -2862,6 -2796,12 +2862,6 @@@ static void finish_task_switch(struct r
  {
        struct mm_struct *mm = rq->prev_mm;
        long prev_state;
 -#ifdef CONFIG_SMP
 -      int post_schedule = 0;
 -
 -      if (current->sched_class->needs_post_schedule)
 -              post_schedule = current->sched_class->needs_post_schedule(rq);
 -#endif
  
        rq->prev_mm = NULL;
  
        finish_arch_switch(prev);
        perf_counter_task_sched_in(current, cpu_of(rq));
        finish_lock_switch(rq, prev);
 -#ifdef CONFIG_SMP
 -      if (post_schedule)
 -              current->sched_class->post_schedule(rq);
 -#endif
  
        fire_sched_in_preempt_notifiers(current);
        if (mm)
        }
  }
  
 +#ifdef CONFIG_SMP
 +
 +/* assumes rq->lock is held */
 +static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
 +{
 +      if (prev->sched_class->pre_schedule)
 +              prev->sched_class->pre_schedule(rq, prev);
 +}
 +
 +/* rq->lock is NOT held, but preemption is disabled */
 +static inline void post_schedule(struct rq *rq)
 +{
 +      if (rq->post_schedule) {
 +              unsigned long flags;
 +
 +              spin_lock_irqsave(&rq->lock, flags);
 +              if (rq->curr->sched_class->post_schedule)
 +                      rq->curr->sched_class->post_schedule(rq);
 +              spin_unlock_irqrestore(&rq->lock, flags);
 +
 +              rq->post_schedule = 0;
 +      }
 +}
 +
 +#else
 +
 +static inline void pre_schedule(struct rq *rq, struct task_struct *p)
 +{
 +}
 +
 +static inline void post_schedule(struct rq *rq)
 +{
 +}
 +
 +#endif
 +
  /**
   * schedule_tail - first thing a freshly forked thread must call.
   * @prev: the thread we just switched away from.
@@@ -2940,13 -2848,6 +2940,13 @@@ asmlinkage void schedule_tail(struct ta
        struct rq *rq = this_rq();
  
        finish_task_switch(rq, prev);
 +
 +      /*
 +       * FIXME: do we need to worry about rq being invalidated by the
 +       * task_switch?
 +       */
 +      post_schedule(rq);
 +
  #ifdef __ARCH_WANT_UNLOCKED_CTXSW
        /* In this case, finish_task_switch does not reenable preemption */
        preempt_enable();
@@@ -3478,10 -3379,9 +3478,10 @@@ static int move_one_task(struct rq *thi
  {
        const struct sched_class *class;
  
 -      for (class = sched_class_highest; class; class = class->next)
 +      for_each_class(class) {
                if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
                        return 1;
 +      }
  
        return 0;
  }
@@@ -3644,7 -3544,7 +3644,7 @@@ static inline void update_sd_power_savi
         * capacity but still has some space to pick up some load
         * from other group and save more power
         */
 -      if (sgs->sum_nr_running > sgs->group_capacity - 1)
 +      if (sgs->sum_nr_running + 1 > sgs->group_capacity)
                return;
  
        if (sgs->sum_nr_running > sds->leader_nr_running ||
@@@ -3711,77 -3611,6 +3711,77 @@@ static inline int check_power_save_busi
  }
  #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
  
 +unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
 +{
 +      unsigned long weight = cpumask_weight(sched_domain_span(sd));
 +      unsigned long smt_gain = sd->smt_gain;
 +
 +      smt_gain /= weight;
 +
 +      return smt_gain;
 +}
 +
 +unsigned long scale_rt_power(int cpu)
 +{
 +      struct rq *rq = cpu_rq(cpu);
 +      u64 total, available;
 +
 +      sched_avg_update(rq);
 +
 +      total = sched_avg_period() + (rq->clock - rq->age_stamp);
 +      available = total - rq->rt_avg;
 +
 +      if (unlikely((s64)total < SCHED_LOAD_SCALE))
 +              total = SCHED_LOAD_SCALE;
 +
 +      total >>= SCHED_LOAD_SHIFT;
 +
 +      return div_u64(available, total);
 +}
 +
 +static void update_cpu_power(struct sched_domain *sd, int cpu)
 +{
 +      unsigned long weight = cpumask_weight(sched_domain_span(sd));
 +      unsigned long power = SCHED_LOAD_SCALE;
 +      struct sched_group *sdg = sd->groups;
 +
 +      /* here we could scale based on cpufreq */
 +
 +      if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
 +              power *= arch_scale_smt_power(sd, cpu);
 +              power >>= SCHED_LOAD_SHIFT;
 +      }
 +
 +      power *= scale_rt_power(cpu);
 +      power >>= SCHED_LOAD_SHIFT;
 +
 +      if (!power)
 +              power = 1;
 +
 +      sdg->cpu_power = power;
 +}
 +
 +static void update_group_power(struct sched_domain *sd, int cpu)
 +{
 +      struct sched_domain *child = sd->child;
 +      struct sched_group *group, *sdg = sd->groups;
 +      unsigned long power;
 +
 +      if (!child) {
 +              update_cpu_power(sd, cpu);
 +              return;
 +      }
 +
 +      power = 0;
 +
 +      group = child->groups;
 +      do {
 +              power += group->cpu_power;
 +              group = group->next;
 +      } while (group != child->groups);
 +
 +      sdg->cpu_power = power;
 +}
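
update_cpu_power() above starts every cpu at SCHED_LOAD_SCALE and scales it
down twice: once for SMT siblings sharing a core (arch_scale_smt_power(),
smt_gain split across the siblings) and once for time consumed by RT tasks
(scale_rt_power()), each step followed by a right shift of SCHED_LOAD_SHIFT.
The arithmetic below is an illustration only; SCHED_LOAD_SCALE taken as 1024,
an smt_gain of 1178 over two siblings, and a cpu spending a quarter of the
averaging window on RT work are assumed inputs, not values read out of this
diff.

	#include <stdio.h>

	#define SCALE 1024UL	/* assumed SCHED_LOAD_SCALE */
	#define SHIFT 10	/* assumed SCHED_LOAD_SHIFT */

	int main(void)
	{
		unsigned long power = SCALE;
		unsigned long smt_power = 1178 / 2;	/* smt_gain shared by two siblings    */
		unsigned long rt_scale = 3 * SCALE / 4;	/* scale_rt_power(): 75% left for CFS */

		power = (power * smt_power) >> SHIFT;	/* 1024 -> 589 */
		power = (power * rt_scale) >> SHIFT;	/* 589  -> 441 */

		printf("cpu_power = %lu (out of %lu)\n", power, SCALE);
		return 0;
	}
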
  
  /**
   * update_sg_lb_stats - Update sched_group's statistics for load balancing.
   * @balance: Should we balance.
   * @sgs: variable to hold the statistics for this group.
   */
 -static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
 +static inline void update_sg_lb_stats(struct sched_domain *sd,
 +                      struct sched_group *group, int this_cpu,
                        enum cpu_idle_type idle, int load_idx, int *sd_idle,
                        int local_group, const struct cpumask *cpus,
                        int *balance, struct sg_lb_stats *sgs)
        unsigned long sum_avg_load_per_task;
        unsigned long avg_load_per_task;
  
 -      if (local_group)
 +      if (local_group) {
                balance_cpu = group_first_cpu(group);
 +              if (balance_cpu == this_cpu)
 +                      update_group_power(sd, this_cpu);
 +      }
  
        /* Tally up the load of all CPUs in the group */
        sum_avg_load_per_task = avg_load_per_task = 0;
        }
  
        /* Adjust by relative CPU power of the group */
 -      sgs->avg_load = sg_div_cpu_power(group,
 -                      sgs->group_load * SCHED_LOAD_SCALE);
 +      sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
  
  
        /*
         *      normalized nr_running number somewhere that negates
         *      the hierarchy?
         */
 -      avg_load_per_task = sg_div_cpu_power(group,
 -                      sum_avg_load_per_task * SCHED_LOAD_SCALE);
 +      avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
 +              group->cpu_power;
  
        if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
                sgs->group_imb = 1;
  
 -      sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
 -
 +      sgs->group_capacity =
 +              DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
  }
  
  /**
@@@ -3897,13 -3723,9 +3897,13 @@@ static inline void update_sd_lb_stats(s
                        const struct cpumask *cpus, int *balance,
                        struct sd_lb_stats *sds)
  {
 +      struct sched_domain *child = sd->child;
        struct sched_group *group = sd->groups;
        struct sg_lb_stats sgs;
 -      int load_idx;
 +      int load_idx, prefer_sibling = 0;
 +
 +      if (child && child->flags & SD_PREFER_SIBLING)
 +              prefer_sibling = 1;
  
        init_sd_power_savings_stats(sd, sds, idle);
        load_idx = get_sd_load_idx(sd, idle);
                local_group = cpumask_test_cpu(this_cpu,
                                               sched_group_cpus(group));
                memset(&sgs, 0, sizeof(sgs));
 -              update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle,
 +              update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
                                local_group, cpus, balance, &sgs);
  
                if (local_group && balance && !(*balance))
                        return;
  
                sds->total_load += sgs.group_load;
 -              sds->total_pwr += group->__cpu_power;
 +              sds->total_pwr += group->cpu_power;
 +
 +              /*
 +               * In case the child domain prefers tasks go to siblings
 +               * first, lower the group capacity to one so that we'll try
 +               * and move all the excess tasks away.
 +               */
 +              if (prefer_sibling)
 +                      sgs.group_capacity = min(sgs.group_capacity, 1UL);
  
                if (local_group) {
                        sds->this_load = sgs.avg_load;
                update_sd_power_savings_stats(group, sds, local_group, &sgs);
                group = group->next;
        } while (group != sd->groups);
 -
  }
  
  /**
@@@ -3986,28 -3801,28 +3986,28 @@@ static inline void fix_small_imbalance(
         * moving them.
         */
  
 -      pwr_now += sds->busiest->__cpu_power *
 +      pwr_now += sds->busiest->cpu_power *
                        min(sds->busiest_load_per_task, sds->max_load);
 -      pwr_now += sds->this->__cpu_power *
 +      pwr_now += sds->this->cpu_power *
                        min(sds->this_load_per_task, sds->this_load);
        pwr_now /= SCHED_LOAD_SCALE;
  
        /* Amount of load we'd subtract */
 -      tmp = sg_div_cpu_power(sds->busiest,
 -                      sds->busiest_load_per_task * SCHED_LOAD_SCALE);
 +      tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
 +              sds->busiest->cpu_power;
        if (sds->max_load > tmp)
 -              pwr_move += sds->busiest->__cpu_power *
 +              pwr_move += sds->busiest->cpu_power *
                        min(sds->busiest_load_per_task, sds->max_load - tmp);
  
        /* Amount of load we'd add */
 -      if (sds->max_load * sds->busiest->__cpu_power <
 +      if (sds->max_load * sds->busiest->cpu_power <
                sds->busiest_load_per_task * SCHED_LOAD_SCALE)
 -              tmp = sg_div_cpu_power(sds->this,
 -                      sds->max_load * sds->busiest->__cpu_power);
 +              tmp = (sds->max_load * sds->busiest->cpu_power) /
 +                      sds->this->cpu_power;
        else
 -              tmp = sg_div_cpu_power(sds->this,
 -                      sds->busiest_load_per_task * SCHED_LOAD_SCALE);
 -      pwr_move += sds->this->__cpu_power *
 +              tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
 +                      sds->this->cpu_power;
 +      pwr_move += sds->this->cpu_power *
                        min(sds->this_load_per_task, sds->this_load + tmp);
        pwr_move /= SCHED_LOAD_SCALE;
  
@@@ -4042,8 -3857,8 +4042,8 @@@ static inline void calculate_imbalance(
                        sds->max_load - sds->busiest_load_per_task);
  
        /* How much load to actually move to equalise the imbalance */
 -      *imbalance = min(max_pull * sds->busiest->__cpu_power,
 -              (sds->avg_load - sds->this_load) * sds->this->__cpu_power)
 +      *imbalance = min(max_pull * sds->busiest->cpu_power,
 +              (sds->avg_load - sds->this_load) * sds->this->cpu_power)
                        / SCHED_LOAD_SCALE;
  
        /*
        return NULL;
  }
  
 +static struct sched_group *group_of(int cpu)
 +{
 +      struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
 +
 +      if (!sd)
 +              return NULL;
 +
 +      return sd->groups;
 +}
 +
 +static unsigned long power_of(int cpu)
 +{
 +      struct sched_group *group = group_of(cpu);
 +
 +      if (!group)
 +              return SCHED_LOAD_SCALE;
 +
 +      return group->cpu_power;
 +}
 +
  /*
   * find_busiest_queue - find the busiest runqueue among the cpus in group.
   */
@@@ -4193,18 -3988,15 +4193,18 @@@ find_busiest_queue(struct sched_group *
        int i;
  
        for_each_cpu(i, sched_group_cpus(group)) {
 +              unsigned long power = power_of(i);
 +              unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
                unsigned long wl;
  
                if (!cpumask_test_cpu(i, cpus))
                        continue;
  
                rq = cpu_rq(i);
 -              wl = weighted_cpuload(i);
 +              wl = weighted_cpuload(i) * SCHED_LOAD_SCALE;
 +              wl /= power;
  
 -              if (rq->nr_running == 1 && wl > imbalance)
 +              if (capacity && rq->nr_running == 1 && wl > imbalance)
                        continue;
  
                if (wl > max_load) {
@@@ -5533,7 -5325,7 +5533,7 @@@ need_resched
        preempt_disable();
        cpu = smp_processor_id();
        rq = cpu_rq(cpu);
 -      rcu_qsctr_inc(cpu);
 +      rcu_sched_qs(cpu);
        prev = rq->curr;
        switch_count = &prev->nivcsw;
  
@@@ -5557,7 -5349,10 +5557,7 @@@ need_resched_nonpreemptible
                switch_count = &prev->nvcsw;
        }
  
 -#ifdef CONFIG_SMP
 -      if (prev->sched_class->pre_schedule)
 -              prev->sched_class->pre_schedule(rq, prev);
 -#endif
 +      pre_schedule(rq, prev);
  
        if (unlikely(!rq->nr_running))
                idle_balance(cpu, rq);
        } else
                spin_unlock_irq(&rq->lock);
  
 +      post_schedule(rq);
 +
        if (unlikely(reacquire_kernel_lock(current) < 0))
                goto need_resched_nonpreemptible;
  
@@@ -6330,25 -6123,17 +6330,25 @@@ static int __sched_setscheduler(struct 
        unsigned long flags;
        const struct sched_class *prev_class = p->sched_class;
        struct rq *rq;
 +      int reset_on_fork;
  
        /* may grab non-irq protected spin_locks */
        BUG_ON(in_interrupt());
  recheck:
        /* double check policy once rq lock held */
 -      if (policy < 0)
 +      if (policy < 0) {
 +              reset_on_fork = p->sched_reset_on_fork;
                policy = oldpolicy = p->policy;
 -      else if (policy != SCHED_FIFO && policy != SCHED_RR &&
 -                      policy != SCHED_NORMAL && policy != SCHED_BATCH &&
 -                      policy != SCHED_IDLE)
 -              return -EINVAL;
 +      } else {
 +              reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
 +              policy &= ~SCHED_RESET_ON_FORK;
 +
 +              if (policy != SCHED_FIFO && policy != SCHED_RR &&
 +                              policy != SCHED_NORMAL && policy != SCHED_BATCH &&
 +                              policy != SCHED_IDLE)
 +                      return -EINVAL;
 +      }
 +
        /*
         * Valid priorities for SCHED_FIFO and SCHED_RR are
         * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
                /* can't change other user's priorities */
                if (!check_same_owner(p))
                        return -EPERM;
 +
 +              /* Normal users shall not reset the sched_reset_on_fork flag */
 +              if (p->sched_reset_on_fork && !reset_on_fork)
 +                      return -EPERM;
        }
  
        if (user) {
        if (running)
                p->sched_class->put_prev_task(rq, p);
  
 +      p->sched_reset_on_fork = reset_on_fork;
 +
        oldprio = p->prio;
        __setscheduler(rq, p, policy, param->sched_priority);
  
@@@ -6557,15 -6336,14 +6557,15 @@@ SYSCALL_DEFINE1(sched_getscheduler, pid
        if (p) {
                retval = security_task_getscheduler(p);
                if (!retval)
 -                      retval = p->policy;
 +                      retval = p->policy
 +                              | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
        }
        read_unlock(&tasklist_lock);
        return retval;
  }
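
The sched_reset_on_fork handling above is driven from userspace by ORing
SCHED_RESET_ON_FORK into the policy passed to sched_setscheduler();
sched_getscheduler() then reports the flag ORed back into the returned
policy, and children of the task start over as SCHED_NORMAL at default
priority. A small usage sketch; the fallback #define mirrors the kernel's
value and should be checked against the installed headers rather than taken
from here.

	#include <sched.h>
	#include <stdio.h>

	#ifndef SCHED_RESET_ON_FORK
	#define SCHED_RESET_ON_FORK 0x40000000	/* assumed to match the kernel's sched.h */
	#endif

	int main(void)
	{
		struct sched_param sp = { .sched_priority = 10 };

		/* run FIFO ourselves, but let forked children revert to SCHED_NORMAL */
		if (sched_setscheduler(0, SCHED_FIFO | SCHED_RESET_ON_FORK, &sp))
			perror("sched_setscheduler");

		/* the flag comes back ORed into the policy word */
		printf("policy word: %#x\n", sched_getscheduler(0));
		return 0;
	}
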
  
  /**
 - * sys_sched_getscheduler - get the RT priority of a thread
 + * sys_sched_getparam - get the RT priority of a thread
   * @pid: the pid in question.
   * @param: structure containing the RT priority.
   */
@@@ -6793,9 -6571,19 +6793,9 @@@ static inline int should_resched(void
  
  static void __cond_resched(void)
  {
 -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
 -      __might_sleep(__FILE__, __LINE__);
 -#endif
 -      /*
 -       * The BKS might be reacquired before we have dropped
 -       * PREEMPT_ACTIVE, which could trigger a second
 -       * cond_resched() call.
 -       */
 -      do {
 -              add_preempt_count(PREEMPT_ACTIVE);
 -              schedule();
 -              sub_preempt_count(PREEMPT_ACTIVE);
 -      } while (need_resched());
 +      add_preempt_count(PREEMPT_ACTIVE);
 +      schedule();
 +      sub_preempt_count(PREEMPT_ACTIVE);
  }
  
  int __sched _cond_resched(void)
  EXPORT_SYMBOL(_cond_resched);
  
  /*
 - * cond_resched_lock() - if a reschedule is pending, drop the given lock,
 + * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
   * call schedule, and on return reacquire the lock.
   *
   * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
   * operations here to prevent schedule() from being called twice (once via
   * spin_unlock(), once by hand).
   */
 -int cond_resched_lock(spinlock_t *lock)
 +int __cond_resched_lock(spinlock_t *lock)
  {
        int resched = should_resched();
        int ret = 0;
  
 +      lockdep_assert_held(lock);
 +
        if (spin_needbreak(lock) || resched) {
                spin_unlock(lock);
                if (resched)
        }
        return ret;
  }
 -EXPORT_SYMBOL(cond_resched_lock);
 +EXPORT_SYMBOL(__cond_resched_lock);
  
 -int __sched cond_resched_softirq(void)
 +int __sched __cond_resched_softirq(void)
  {
        BUG_ON(!in_softirq());
  
        }
        return 0;
  }
 -EXPORT_SYMBOL(cond_resched_softirq);
 +EXPORT_SYMBOL(__cond_resched_softirq);
  
  /**
   * yield - yield the current processor to other threads.
@@@ -6872,13 -6658,11 +6872,13 @@@ EXPORT_SYMBOL(yield)
   */
  void __sched io_schedule(void)
  {
 -      struct rq *rq = &__raw_get_cpu_var(runqueues);
 +      struct rq *rq = raw_rq();
  
        delayacct_blkio_start();
        atomic_inc(&rq->nr_iowait);
 +      current->in_iowait = 1;
        schedule();
 +      current->in_iowait = 0;
        atomic_dec(&rq->nr_iowait);
        delayacct_blkio_end();
  }
@@@ -6886,14 -6670,12 +6886,14 @@@ EXPORT_SYMBOL(io_schedule)
  
  long __sched io_schedule_timeout(long timeout)
  {
 -      struct rq *rq = &__raw_get_cpu_var(runqueues);
 +      struct rq *rq = raw_rq();
        long ret;
  
        delayacct_blkio_start();
        atomic_inc(&rq->nr_iowait);
 +      current->in_iowait = 1;
        ret = schedule_timeout(timeout);
 +      current->in_iowait = 0;
        atomic_dec(&rq->nr_iowait);
        delayacct_blkio_end();
        return ret;
@@@ -7210,12 -6992,8 +7210,12 @@@ int set_cpus_allowed_ptr(struct task_st
  
        if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
                /* Need help from migration thread: drop lock and wait. */
 +              struct task_struct *mt = rq->migration_thread;
 +
 +              get_task_struct(mt);
                task_rq_unlock(rq, &flags);
                wake_up_process(rq->migration_thread);
 +              put_task_struct(mt);
                wait_for_completion(&req.done);
                tlb_migrate_finish(p->mm);
                return 0;
@@@ -7273,11 -7051,6 +7273,11 @@@ fail
        return ret;
  }
  
 +#define RCU_MIGRATION_IDLE    0
 +#define RCU_MIGRATION_NEED_QS 1
 +#define RCU_MIGRATION_GOT_QS  2
 +#define RCU_MIGRATION_MUST_SYNC       3
 +
  /*
   * migration_thread - this is a highprio system thread that performs
   * thread migration by bumping thread off CPU then 'pushing' onto
   */
  static int migration_thread(void *data)
  {
 +      int badcpu;
        int cpu = (long)data;
        struct rq *rq;
  
                req = list_entry(head->next, struct migration_req, list);
                list_del_init(head->next);
  
 -              spin_unlock(&rq->lock);
 -              __migrate_task(req->task, cpu, req->dest_cpu);
 +              if (req->task != NULL) {
 +                      spin_unlock(&rq->lock);
 +                      __migrate_task(req->task, cpu, req->dest_cpu);
 +              } else if (likely(cpu == (badcpu = smp_processor_id()))) {
 +                      req->dest_cpu = RCU_MIGRATION_GOT_QS;
 +                      spin_unlock(&rq->lock);
 +              } else {
 +                      req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
 +                      spin_unlock(&rq->lock);
 +                      WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
 +              }
                local_irq_enable();
  
                complete(&req->done);
@@@ -7862,7 -7625,7 +7862,7 @@@ static int __init migration_init(void
        migration_call(&migration_notifier, CPU_ONLINE, cpu);
        register_cpu_notifier(&migration_notifier);
  
 -      return err;
 +      return 0;
  }
  early_initcall(migration_init);
  #endif
@@@ -7909,7 -7672,7 +7909,7 @@@ static int sched_domain_debug_one(struc
                        break;
                }
  
 -              if (!group->__cpu_power) {
 +              if (!group->cpu_power) {
                        printk(KERN_CONT "\n");
                        printk(KERN_ERR "ERROR: domain->cpu_power not "
                                        "set\n");
                cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
  
                printk(KERN_CONT " %s", str);
 -              if (group->__cpu_power != SCHED_LOAD_SCALE) {
 -                      printk(KERN_CONT " (__cpu_power = %d)",
 -                              group->__cpu_power);
 +              if (group->cpu_power != SCHED_LOAD_SCALE) {
 +                      printk(KERN_CONT " (cpu_power = %d)",
 +                              group->cpu_power);
                }
  
                group = group->next;
@@@ -8078,7 -7841,7 +8078,7 @@@ static void rq_attach_root(struct rq *r
        rq->rd = rd;
  
        cpumask_set_cpu(rq->cpu, rd->span);
 -      if (cpumask_test_cpu(rq->cpu, cpu_online_mask))
 +      if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
                set_rq_online(rq);
  
        spin_unlock_irqrestore(&rq->lock, flags);
@@@ -8220,7 -7983,7 +8220,7 @@@ init_sched_build_groups(const struct cp
                        continue;
  
                cpumask_clear(sched_group_cpus(sg));
 -              sg->__cpu_power = 0;
 +              sg->cpu_power = 0;
  
                for_each_cpu(j, span) {
                        if (group_fn(j, cpu_map, NULL, tmpmask) != group)
@@@ -8328,39 -8091,6 +8328,39 @@@ struct static_sched_domain 
        DECLARE_BITMAP(span, CONFIG_NR_CPUS);
  };
  
 +struct s_data {
 +#ifdef CONFIG_NUMA
 +      int                     sd_allnodes;
 +      cpumask_var_t           domainspan;
 +      cpumask_var_t           covered;
 +      cpumask_var_t           notcovered;
 +#endif
 +      cpumask_var_t           nodemask;
 +      cpumask_var_t           this_sibling_map;
 +      cpumask_var_t           this_core_map;
 +      cpumask_var_t           send_covered;
 +      cpumask_var_t           tmpmask;
 +      struct sched_group      **sched_group_nodes;
 +      struct root_domain      *rd;
 +};
 +
 +enum s_alloc {
 +      sa_sched_groups = 0,
 +      sa_rootdomain,
 +      sa_tmpmask,
 +      sa_send_covered,
 +      sa_this_core_map,
 +      sa_this_sibling_map,
 +      sa_nodemask,
 +      sa_sched_group_nodes,
 +#ifdef CONFIG_NUMA
 +      sa_notcovered,
 +      sa_covered,
 +      sa_domainspan,
 +#endif
 +      sa_none,
 +};
 +
  /*
   * SMT sched-domains:
   */
@@@ -8478,76 -8208,11 +8478,76 @@@ static void init_numa_sched_groups_powe
                                continue;
                        }
  
 -                      sg_inc_cpu_power(sg, sd->groups->__cpu_power);
 +                      sg->cpu_power += sd->groups->cpu_power;
                }
                sg = sg->next;
        } while (sg != group_head);
  }
 +
 +static int build_numa_sched_groups(struct s_data *d,
 +                                 const struct cpumask *cpu_map, int num)
 +{
 +      struct sched_domain *sd;
 +      struct sched_group *sg, *prev;
 +      int n, j;
 +
 +      cpumask_clear(d->covered);
 +      cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
 +      if (cpumask_empty(d->nodemask)) {
 +              d->sched_group_nodes[num] = NULL;
 +              goto out;
 +      }
 +
 +      sched_domain_node_span(num, d->domainspan);
 +      cpumask_and(d->domainspan, d->domainspan, cpu_map);
 +
 +      sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
 +                        GFP_KERNEL, num);
 +      if (!sg) {
 +              printk(KERN_WARNING "Can not alloc domain group for node %d\n",
 +                     num);
 +              return -ENOMEM;
 +      }
 +      d->sched_group_nodes[num] = sg;
 +
 +      for_each_cpu(j, d->nodemask) {
 +              sd = &per_cpu(node_domains, j).sd;
 +              sd->groups = sg;
 +      }
 +
 +      sg->cpu_power = 0;
 +      cpumask_copy(sched_group_cpus(sg), d->nodemask);
 +      sg->next = sg;
 +      cpumask_or(d->covered, d->covered, d->nodemask);
 +
 +      prev = sg;
 +      for (j = 0; j < nr_node_ids; j++) {
 +              n = (num + j) % nr_node_ids;
 +              cpumask_complement(d->notcovered, d->covered);
 +              cpumask_and(d->tmpmask, d->notcovered, cpu_map);
 +              cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
 +              if (cpumask_empty(d->tmpmask))
 +                      break;
 +              cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
 +              if (cpumask_empty(d->tmpmask))
 +                      continue;
 +              sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
 +                                GFP_KERNEL, num);
 +              if (!sg) {
 +                      printk(KERN_WARNING
 +                             "Can not alloc domain group for node %d\n", j);
 +                      return -ENOMEM;
 +              }
 +              sg->cpu_power = 0;
 +              cpumask_copy(sched_group_cpus(sg), d->tmpmask);
 +              sg->next = prev->next;
 +              cpumask_or(d->covered, d->covered, d->tmpmask);
 +              prev->next = sg;
 +              prev = sg;
 +      }
 +out:
 +      return 0;
 +}
  #endif /* CONFIG_NUMA */
  
  #ifdef CONFIG_NUMA
@@@ -8601,13 -8266,15 +8601,13 @@@ static void free_sched_groups(const str
   * there are asymmetries in the topology. If there are asymmetries, group
  * having more cpu_power will pick up more load compared to the group having
   * less cpu_power.
 - *
 - * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
 - * the maximum number of tasks a group can handle in the presence of other idle
 - * or lightly loaded groups in the same sched domain.
   */
  static void init_sched_groups_power(int cpu, struct sched_domain *sd)
  {
        struct sched_domain *child;
        struct sched_group *group;
 +      long power;
 +      int weight;
  
        WARN_ON(!sd || !sd->groups);
  
  
        child = sd->child;
  
 -      sd->groups->__cpu_power = 0;
 +      sd->groups->cpu_power = 0;
  
 -      /*
 -       * For perf policy, if the groups in child domain share resources
 -       * (for example cores sharing some portions of the cache hierarchy
 -       * or SMT), then set this domain groups cpu_power such that each group
 -       * can handle only one task, when there are other idle groups in the
 -       * same sched domain.
 -       */
 -      if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
 -                     (child->flags &
 -                      (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
 -              sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
 +      if (!child) {
 +              power = SCHED_LOAD_SCALE;
 +              weight = cpumask_weight(sched_domain_span(sd));
 +              /*
 +               * SMT siblings share the power of a single core.
 +               * Usually multiple threads get a better yield out of
 +               * that one core than a single thread would have,
 +               * reflect that in sd->smt_gain.
 +               */
 +              if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
 +                      power *= sd->smt_gain;
 +                      power /= weight;
 +                      power >>= SCHED_LOAD_SHIFT;
 +              }
 +              sd->groups->cpu_power += power;
                return;
        }
  
        /*
 -       * add cpu_power of each child group to this groups cpu_power
 +       * Add cpu_power of each child group to this group's cpu_power.
         */
        group = child->groups;
        do {
 -              sg_inc_cpu_power(sd->groups, group->__cpu_power);
 +              sd->groups->cpu_power += group->cpu_power;
                group = group->next;
        } while (group != child->groups);
  }
@@@ -8715,285 -8378,280 +8715,285 @@@ static void set_domain_attribute(struc
        }
  }
  
 -/*
 - * Build sched domains for a given set of cpus and attach the sched domains
 - * to the individual cpus
 - */
 -static int __build_sched_domains(const struct cpumask *cpu_map,
 -                               struct sched_domain_attr *attr)
 -{
 -      int i, err = -ENOMEM;
 -      struct root_domain *rd;
 -      cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered,
 -              tmpmask;
 +static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
 +                               const struct cpumask *cpu_map)
 +{
 +      switch (what) {
 +      case sa_sched_groups:
 +              free_sched_groups(cpu_map, d->tmpmask); /* fall through */
 +              d->sched_group_nodes = NULL;
 +      case sa_rootdomain:
 +              free_rootdomain(d->rd); /* fall through */
 +      case sa_tmpmask:
 +              free_cpumask_var(d->tmpmask); /* fall through */
 +      case sa_send_covered:
 +              free_cpumask_var(d->send_covered); /* fall through */
 +      case sa_this_core_map:
 +              free_cpumask_var(d->this_core_map); /* fall through */
 +      case sa_this_sibling_map:
 +              free_cpumask_var(d->this_sibling_map); /* fall through */
 +      case sa_nodemask:
 +              free_cpumask_var(d->nodemask); /* fall through */
 +      case sa_sched_group_nodes:
  #ifdef CONFIG_NUMA
 -      cpumask_var_t domainspan, covered, notcovered;
 -      struct sched_group **sched_group_nodes = NULL;
 -      int sd_allnodes = 0;
 -
 -      if (!alloc_cpumask_var(&domainspan, GFP_KERNEL))
 -              goto out;
 -      if (!alloc_cpumask_var(&covered, GFP_KERNEL))
 -              goto free_domainspan;
 -      if (!alloc_cpumask_var(&notcovered, GFP_KERNEL))
 -              goto free_covered;
 -#endif
 -
 -      if (!alloc_cpumask_var(&nodemask, GFP_KERNEL))
 -              goto free_notcovered;
 -      if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL))
 -              goto free_nodemask;
 -      if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL))
 -              goto free_this_sibling_map;
 -      if (!alloc_cpumask_var(&send_covered, GFP_KERNEL))
 -              goto free_this_core_map;
 -      if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
 -              goto free_send_covered;
 +              kfree(d->sched_group_nodes); /* fall through */
 +      case sa_notcovered:
 +              free_cpumask_var(d->notcovered); /* fall through */
 +      case sa_covered:
 +              free_cpumask_var(d->covered); /* fall through */
 +      case sa_domainspan:
 +              free_cpumask_var(d->domainspan); /* fall through */
 +#endif
 +      case sa_none:
 +              break;
 +      }
 +}
  
 +static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
 +                                                 const struct cpumask *cpu_map)
 +{
  #ifdef CONFIG_NUMA
 -      /*
 -       * Allocate the per-node list of sched groups
 -       */
 -      sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *),
 -                                  GFP_KERNEL);
 -      if (!sched_group_nodes) {
 +      if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL))
 +              return sa_none;
 +      if (!alloc_cpumask_var(&d->covered, GFP_KERNEL))
 +              return sa_domainspan;
 +      if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL))
 +              return sa_covered;
 +      /* Allocate the per-node list of sched groups */
 +      d->sched_group_nodes = kcalloc(nr_node_ids,
 +                                    sizeof(struct sched_group *), GFP_KERNEL);
 +      if (!d->sched_group_nodes) {
                printk(KERN_WARNING "Can not alloc sched group node list\n");
 -              goto free_tmpmask;
 -      }
 -#endif
 -
 -      rd = alloc_rootdomain();
 -      if (!rd) {
 +              return sa_notcovered;
 +      }
 +      sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
 +#endif
 +      if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
 +              return sa_sched_group_nodes;
 +      if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
 +              return sa_nodemask;
 +      if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
 +              return sa_this_sibling_map;
 +      if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
 +              return sa_this_core_map;
 +      if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
 +              return sa_send_covered;
 +      d->rd = alloc_rootdomain();
 +      if (!d->rd) {
                printk(KERN_WARNING "Cannot alloc root domain\n");
 -              goto free_sched_groups;
 +              return sa_tmpmask;
        }
 +      return sa_rootdomain;
 +}
  
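The pair above is a general staged-allocation idiom: __visit_domain_allocation_hell() returns an enum value recording how far the allocations got, and __free_domain_allocs() switches on that value and deliberately falls through, so exactly the allocations made so far are released. A self-contained sketch of the same pattern with hypothetical resource names (plain malloc() standing in for the cpumask and root-domain allocations):

#include <stdlib.h>

enum stage { got_none, got_a, got_b, got_all };

struct res { char *a, *b, *c; };

static void undo(struct res *r, enum stage how_far)
{
    switch (how_far) {
    case got_all:
        free(r->c);        /* fall through */
    case got_b:
        free(r->b);        /* fall through */
    case got_a:
        free(r->a);        /* fall through */
    case got_none:
        break;
    }
}

static enum stage acquire(struct res *r)
{
    if (!(r->a = malloc(16)))
        return got_none;
    if (!(r->b = malloc(16)))
        return got_a;
    if (!(r->c = malloc(16)))
        return got_b;
    return got_all;
}

int main(void)
{
    struct res r = { NULL, NULL, NULL };
    enum stage s = acquire(&r);

    if (s != got_all) {
        undo(&r, s);       /* frees exactly what was obtained */
        return 1;
    }
    /* ... use r ... */
    undo(&r, got_all);
    return 0;
}
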
 +static struct sched_domain *__build_numa_sched_domains(struct s_data *d,
 +      const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i)
 +{
 +      struct sched_domain *sd = NULL;
  #ifdef CONFIG_NUMA
 -      sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes;
 -#endif
 -
 -      /*
 -       * Set up domains for cpus specified by the cpu_map.
 -       */
 -      for_each_cpu(i, cpu_map) {
 -              struct sched_domain *sd = NULL, *p;
 -
 -              cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map);
 -
 -#ifdef CONFIG_NUMA
 -              if (cpumask_weight(cpu_map) >
 -                              SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {
 -                      sd = &per_cpu(allnodes_domains, i).sd;
 -                      SD_INIT(sd, ALLNODES);
 -                      set_domain_attribute(sd, attr);
 -                      cpumask_copy(sched_domain_span(sd), cpu_map);
 -                      cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
 -                      p = sd;
 -                      sd_allnodes = 1;
 -              } else
 -                      p = NULL;
 +      struct sched_domain *parent;
  
 -              sd = &per_cpu(node_domains, i).sd;
 -              SD_INIT(sd, NODE);
 +      d->sd_allnodes = 0;
 +      if (cpumask_weight(cpu_map) >
 +          SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
 +              sd = &per_cpu(allnodes_domains, i).sd;
 +              SD_INIT(sd, ALLNODES);
                set_domain_attribute(sd, attr);
 -              sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
 -              sd->parent = p;
 -              if (p)
 -                      p->child = sd;
 -              cpumask_and(sched_domain_span(sd),
 -                          sched_domain_span(sd), cpu_map);
 +              cpumask_copy(sched_domain_span(sd), cpu_map);
 +              cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
 +              d->sd_allnodes = 1;
 +      }
 +      parent = sd;
 +
 +      sd = &per_cpu(node_domains, i).sd;
 +      SD_INIT(sd, NODE);
 +      set_domain_attribute(sd, attr);
 +      sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
 +      sd->parent = parent;
 +      if (parent)
 +              parent->child = sd;
 +      cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
  #endif
 +      return sd;
 +}
  
 -              p = sd;
 -              sd = &per_cpu(phys_domains, i).sd;
 -              SD_INIT(sd, CPU);
 -              set_domain_attribute(sd, attr);
 -              cpumask_copy(sched_domain_span(sd), nodemask);
 -              sd->parent = p;
 -              if (p)
 -                      p->child = sd;
 -              cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask);
 +static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
 +      const struct cpumask *cpu_map, struct sched_domain_attr *attr,
 +      struct sched_domain *parent, int i)
 +{
 +      struct sched_domain *sd;
 +      sd = &per_cpu(phys_domains, i).sd;
 +      SD_INIT(sd, CPU);
 +      set_domain_attribute(sd, attr);
 +      cpumask_copy(sched_domain_span(sd), d->nodemask);
 +      sd->parent = parent;
 +      if (parent)
 +              parent->child = sd;
 +      cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
 +      return sd;
 +}
  
 +static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
 +      const struct cpumask *cpu_map, struct sched_domain_attr *attr,
 +      struct sched_domain *parent, int i)
 +{
 +      struct sched_domain *sd = parent;
  #ifdef CONFIG_SCHED_MC
 -              p = sd;
 -              sd = &per_cpu(core_domains, i).sd;
 -              SD_INIT(sd, MC);
 -              set_domain_attribute(sd, attr);
 -              cpumask_and(sched_domain_span(sd), cpu_map,
 -                                                 cpu_coregroup_mask(i));
 -              sd->parent = p;
 -              p->child = sd;
 -              cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
 +      sd = &per_cpu(core_domains, i).sd;
 +      SD_INIT(sd, MC);
 +      set_domain_attribute(sd, attr);
 +      cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
 +      sd->parent = parent;
 +      parent->child = sd;
 +      cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
  #endif
 +      return sd;
 +}
  
 +static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
 +      const struct cpumask *cpu_map, struct sched_domain_attr *attr,
 +      struct sched_domain *parent, int i)
 +{
 +      struct sched_domain *sd = parent;
  #ifdef CONFIG_SCHED_SMT
 -              p = sd;
 -              sd = &per_cpu(cpu_domains, i).sd;
 -              SD_INIT(sd, SIBLING);
 -              set_domain_attribute(sd, attr);
 -              cpumask_and(sched_domain_span(sd),
 -                          topology_thread_cpumask(i), cpu_map);
 -              sd->parent = p;
 -              p->child = sd;
 -              cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
 +      sd = &per_cpu(cpu_domains, i).sd;
 +      SD_INIT(sd, SIBLING);
 +      set_domain_attribute(sd, attr);
 +      cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
 +      sd->parent = parent;
 +      parent->child = sd;
 +      cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
  #endif
 -      }
 +      return sd;
 +}
  
 +static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
 +                             const struct cpumask *cpu_map, int cpu)
 +{
 +      switch (l) {
  #ifdef CONFIG_SCHED_SMT
 -      /* Set up CPU (sibling) groups */
 -      for_each_cpu(i, cpu_map) {
 -              cpumask_and(this_sibling_map,
 -                          topology_thread_cpumask(i), cpu_map);
 -              if (i != cpumask_first(this_sibling_map))
 -                      continue;
 -
 -              init_sched_build_groups(this_sibling_map, cpu_map,
 -                                      &cpu_to_cpu_group,
 -                                      send_covered, tmpmask);
 -      }
 +      case SD_LV_SIBLING: /* set up CPU (sibling) groups */
 +              cpumask_and(d->this_sibling_map, cpu_map,
 +                          topology_thread_cpumask(cpu));
 +              if (cpu == cpumask_first(d->this_sibling_map))
 +                      init_sched_build_groups(d->this_sibling_map, cpu_map,
 +                                              &cpu_to_cpu_group,
 +                                              d->send_covered, d->tmpmask);
 +              break;
  #endif
 -
  #ifdef CONFIG_SCHED_MC
 -      /* Set up multi-core groups */
 -      for_each_cpu(i, cpu_map) {
 -              cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map);
 -              if (i != cpumask_first(this_core_map))
 -                      continue;
 -
 -              init_sched_build_groups(this_core_map, cpu_map,
 -                                      &cpu_to_core_group,
 -                                      send_covered, tmpmask);
 -      }
 +      case SD_LV_MC: /* set up multi-core groups */
 +              cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
 +              if (cpu == cpumask_first(d->this_core_map))
 +                      init_sched_build_groups(d->this_core_map, cpu_map,
 +                                              &cpu_to_core_group,
 +                                              d->send_covered, d->tmpmask);
 +              break;
  #endif
 -
 -      /* Set up physical groups */
 -      for (i = 0; i < nr_node_ids; i++) {
 -              cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
 -              if (cpumask_empty(nodemask))
 -                      continue;
 -
 -              init_sched_build_groups(nodemask, cpu_map,
 -                                      &cpu_to_phys_group,
 -                                      send_covered, tmpmask);
 -      }
 -
 +      case SD_LV_CPU: /* set up physical groups */
 +              cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
 +              if (!cpumask_empty(d->nodemask))
 +                      init_sched_build_groups(d->nodemask, cpu_map,
 +                                              &cpu_to_phys_group,
 +                                              d->send_covered, d->tmpmask);
 +              break;
  #ifdef CONFIG_NUMA
 -      /* Set up node groups */
 -      if (sd_allnodes) {
 -              init_sched_build_groups(cpu_map, cpu_map,
 -                                      &cpu_to_allnodes_group,
 -                                      send_covered, tmpmask);
 +      case SD_LV_ALLNODES:
 +              init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group,
 +                                      d->send_covered, d->tmpmask);
 +              break;
 +#endif
 +      default:
 +              break;
        }
 +}
  
 -      for (i = 0; i < nr_node_ids; i++) {
 -              /* Set up node groups */
 -              struct sched_group *sg, *prev;
 -              int j;
 -
 -              cpumask_clear(covered);
 -              cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
 -              if (cpumask_empty(nodemask)) {
 -                      sched_group_nodes[i] = NULL;
 -                      continue;
 -              }
 +/*
 + * Build sched domains for a given set of cpus and attach the sched domains
 + * to the individual cpus
 + */
 +static int __build_sched_domains(const struct cpumask *cpu_map,
 +                               struct sched_domain_attr *attr)
 +{
 +      enum s_alloc alloc_state = sa_none;
 +      struct s_data d;
 +      struct sched_domain *sd;
 +      int i;
 +#ifdef CONFIG_NUMA
 +      d.sd_allnodes = 0;
 +#endif
  
 -              sched_domain_node_span(i, domainspan);
 -              cpumask_and(domainspan, domainspan, cpu_map);
 +      alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
 +      if (alloc_state != sa_rootdomain)
 +              goto error;
 +      alloc_state = sa_sched_groups;
  
 -              sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
 -                                GFP_KERNEL, i);
 -              if (!sg) {
 -                      printk(KERN_WARNING "Can not alloc domain group for "
 -                              "node %d\n", i);
 -                      goto error;
 -              }
 -              sched_group_nodes[i] = sg;
 -              for_each_cpu(j, nodemask) {
 -                      struct sched_domain *sd;
 +      /*
 +       * Set up domains for cpus specified by the cpu_map.
 +       */
 +      for_each_cpu(i, cpu_map) {
 +              cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
 +                          cpu_map);
  
 -                      sd = &per_cpu(node_domains, j).sd;
 -                      sd->groups = sg;
 -              }
 -              sg->__cpu_power = 0;
 -              cpumask_copy(sched_group_cpus(sg), nodemask);
 -              sg->next = sg;
 -              cpumask_or(covered, covered, nodemask);
 -              prev = sg;
 +              sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
 +              sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
 +              sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
 +              sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
 +      }
  
 -              for (j = 0; j < nr_node_ids; j++) {
 -                      int n = (i + j) % nr_node_ids;
 +      for_each_cpu(i, cpu_map) {
 +              build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
 +              build_sched_groups(&d, SD_LV_MC, cpu_map, i);
 +      }
  
 -                      cpumask_complement(notcovered, covered);
 -                      cpumask_and(tmpmask, notcovered, cpu_map);
 -                      cpumask_and(tmpmask, tmpmask, domainspan);
 -                      if (cpumask_empty(tmpmask))
 -                              break;
 +      /* Set up physical groups */
 +      for (i = 0; i < nr_node_ids; i++)
 +              build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
  
 -                      cpumask_and(tmpmask, tmpmask, cpumask_of_node(n));
 -                      if (cpumask_empty(tmpmask))
 -                              continue;
 +#ifdef CONFIG_NUMA
 +      /* Set up node groups */
 +      if (d.sd_allnodes)
 +              build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
  
 -                      sg = kmalloc_node(sizeof(struct sched_group) +
 -                                        cpumask_size(),
 -                                        GFP_KERNEL, i);
 -                      if (!sg) {
 -                              printk(KERN_WARNING
 -                              "Can not alloc domain group for node %d\n", j);
 -                              goto error;
 -                      }
 -                      sg->__cpu_power = 0;
 -                      cpumask_copy(sched_group_cpus(sg), tmpmask);
 -                      sg->next = prev->next;
 -                      cpumask_or(covered, covered, tmpmask);
 -                      prev->next = sg;
 -                      prev = sg;
 -              }
 -      }
 +      for (i = 0; i < nr_node_ids; i++)
 +              if (build_numa_sched_groups(&d, cpu_map, i))
 +                      goto error;
  #endif
  
        /* Calculate CPU power for physical packages and nodes */
  #ifdef CONFIG_SCHED_SMT
        for_each_cpu(i, cpu_map) {
 -              struct sched_domain *sd = &per_cpu(cpu_domains, i).sd;
 -
 +              sd = &per_cpu(cpu_domains, i).sd;
                init_sched_groups_power(i, sd);
        }
  #endif
  #ifdef CONFIG_SCHED_MC
        for_each_cpu(i, cpu_map) {
 -              struct sched_domain *sd = &per_cpu(core_domains, i).sd;
 -
 +              sd = &per_cpu(core_domains, i).sd;
                init_sched_groups_power(i, sd);
        }
  #endif
  
        for_each_cpu(i, cpu_map) {
 -              struct sched_domain *sd = &per_cpu(phys_domains, i).sd;
 -
 +              sd = &per_cpu(phys_domains, i).sd;
                init_sched_groups_power(i, sd);
        }
  
  #ifdef CONFIG_NUMA
        for (i = 0; i < nr_node_ids; i++)
 -              init_numa_sched_groups_power(sched_group_nodes[i]);
 +              init_numa_sched_groups_power(d.sched_group_nodes[i]);
  
 -      if (sd_allnodes) {
 +      if (d.sd_allnodes) {
                struct sched_group *sg;
  
                cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
 -                                                              tmpmask);
 +                                                              d.tmpmask);
                init_numa_sched_groups_power(sg);
        }
  #endif
  
        /* Attach the domains */
        for_each_cpu(i, cpu_map) {
 -              struct sched_domain *sd;
  #ifdef CONFIG_SCHED_SMT
                sd = &per_cpu(cpu_domains, i).sd;
  #elif defined(CONFIG_SCHED_MC)
  #else
                sd = &per_cpu(phys_domains, i).sd;
  #endif
 -              cpu_attach_domain(sd, rd, i);
 +              cpu_attach_domain(sd, d.rd, i);
        }
  
 -      err = 0;
 -
 -free_tmpmask:
 -      free_cpumask_var(tmpmask);
 -free_send_covered:
 -      free_cpumask_var(send_covered);
 -free_this_core_map:
 -      free_cpumask_var(this_core_map);
 -free_this_sibling_map:
 -      free_cpumask_var(this_sibling_map);
 -free_nodemask:
 -      free_cpumask_var(nodemask);
 -free_notcovered:
 -#ifdef CONFIG_NUMA
 -      free_cpumask_var(notcovered);
 -free_covered:
 -      free_cpumask_var(covered);
 -free_domainspan:
 -      free_cpumask_var(domainspan);
 -out:
 -#endif
 -      return err;
 -
 -free_sched_groups:
 -#ifdef CONFIG_NUMA
 -      kfree(sched_group_nodes);
 -#endif
 -      goto free_tmpmask;
 +      d.sched_group_nodes = NULL; /* don't free this we still need it */
 +      __free_domain_allocs(&d, sa_tmpmask, cpu_map);
 +      return 0;
  
 -#ifdef CONFIG_NUMA
  error:
 -      free_sched_groups(cpu_map, tmpmask);
 -      free_rootdomain(rd);
 -      goto free_tmpmask;
 -#endif
 +      __free_domain_allocs(&d, alloc_state, cpu_map);
 +      return -ENOMEM;
  }
  
  static int build_sched_domains(const struct cpumask *cpu_map)
@@@ -9618,11 -9304,11 +9618,11 @@@ void __init sched_init(void
                 * system cpu resource, based on the weight assigned to root
                 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
                 * by letting tasks of init_task_group sit in a separate cfs_rq
 -               * (init_cfs_rq) and having one entity represent this group of
 +               * (init_tg_cfs_rq) and having one entity represent this group of
                 * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
                 */
                init_tg_cfs_entry(&init_task_group,
 -                              &per_cpu(init_cfs_rq, i),
 +                              &per_cpu(init_tg_cfs_rq, i),
                                &per_cpu(init_sched_entity, i), i, 1,
                                root_task_group.se[i]);
  
  #ifdef CONFIG_SMP
                rq->sd = NULL;
                rq->rd = NULL;
 +              rq->post_schedule = 0;
                rq->active_balance = 0;
                rq->next_balance = jiffies;
                rq->push_cpu = 0;
  }
  
  #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
 -void __might_sleep(char *file, int line)
 +static inline int preempt_count_equals(int preempt_offset)
 +{
 +      int nested = preempt_count() & ~PREEMPT_ACTIVE;
 +
 +      return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
 +}
 +
 +void __might_sleep(char *file, int line, int preempt_offset)
  {
  #ifdef in_atomic
        static unsigned long prev_jiffy;        /* ratelimiting */
  
 -      if ((!in_atomic() && !irqs_disabled()) ||
 -                  system_state != SYSTEM_RUNNING || oops_in_progress)
 +      if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
 +          system_state != SYSTEM_RUNNING || oops_in_progress)
                return;
        if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
                return;
@@@ -10903,113 -10581,3 +10903,113 @@@ struct cgroup_subsys cpuacct_subsys = 
        .subsys_id = cpuacct_subsys_id,
  };
  #endif        /* CONFIG_CGROUP_CPUACCT */
 +
 +#ifndef CONFIG_SMP
 +
 +int rcu_expedited_torture_stats(char *page)
 +{
 +      return 0;
 +}
 +EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
 +
 +void synchronize_sched_expedited(void)
 +{
 +}
 +EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
 +
 +#else /* #ifndef CONFIG_SMP */
 +
 +static DEFINE_PER_CPU(struct migration_req, rcu_migration_req);
 +static DEFINE_MUTEX(rcu_sched_expedited_mutex);
 +
 +#define RCU_EXPEDITED_STATE_POST -2
 +#define RCU_EXPEDITED_STATE_IDLE -1
 +
 +static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
 +
 +int rcu_expedited_torture_stats(char *page)
 +{
 +      int cnt = 0;
 +      int cpu;
 +
 +      cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state);
 +      for_each_online_cpu(cpu) {
 +               cnt += sprintf(&page[cnt], " %d:%d",
 +                              cpu, per_cpu(rcu_migration_req, cpu).dest_cpu);
 +      }
 +      cnt += sprintf(&page[cnt], "\n");
 +      return cnt;
 +}
 +EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
 +
 +static long synchronize_sched_expedited_count;
 +
 +/*
 + * Wait for an rcu-sched grace period to elapse, but use a "big hammer"
 + * approach to force the grace period to end quickly.  This consumes
 + * significant time on all CPUs, and is thus not recommended for
 + * any sort of common-case code.
 + *
 + * Note that it is illegal to call this function while holding any
 + * lock that is acquired by a CPU-hotplug notifier.  Failing to
 + * observe this restriction will result in deadlock.
 + */
 +void synchronize_sched_expedited(void)
 +{
 +      int cpu;
 +      unsigned long flags;
 +      bool need_full_sync = 0;
 +      struct rq *rq;
 +      struct migration_req *req;
 +      long snap;
 +      int trycount = 0;
 +
 +      smp_mb();  /* ensure prior mod happens before capturing snap. */
 +      snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1;
 +      get_online_cpus();
 +      while (!mutex_trylock(&rcu_sched_expedited_mutex)) {
 +              put_online_cpus();
 +              if (trycount++ < 10)
 +                      udelay(trycount * num_online_cpus());
 +              else {
 +                      synchronize_sched();
 +                      return;
 +              }
 +              if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) {
 +                      smp_mb(); /* ensure test happens before caller kfree */
 +                      return;
 +              }
 +              get_online_cpus();
 +      }
 +      rcu_expedited_state = RCU_EXPEDITED_STATE_POST;
 +      for_each_online_cpu(cpu) {
 +              rq = cpu_rq(cpu);
 +              req = &per_cpu(rcu_migration_req, cpu);
 +              init_completion(&req->done);
 +              req->task = NULL;
 +              req->dest_cpu = RCU_MIGRATION_NEED_QS;
 +              spin_lock_irqsave(&rq->lock, flags);
 +              list_add(&req->list, &rq->migration_queue);
 +              spin_unlock_irqrestore(&rq->lock, flags);
 +              wake_up_process(rq->migration_thread);
 +      }
 +      for_each_online_cpu(cpu) {
 +              rcu_expedited_state = cpu;
 +              req = &per_cpu(rcu_migration_req, cpu);
 +              rq = cpu_rq(cpu);
 +              wait_for_completion(&req->done);
 +              spin_lock_irqsave(&rq->lock, flags);
 +              if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
 +                      need_full_sync = 1;
 +              req->dest_cpu = RCU_MIGRATION_IDLE;
 +              spin_unlock_irqrestore(&rq->lock, flags);
 +      }
 +      rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
 +      mutex_unlock(&rcu_sched_expedited_mutex);
 +      put_online_cpus();
 +      if (need_full_sync)
 +              synchronize_sched();
 +}
 +EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
 +
 +#endif /* #else #ifndef CONFIG_SMP */
diff --combined kernel/trace/trace_events.c
@@@ -17,8 -17,6 +17,8 @@@
  #include <linux/ctype.h>
  #include <linux/delay.h>
  
 +#include <asm/setup.h>
 +
  #include "trace_output.h"
  
  #define TRACE_SYSTEM "TRACE_SYSTEM"
@@@ -27,9 -25,8 +27,9 @@@ DEFINE_MUTEX(event_mutex)
  
  LIST_HEAD(ftrace_events);
  
 -int trace_define_field(struct ftrace_event_call *call, char *type,
 -                     char *name, int offset, int size, int is_signed)
 +int trace_define_field(struct ftrace_event_call *call, const char *type,
 +                     const char *name, int offset, int size, int is_signed,
 +                     int filter_type)
  {
        struct ftrace_event_field *field;
  
        if (!field->type)
                goto err;
  
 +      if (filter_type == FILTER_OTHER)
 +              field->filter_type = filter_assign_type(type);
 +      else
 +              field->filter_type = filter_type;
 +
        field->offset = offset;
        field->size = size;
        field->is_signed = is_signed;
 +
        list_add(&field->link, &call->fields);
  
        return 0;
@@@ -69,29 -60,6 +69,29 @@@ err
  }
  EXPORT_SYMBOL_GPL(trace_define_field);
  
 +#define __common_field(type, item)                                    \
 +      ret = trace_define_field(call, #type, "common_" #item,          \
 +                               offsetof(typeof(ent), item),           \
 +                               sizeof(ent.item),                      \
 +                               is_signed_type(type), FILTER_OTHER);   \
 +      if (ret)                                                        \
 +              return ret;
 +
 +int trace_define_common_fields(struct ftrace_event_call *call)
 +{
 +      int ret;
 +      struct trace_entry ent;
 +
 +      __common_field(unsigned short, type);
 +      __common_field(unsigned char, flags);
 +      __common_field(unsigned char, preempt_count);
 +      __common_field(int, pid);
 +      __common_field(int, tgid);
 +
 +      return ret;
 +}
 +EXPORT_SYMBOL_GPL(trace_define_common_fields);
 +
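The __common_field() helper above records each struct trace_entry member's offset, size and signedness via offsetof() and is_signed_type(); the latter is presumably the usual ((type)-1 < 0) trick. A standalone illustration with a hypothetical struct (not the kernel's definitions):

#include <stdio.h>
#include <stddef.h>

/* Assumed definition; the kernel macro relies on the same idea. */
#define is_signed_type(type)    (((type)(-1)) < (type)0)

struct entry {
    unsigned short type;
    unsigned char  flags;
    int            pid;
};

#define show_field(st, type, item)                                  \
    printf(#item ": offset=%zu size=%zu signed=%d\n",               \
           offsetof(st, item), sizeof(((st *)0)->item),             \
           is_signed_type(type))

int main(void)
{
    show_field(struct entry, unsigned short, type);
    show_field(struct entry, unsigned char, flags);
    show_field(struct entry, int, pid);
    return 0;
}
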
  #ifdef CONFIG_MODULES
  
  static void trace_destroy_fields(struct ftrace_event_call *call)
@@@ -116,14 -84,14 +116,14 @@@ static void ftrace_event_enable_disable
                if (call->enabled) {
                        call->enabled = 0;
                        tracing_stop_cmdline_record();
 -                      call->unregfunc();
 +                      call->unregfunc(call->data);
                }
                break;
        case 1:
                if (!call->enabled) {
                        call->enabled = 1;
                        tracing_start_cmdline_record();
 -                      call->regfunc();
 +                      call->regfunc(call->data);
                }
                break;
        }
@@@ -606,7 -574,7 +606,7 @@@ event_format_read(struct file *filp, ch
        trace_seq_printf(s, "format:\n");
        trace_write_header(s);
  
 -      r = call->show_format(s);
 +      r = call->show_format(call, s);
        if (!r) {
                /*
                 * ug!  The format output is bigger than a PAGE!!
@@@ -881,10 -849,8 +881,10 @@@ event_subsystem_dir(const char *name, s
  
        /* First see if we did not already create this dir */
        list_for_each_entry(system, &event_subsystems, list) {
 -              if (strcmp(system->name, name) == 0)
 +              if (strcmp(system->name, name) == 0) {
 +                      system->nr_events++;
                        return system->entry;
 +              }
        }
  
        /* need to create new entry */
                return d_events;
        }
  
 +      system->nr_events = 1;
        system->name = kstrdup(name, GFP_KERNEL);
        if (!system->name) {
                debugfs_remove(system->entry);
@@@ -955,6 -920,15 +955,6 @@@ event_create_dir(struct ftrace_event_ca
        if (strcmp(call->system, TRACE_SYSTEM) != 0)
                d_events = event_subsystem_dir(call->system, d_events);
  
 -      if (call->raw_init) {
 -              ret = call->raw_init();
 -              if (ret < 0) {
 -                      pr_warning("Could not initialize trace point"
 -                                 " events/%s\n", call->name);
 -                      return ret;
 -              }
 -      }
 -
        call->dir = debugfs_create_dir(call->name, d_events);
        if (!call->dir) {
                pr_warning("Could not create debugfs "
                                          id);
  
        if (call->define_fields) {
 -              ret = call->define_fields();
 +              ret = call->define_fields(call);
                if (ret < 0) {
                        pr_warning("Could not initialize trace point"
                                   " events/%s\n", call->name);
@@@ -1013,32 -987,6 +1013,32 @@@ struct ftrace_module_file_ops 
        struct file_operations          filter;
  };
  
 +static void remove_subsystem_dir(const char *name)
 +{
 +      struct event_subsystem *system;
 +
 +      if (strcmp(name, TRACE_SYSTEM) == 0)
 +              return;
 +
 +      list_for_each_entry(system, &event_subsystems, list) {
 +              if (strcmp(system->name, name) == 0) {
 +                      if (!--system->nr_events) {
 +                              struct event_filter *filter = system->filter;
 +
 +                              debugfs_remove_recursive(system->entry);
 +                              list_del(&system->list);
 +                              if (filter) {
 +                                      kfree(filter->filter_string);
 +                                      kfree(filter);
 +                              }
 +                              kfree(system->name);
 +                              kfree(system);
 +                      }
 +                      break;
 +              }
 +      }
 +}
 +
  static struct ftrace_module_file_ops *
  trace_create_file_ops(struct module *mod)
  {
@@@ -1079,7 -1027,6 +1079,7 @@@ static void trace_module_add_events(str
        struct ftrace_module_file_ops *file_ops = NULL;
        struct ftrace_event_call *call, *start, *end;
        struct dentry *d_events;
 +      int ret;
  
        start = mod->trace_events;
        end = mod->trace_events + mod->num_trace_events;
                /* The linker may leave blanks */
                if (!call->name)
                        continue;
 -
 +              if (call->raw_init) {
 +                      ret = call->raw_init();
 +                      if (ret < 0) {
 +                              if (ret != -ENOSYS)
 +                                      pr_warning("Could not initialize trace "
 +                                      "point events/%s\n", call->name);
 +                              continue;
 +                      }
 +              }
                /*
                 * This module has events, create file ops for this module
                 * if not already done.
@@@ -1138,7 -1077,6 +1138,7 @@@ static void trace_module_remove_events(
                        list_del(&call->list);
                        trace_destroy_fields(call);
                        destroy_preds(call);
 +                      remove_subsystem_dir(call->system);
                }
        }
  
@@@ -1195,18 -1133,6 +1195,18 @@@ struct notifier_block trace_module_nb 
  extern struct ftrace_event_call __start_ftrace_events[];
  extern struct ftrace_event_call __stop_ftrace_events[];
  
 +static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;
 +
 +static __init int setup_trace_event(char *str)
 +{
 +      strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE);
 +      ring_buffer_expanded = 1;
 +      tracing_selftest_disabled = 1;
 +
 +      return 1;
 +}
 +__setup("trace_event=", setup_trace_event);
 +
  static __init int event_trace_init(void)
  {
        struct ftrace_event_call *call;
        struct dentry *entry;
        struct dentry *d_events;
        int ret;
 +      char *buf = bootup_event_buf;
 +      char *token;
  
        d_tracer = tracing_init_dentry();
        if (!d_tracer)
                /* The linker may leave blanks */
                if (!call->name)
                        continue;
 +              if (call->raw_init) {
 +                      ret = call->raw_init();
 +                      if (ret < 0) {
 +                              if (ret != -ENOSYS)
 +                                      pr_warning("Could not initialize trace "
 +                                      "point events/%s\n", call->name);
 +                              continue;
 +                      }
 +              }
                list_add(&call->list, &ftrace_events);
                event_create_dir(call, d_events, &ftrace_event_id_fops,
                                 &ftrace_enable_fops, &ftrace_event_filter_fops,
                                 &ftrace_event_format_fops);
        }
  
 +      while (true) {
 +              token = strsep(&buf, ",");
 +
 +              if (!token)
 +                      break;
 +              if (!*token)
 +                      continue;
 +
 +              ret = ftrace_set_clr_event(token, 1);
 +              if (ret)
 +                      pr_warning("Failed to enable trace event: %s\n", token);
 +      }
 +
        ret = register_module_notifier(&trace_module_nb);
        if (ret)
                pr_warning("Failed to register trace events module notifier\n");
@@@ -1432,13 -1334,12 +1432,13 @@@ static __init void event_trace_self_tes
  
  #ifdef CONFIG_FUNCTION_TRACER
  
- static DEFINE_PER_CPU(atomic_t, test_event_disable);
+ static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable);
  
  static void
  function_test_events_call(unsigned long ip, unsigned long parent_ip)
  {
        struct ring_buffer_event *event;
 +      struct ring_buffer *buffer;
        struct ftrace_entry *entry;
        unsigned long flags;
        long disabled;
        pc = preempt_count();
        resched = ftrace_preempt_disable();
        cpu = raw_smp_processor_id();
-       disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu));
+       disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu));
  
        if (disabled != 1)
                goto out;
  
        local_save_flags(flags);
  
 -      event = trace_current_buffer_lock_reserve(TRACE_FN, sizeof(*entry),
 +      event = trace_current_buffer_lock_reserve(&buffer,
 +                                                TRACE_FN, sizeof(*entry),
                                                  flags, pc);
        if (!event)
                goto out;
        entry->ip                       = ip;
        entry->parent_ip                = parent_ip;
  
 -      trace_nowake_buffer_unlock_commit(event, flags, pc);
 +      trace_nowake_buffer_unlock_commit(buffer, event, flags, pc);
  
   out:
-       atomic_dec(&per_cpu(test_event_disable, cpu));
+       atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
        ftrace_preempt_enable(resched);
  }
  
@@@ -1492,10 -1392,10 +1492,10 @@@ static __init void event_trace_self_tes
  
  static __init int event_trace_self_tests_init(void)
  {
 -
 -      event_trace_self_tests();
 -
 -      event_trace_self_test_with_function();
 +      if (!tracing_selftest_disabled) {
 +              event_trace_self_tests();
 +              event_trace_self_test_with_function();
 +      }
  
        return 0;
  }
diff --combined lib/Kconfig.debug
@@@ -653,21 -653,6 +653,21 @@@ config DEBUG_NOTIFIER
          This is a relatively cheap check but if you care about maximum
          performance, say N.
  
 +config DEBUG_CREDENTIALS
 +      bool "Debug credential management"
 +      depends on DEBUG_KERNEL
 +      help
 +        Enable this to turn on some debug checking for credential
 +        management.  The additional code keeps track of the number of
 +        pointers from task_structs to any given cred struct, and checks to
 +        see that this number never exceeds the usage count of the cred
 +        struct.
 +
 +        Furthermore, if SELinux is enabled, this also checks that the
 +        security pointer in the cred struct is never seen to be invalid.
 +
 +        If unsure, say N.
 +
  #
  # Select this config option from the architecture Kconfig, if it
  # it is preferred to always offer frame pointers as a config
@@@ -740,7 -725,7 +740,7 @@@ config RCU_TORTURE_TEST_RUNNABL
  
  config RCU_CPU_STALL_DETECTOR
        bool "Check for stalled CPUs delaying RCU grace periods"
 -      depends on CLASSIC_RCU || TREE_RCU
 +      depends on TREE_RCU || TREE_PREEMPT_RCU
        default n
        help
          This option causes RCU to printk information on which
@@@ -805,6 -790,21 +805,21 @@@ config DEBUG_BLOCK_EXT_DEV
  
          Say N if you are unsure.
  
+ config DEBUG_FORCE_WEAK_PER_CPU
+       bool "Force weak per-cpu definitions"
+       depends on DEBUG_KERNEL
+       help
+         s390 and alpha require percpu variables in modules to be
+         defined weak to work around an addressing range issue which
+         puts the following two restrictions on percpu variable
+         definitions.
+         1. percpu symbols must be unique whether static or not
+         2. percpu variables can't be defined inside a function
+         To ensure that generic code follows the above rules, this
+         option forces all percpu variables to be defined as weak.
+ 
  config LKDTM
        tristate "Linux Kernel Dump Test Tool Module"
        depends on DEBUG_KERNEL
diff --combined mm/Makefile
@@@ -8,7 -8,7 +8,7 @@@ mmu-$(CONFIG_MMU)        := fremap.o highmem.
                           vmalloc.o
  
  obj-y                 := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 -                         maccess.o page_alloc.o page-writeback.o pdflush.o \
 +                         maccess.o page_alloc.o page-writeback.o \
                           readahead.o swap.o truncate.o vmscan.o shmem.o \
                           prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
                           page_isolation.o mm_init.o $(mmu-y)
@@@ -33,7 -33,7 +33,7 @@@ obj-$(CONFIG_FAILSLAB) += failslab.
  obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
  obj-$(CONFIG_FS_XIP) += filemap_xip.o
  obj-$(CONFIG_MIGRATION) += migrate.o
- ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
+ ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
  obj-$(CONFIG_SMP) += percpu.o
  else
  obj-$(CONFIG_SMP) += allocpercpu.o
diff --combined mm/page-writeback.c
  #include <linux/pagevec.h>
  
  /*
 - * The maximum number of pages to writeout in a single bdflush/kupdate
 - * operation.  We do this so we don't hold I_SYNC against an inode for
 - * enormous amounts of time, which would block a userspace task which has
 - * been forced to throttle against that inode.  Also, the code reevaluates
 - * the dirty each time it has written this many pages.
 - */
 -#define MAX_WRITEBACK_PAGES   1024
 -
 -/*
   * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
   * will look to see if it needs to force writeback or throttling.
   */
@@@ -108,6 -117,8 +108,6 @@@ EXPORT_SYMBOL(laptop_mode)
  /* End of sysctl-exported parameters */
  
  
 -static void background_writeout(unsigned long _min_pages);
 -
  /*
   * Scale the writeback cache size proportional to the relative writeout speeds.
   *
@@@ -309,13 -320,15 +309,13 @@@ static void task_dirty_limit(struct tas
  /*
   *
   */
 -static DEFINE_SPINLOCK(bdi_lock);
  static unsigned int bdi_min_ratio;
  
  int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
  {
        int ret = 0;
 -      unsigned long flags;
  
 -      spin_lock_irqsave(&bdi_lock, flags);
 +      spin_lock(&bdi_lock);
        if (min_ratio > bdi->max_ratio) {
                ret = -EINVAL;
        } else {
                        ret = -EINVAL;
                }
        }
 -      spin_unlock_irqrestore(&bdi_lock, flags);
 +      spin_unlock(&bdi_lock);
  
        return ret;
  }
  
  int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
  {
 -      unsigned long flags;
        int ret = 0;
  
        if (max_ratio > 100)
                return -EINVAL;
  
 -      spin_lock_irqsave(&bdi_lock, flags);
 +      spin_lock(&bdi_lock);
        if (bdi->min_ratio > max_ratio) {
                ret = -EINVAL;
        } else {
                bdi->max_ratio = max_ratio;
                bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
        }
 -      spin_unlock_irqrestore(&bdi_lock, flags);
 +      spin_unlock(&bdi_lock);
  
        return ret;
  }
@@@ -532,7 -546,7 +532,7 @@@ static void balance_dirty_pages(struct 
                 * up.
                 */
                if (bdi_nr_reclaimable > bdi_thresh) {
 -                      writeback_inodes(&wbc);
 +                      writeback_inodes_wbc(&wbc);
                        pages_written += write_chunk - wbc.nr_to_write;
                        get_dirty_limits(&background_thresh, &dirty_thresh,
                                       &bdi_thresh, bdi);
                if (pages_written >= write_chunk)
                        break;          /* We've done our duty */
  
 -              congestion_wait(BLK_RW_ASYNC, HZ/10);
 +              schedule_timeout(1);
        }
  
        if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
         * background_thresh, to keep the amount of dirty memory low.
         */
        if ((laptop_mode && pages_written) ||
 -                      (!laptop_mode && (global_page_state(NR_FILE_DIRTY)
 -                                        + global_page_state(NR_UNSTABLE_NFS)
 -                                        > background_thresh)))
 -              pdflush_operation(background_writeout, 0);
 +          (!laptop_mode && ((nr_writeback = global_page_state(NR_FILE_DIRTY)
 +                                        + global_page_state(NR_UNSTABLE_NFS))
 +                                        > background_thresh))) {
 +              struct writeback_control wbc = {
 +                      .bdi            = bdi,
 +                      .sync_mode      = WB_SYNC_NONE,
 +                      .nr_to_write    = nr_writeback,
 +              };
 +
 +
 +              bdi_start_writeback(&wbc);
 +      }
  }
  
  void set_page_dirty_balance(struct page *page, int page_mkwrite)
        }
  }
  
+ static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0;
  /**
   * balance_dirty_pages_ratelimited_nr - balance dirty memory state
   * @mapping: address_space which was dirtied
  void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
                                        unsigned long nr_pages_dirtied)
  {
-       static DEFINE_PER_CPU(unsigned long, ratelimits) = 0;
        unsigned long ratelimit;
        unsigned long *p;
  
         * tasks in balance_dirty_pages(). Period.
         */
        preempt_disable();
-       p =  &__get_cpu_var(ratelimits);
+       p =  &__get_cpu_var(bdp_ratelimits);
        *p += nr_pages_dirtied;
        if (unlikely(*p >= ratelimit)) {
                *p = 0;
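
The hunk above hoists the per-CPU ratelimit counter out of the function and gives it a file-scope, globally unique name (bdp_ratelimits). A minimal sketch of that per-CPU counter pattern, with hypothetical names (my_ratelimits, my_account) and assuming a kernel build context:

#include <linux/percpu.h>
#include <linux/preempt.h>

/* hypothetical file-scope per-CPU counter, mirroring bdp_ratelimits above */
static DEFINE_PER_CPU(unsigned long, my_ratelimits) = 0;

static void my_account(unsigned long nr_pages)
{
	unsigned long *p;

	preempt_disable();			/* stay on this CPU while touching its slot */
	p = &__get_cpu_var(my_ratelimits);
	*p += nr_pages;
	preempt_enable();
}
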
@@@ -675,35 -682,153 +676,35 @@@ void throttle_vm_writeout(gfp_t gfp_mas
          }
  }
  
 -/*
 - * writeback at least _min_pages, and keep writing until the amount of dirty
 - * memory is less than the background threshold, or until we're all clean.
 - */
 -static void background_writeout(unsigned long _min_pages)
 -{
 -      long min_pages = _min_pages;
 -      struct writeback_control wbc = {
 -              .bdi            = NULL,
 -              .sync_mode      = WB_SYNC_NONE,
 -              .older_than_this = NULL,
 -              .nr_to_write    = 0,
 -              .nonblocking    = 1,
 -              .range_cyclic   = 1,
 -      };
 -
 -      for ( ; ; ) {
 -              unsigned long background_thresh;
 -              unsigned long dirty_thresh;
 -
 -              get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
 -              if (global_page_state(NR_FILE_DIRTY) +
 -                      global_page_state(NR_UNSTABLE_NFS) < background_thresh
 -                              && min_pages <= 0)
 -                      break;
 -              wbc.more_io = 0;
 -              wbc.encountered_congestion = 0;
 -              wbc.nr_to_write = MAX_WRITEBACK_PAGES;
 -              wbc.pages_skipped = 0;
 -              writeback_inodes(&wbc);
 -              min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 -              if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
 -                      /* Wrote less than expected */
 -                      if (wbc.encountered_congestion || wbc.more_io)
 -                              congestion_wait(BLK_RW_ASYNC, HZ/10);
 -                      else
 -                              break;
 -              }
 -      }
 -}
 -
 -/*
 - * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
 - * the whole world.  Returns 0 if a pdflush thread was dispatched.  Returns
 - * -1 if all pdflush threads were busy.
 - */
 -int wakeup_pdflush(long nr_pages)
 -{
 -      if (nr_pages == 0)
 -              nr_pages = global_page_state(NR_FILE_DIRTY) +
 -                              global_page_state(NR_UNSTABLE_NFS);
 -      return pdflush_operation(background_writeout, nr_pages);
 -}
 -
 -static void wb_timer_fn(unsigned long unused);
  static void laptop_timer_fn(unsigned long unused);
  
 -static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0);
  static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
  
  /*
 - * Periodic writeback of "old" data.
 - *
 - * Define "old": the first time one of an inode's pages is dirtied, we mark the
 - * dirtying-time in the inode's address_space.  So this periodic writeback code
 - * just walks the superblock inode list, writing back any inodes which are
 - * older than a specific point in time.
 - *
 - * Try to run once per dirty_writeback_interval.  But if a writeback event
 - * takes longer than a dirty_writeback_interval interval, then leave a
 - * one-second gap.
 - *
 - * older_than_this takes precedence over nr_to_write.  So we'll only write back
 - * all dirty pages if they are all attached to "old" mappings.
 - */
 -static void wb_kupdate(unsigned long arg)
 -{
 -      unsigned long oldest_jif;
 -      unsigned long start_jif;
 -      unsigned long next_jif;
 -      long nr_to_write;
 -      struct writeback_control wbc = {
 -              .bdi            = NULL,
 -              .sync_mode      = WB_SYNC_NONE,
 -              .older_than_this = &oldest_jif,
 -              .nr_to_write    = 0,
 -              .nonblocking    = 1,
 -              .for_kupdate    = 1,
 -              .range_cyclic   = 1,
 -      };
 -
 -      sync_supers();
 -
 -      oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10);
 -      start_jif = jiffies;
 -      next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10);
 -      nr_to_write = global_page_state(NR_FILE_DIRTY) +
 -                      global_page_state(NR_UNSTABLE_NFS) +
 -                      (inodes_stat.nr_inodes - inodes_stat.nr_unused);
 -      while (nr_to_write > 0) {
 -              wbc.more_io = 0;
 -              wbc.encountered_congestion = 0;
 -              wbc.nr_to_write = MAX_WRITEBACK_PAGES;
 -              writeback_inodes(&wbc);
 -              if (wbc.nr_to_write > 0) {
 -                      if (wbc.encountered_congestion || wbc.more_io)
 -                              congestion_wait(BLK_RW_ASYNC, HZ/10);
 -                      else
 -                              break;  /* All the old data is written */
 -              }
 -              nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 -      }
 -      if (time_before(next_jif, jiffies + HZ))
 -              next_jif = jiffies + HZ;
 -      if (dirty_writeback_interval)
 -              mod_timer(&wb_timer, next_jif);
 -}
 -
 -/*
   * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
   */
  int dirty_writeback_centisecs_handler(ctl_table *table, int write,
        struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
  {
        proc_dointvec(table, write, file, buffer, length, ppos);
 -      if (dirty_writeback_interval)
 -              mod_timer(&wb_timer, jiffies +
 -                      msecs_to_jiffies(dirty_writeback_interval * 10));
 -      else
 -              del_timer(&wb_timer);
        return 0;
  }
  
 -static void wb_timer_fn(unsigned long unused)
 +static void do_laptop_sync(struct work_struct *work)
  {
 -      if (pdflush_operation(wb_kupdate, 0) < 0)
 -              mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */
 -}
 -
 -static void laptop_flush(unsigned long unused)
 -{
 -      sys_sync();
 +      wakeup_flusher_threads(0);
 +      kfree(work);
  }
  
  static void laptop_timer_fn(unsigned long unused)
  {
 -      pdflush_operation(laptop_flush, 0);
 +      struct work_struct *work;
 +
 +      work = kmalloc(sizeof(*work), GFP_ATOMIC);
 +      if (work) {
 +              INIT_WORK(work, do_laptop_sync);
 +              schedule_work(work);
 +      }
  }
  
  /*
@@@ -786,6 -911,8 +787,6 @@@ void __init page_writeback_init(void
  {
        int shift;
  
 -      mod_timer(&wb_timer,
 -                jiffies + msecs_to_jiffies(dirty_writeback_interval * 10));
        writeback_set_ratelimit();
        register_cpu_notifier(&ratelimit_nb);
  
diff --combined mm/slub.c
+++ b/mm/slub.c
                                SLAB_POISON | SLAB_STORE_USER)
  
  /*
 + * Debugging flags that require metadata to be stored in the slab.  These get
 + * disabled when slub_debug=O is used and a cache's min order increases with
 + * metadata.
 + */
 +#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
 +
 +/*
   * Set of flags that will prevent slab merging
   */
  #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
@@@ -332,7 -325,6 +332,7 @@@ static int slub_debug
  #endif
  
  static char *slub_debug_slabs;
 +static int disable_higher_order_debug;
  
  /*
   * Object debugging
@@@ -654,7 -646,7 +654,7 @@@ static int slab_pad_check(struct kmem_c
        slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
        print_section("Padding", end - remainder, remainder);
  
 -      restore_bytes(s, "slab padding", POISON_INUSE, start, end);
 +      restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end);
        return 0;
  }
  
@@@ -984,15 -976,6 +984,15 @@@ static int __init setup_slub_debug(cha
                 */
                goto check_slabs;
  
 +      if (tolower(*str) == 'o') {
 +              /*
 +               * Avoid enabling debugging on caches if its minimum order
 +               * would increase as a result.
 +               */
 +              disable_higher_order_debug = 1;
 +              goto out;
 +      }
 +
        slub_debug = 0;
        if (*str == '-')
                /*
@@@ -1043,8 -1026,8 +1043,8 @@@ static unsigned long kmem_cache_flags(u
         * Enable debugging if selected on the kernel commandline.
         */
        if (slub_debug && (!slub_debug_slabs ||
 -          strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)) == 0))
 -                      flags |= slub_debug;
 +              !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs))))
 +              flags |= slub_debug;
  
        return flags;
  }
@@@ -1126,7 -1109,8 +1126,7 @@@ static struct page *allocate_slab(struc
        }
  
        if (kmemcheck_enabled
 -              && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS)))
 -      {
 +              && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
                int pages = 1 << oo_order(oo);
  
                kmemcheck_alloc_shadow(page, oo_order(oo), flags, node);
@@@ -1576,10 -1560,6 +1576,10 @@@ slab_out_of_memory(struct kmem_cache *s
                "default order: %d, min order: %d\n", s->name, s->objsize,
                s->size, oo_order(s->oo), oo_order(s->min));
  
 +      if (oo_order(s->min) > get_order(s->objsize))
 +              printk(KERN_WARNING "  %s debugging increased min order, use "
 +                     "slub_debug=O to disable.\n", s->name);
 +
        for_each_online_node(node) {
                struct kmem_cache_node *n = get_node(s, node);
                unsigned long nr_slabs;
@@@ -2021,7 -2001,7 +2021,7 @@@ static inline int calculate_order(int s
                                return order;
                        fraction /= 2;
                }
 -              min_objects --;
 +              min_objects--;
        }
  
        /*
@@@ -2111,8 -2091,8 +2111,8 @@@ init_kmem_cache_node(struct kmem_cache_
   */
  #define NR_KMEM_CACHE_CPU 100
  
- static DEFINE_PER_CPU(struct kmem_cache_cpu,
-                               kmem_cache_cpu)[NR_KMEM_CACHE_CPU];
+ static DEFINE_PER_CPU(struct kmem_cache_cpu [NR_KMEM_CACHE_CPU],
+                     kmem_cache_cpu);
  
  static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free);
  static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS);
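
The DEFINE_PER_CPU change above moves the array dimension into the type argument, the form the updated percpu definitions expect. A minimal sketch of defining and indexing such a per-CPU array, with hypothetical names (my_slot, my_slots, MY_SLOTS) and assuming a kernel build context:

#include <linux/percpu.h>

#define MY_SLOTS 4

struct my_slot {
	void *obj;
};

/* the array dimension goes in the type argument, not after the variable name */
static DEFINE_PER_CPU(struct my_slot [MY_SLOTS], my_slots);

static struct my_slot *my_slot_of(int cpu, int idx)
{
	return &per_cpu(my_slots, cpu)[idx];
}
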
@@@ -2420,7 -2400,6 +2420,7 @@@ static int calculate_sizes(struct kmem_
         * on bootup.
         */
        align = calculate_alignment(flags, align, s->objsize);
 +      s->align = align;
  
        /*
         * SLUB stores one object immediately after another beginning from
@@@ -2473,18 -2452,6 +2473,18 @@@ static int kmem_cache_open(struct kmem_
  
        if (!calculate_sizes(s, -1))
                goto error;
 +      if (disable_higher_order_debug) {
 +              /*
 +               * Disable debugging flags that store metadata if the min slab
 +               * order increased.
 +               */
 +              if (get_order(s->size) > get_order(s->objsize)) {
 +                      s->flags &= ~DEBUG_METADATA_FLAGS;
 +                      s->offset = 0;
 +                      if (!calculate_sizes(s, -1))
 +                              goto error;
 +              }
 +      }
  
        /*
         * The larger the object size is, the more pages we want on the partial
@@@ -2627,6 -2594,8 +2627,6 @@@ static inline int kmem_cache_close(stru
   */
  void kmem_cache_destroy(struct kmem_cache *s)
  {
 -      if (s->flags & SLAB_DESTROY_BY_RCU)
 -              rcu_barrier();
        down_write(&slub_lock);
        s->refcount--;
        if (!s->refcount) {
                                "still has objects.\n", s->name, __func__);
                        dump_stack();
                }
 +              if (s->flags & SLAB_DESTROY_BY_RCU)
 +                      rcu_barrier();
                sysfs_slab_remove(s);
        } else
                up_write(&slub_lock);
@@@ -2823,11 -2790,6 +2823,11 @@@ static s8 size_index[24] = 
        2       /* 192 */
  };
  
 +static inline int size_index_elem(size_t bytes)
 +{
 +      return (bytes - 1) / 8;
 +}
 +
  static struct kmem_cache *get_slab(size_t size, gfp_t flags)
  {
        int index;
                if (!size)
                        return ZERO_SIZE_PTR;
  
 -              index = size_index[(size - 1) / 8];
 +              index = size_index[size_index_elem(size)];
        } else
                index = fls(size - 1);
  
@@@ -3194,12 -3156,10 +3194,12 @@@ void __init kmem_cache_init(void
        slab_state = PARTIAL;
  
        /* Caches that are not of the two-to-the-power-of size */
 -      if (KMALLOC_MIN_SIZE <= 64) {
 +      if (KMALLOC_MIN_SIZE <= 32) {
                create_kmalloc_cache(&kmalloc_caches[1],
                                "kmalloc-96", 96, GFP_NOWAIT);
                caches++;
 +      }
 +      if (KMALLOC_MIN_SIZE <= 64) {
                create_kmalloc_cache(&kmalloc_caches[2],
                                "kmalloc-192", 192, GFP_NOWAIT);
                caches++;
        BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
                (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
  
 -      for (i = 8; i < KMALLOC_MIN_SIZE; i += 8)
 -              size_index[(i - 1) / 8] = KMALLOC_SHIFT_LOW;
 +      for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) {
 +              int elem = size_index_elem(i);
 +              if (elem >= ARRAY_SIZE(size_index))
 +                      break;
 +              size_index[elem] = KMALLOC_SHIFT_LOW;
 +      }
  
 -      if (KMALLOC_MIN_SIZE == 128) {
 +      if (KMALLOC_MIN_SIZE == 64) {
 +              /*
 +               * The 96 byte size cache is not used if the alignment
 +               * is 64 byte.
 +               */
 +              for (i = 64 + 8; i <= 96; i += 8)
 +                      size_index[size_index_elem(i)] = 7;
 +      } else if (KMALLOC_MIN_SIZE == 128) {
                /*
                 * The 192 byte sized cache is not used if the alignment
                 * is 128 byte. Redirect kmalloc to use the 256 byte cache
                 * instead.
                 */
                for (i = 128 + 8; i <= 192; i += 8)
 -                      size_index[(i - 1) / 8] = 8;
 +                      size_index[size_index_elem(i)] = 8;
        }
  
        slab_state = UP;
@@@ -4594,11 -4543,8 +4594,11 @@@ static int sysfs_slab_add(struct kmem_c
        }
  
        err = sysfs_create_group(&s->kobj, &slab_attr_group);
 -      if (err)
 +      if (err) {
 +              kobject_del(&s->kobj);
 +              kobject_put(&s->kobj);
                return err;
 +      }
        kobject_uevent(&s->kobj, KOBJ_ADD);
        if (!unmergeable) {
                /* Setup first alias */
@@@ -4780,7 -4726,7 +4780,7 @@@ static const struct file_operations pro
  
  static int __init slab_proc_init(void)
  {
 -      proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations);
 +      proc_create("slabinfo", S_IRUGO, NULL, &proc_slabinfo_operations);
        return 0;
  }
  module_init(slab_proc_init);
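
The net/rds hunks that follow replace open-coded ____cacheline_aligned annotations on per-CPU statistics with DEFINE_PER_CPU_SHARED_ALIGNED. A minimal sketch of that definition plus a typical summation loop over all CPUs, with hypothetical names (my_stats, my_stats_total) and assuming a kernel build context:

#include <linux/percpu.h>
#include <linux/cpumask.h>

struct my_stats {
	unsigned long s_events;
};

/* cacheline-aligned placement comes from the dedicated macro rather than a
 * trailing ____cacheline_aligned attribute */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct my_stats, my_stats);

static unsigned long my_stats_total(void)
{
	unsigned long sum = 0;
	int cpu;

	for_each_online_cpu(cpu)
		sum += per_cpu(my_stats, cpu).s_events;

	return sum;
}
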
diff --combined net/rds/ib_stats.c
@@@ -37,9 -37,9 +37,9 @@@
  #include "rds.h"
  #include "ib.h"
  
- DEFINE_PER_CPU(struct rds_ib_statistics, rds_ib_stats) ____cacheline_aligned;
+ DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_ib_statistics, rds_ib_stats);
  
 -static char *rds_ib_stat_names[] = {
 +static const char *const rds_ib_stat_names[] = {
        "ib_connect_raced",
        "ib_listen_closed_stale",
        "ib_tx_cq_call",
diff --combined net/rds/iw_stats.c
@@@ -37,9 -37,9 +37,9 @@@
  #include "rds.h"
  #include "iw.h"
  
- DEFINE_PER_CPU(struct rds_iw_statistics, rds_iw_stats) ____cacheline_aligned;
+ DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_iw_statistics, rds_iw_stats);
  
 -static char *rds_iw_stat_names[] = {
 +static const char *const rds_iw_stat_names[] = {
        "iw_connect_raced",
        "iw_listen_closed_stale",
        "iw_tx_cq_call",
diff --combined net/rds/page.c
@@@ -39,7 -39,7 +39,7 @@@ struct rds_page_remainder 
        unsigned long   r_offset;
  };
  
- DEFINE_PER_CPU(struct rds_page_remainder, rds_page_remainders) ____cacheline_aligned;
+ DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders);
  
  /*
   * returns 0 on success or -errno on failure.
@@@ -81,7 -81,6 +81,7 @@@ int rds_page_copy_user(struct page *pag
  
        return 0;
  }
 +EXPORT_SYMBOL_GPL(rds_page_copy_user);
  
  /*
   * Message allocation uses this to build up regions of a message.