Merge branch 'percpu-for-linus' into percpu-for-next
author     Tejun Heo <tj@kernel.org>
Fri, 14 Aug 2009 05:41:02 +0000 (14:41 +0900)
committer  Tejun Heo <tj@kernel.org>
Fri, 14 Aug 2009 05:45:31 +0000 (14:45 +0900)
Conflicts:
arch/sparc/kernel/smp_64.c
arch/x86/kernel/cpu/perf_counter.c
arch/x86/kernel/setup_percpu.c
drivers/cpufreq/cpufreq_ondemand.c
mm/percpu.c

Conflicts in the core and arch percpu code mostly come from commit
ed78e1e078dd44249f88b1dd8c76dafb39567161, which substituted nr_cpu_ids
for many uses of num_possible_cpus().  As the for-next branch has moved
all the first chunk allocators into mm/percpu.c, those changes are
applied in mm/percpu.c instead of the arch code.
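
As an illustration only (not a hunk taken from this merge), the typical
resolution pattern looks like the sketch below.  The helper function is
hypothetical; nr_cpu_ids, num_possible_cpus(), PFN_ALIGN() and
alloc_bootmem() are the existing kernel symbols involved.  Sizing that
used to scale with num_possible_cpus() now scales with nr_cpu_ids so
that a sparse possible-CPU map is covered:

  #include <linux/bootmem.h>    /* alloc_bootmem() */
  #include <linux/cpumask.h>    /* nr_cpu_ids, num_possible_cpus() */
  #include <linux/init.h>       /* __init */
  #include <linux/pfn.h>        /* PFN_ALIGN() */

  /*
   * Hypothetical helper showing the conflict resolution pattern: the
   * per-cpu pointer array is sized by nr_cpu_ids (highest possible CPU
   * number plus one) rather than num_possible_cpus() (count of possible
   * CPUs); the two differ when the possible CPU map is sparse.
   */
  static void ** __init example_alloc_cpu_ptrs(size_t *ptrs_sizep)
  {
          size_t ptrs_size;

          /* before: PFN_ALIGN(num_possible_cpus() * sizeof(void *)) */
          ptrs_size = PFN_ALIGN(nr_cpu_ids * sizeof(void *));
          *ptrs_sizep = ptrs_size;
          return alloc_bootmem(ptrs_size);
  }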

Signed-off-by: Tejun Heo <tj@kernel.org>
22 files changed:
Makefile
arch/mn10300/kernel/vmlinux.lds.S
arch/sparc/kernel/smp_64.c
arch/x86/Kconfig
arch/x86/kernel/cpu/mcheck/mce.c
arch/x86/kernel/cpu/perf_counter.c
arch/x86/kernel/setup_percpu.c
arch/x86/kernel/vmlinux.lds.S
arch/x86/mm/pageattr.c
block/cfq-iosched.c
drivers/cpufreq/cpufreq_conservative.c
drivers/cpufreq/cpufreq_ondemand.c
drivers/xen/events.c
include/asm-generic/vmlinux.lds.h
init/main.c
kernel/module.c
kernel/perf_counter.c
kernel/sched.c
kernel/trace/trace_events.c
mm/page-writeback.c
mm/percpu.c
mm/slub.c

diff --combined Makefile
+++ b/Makefile
@@@ -1,7 -1,7 +1,7 @@@
  VERSION = 2
  PATCHLEVEL = 6
  SUBLEVEL = 31
- EXTRAVERSION = -rc1
+ EXTRAVERSION = -rc6
  NAME = Man-Eating Seals of Antiquity
  
  # *DOCUMENTATION*
@@@ -140,15 -140,13 +140,13 @@@ _all: module
  endif
  
  srctree               := $(if $(KBUILD_SRC),$(KBUILD_SRC),$(CURDIR))
- TOPDIR                := $(srctree)
- # FIXME - TOPDIR is obsolete, use srctree/objtree
  objtree               := $(CURDIR)
  src           := $(srctree)
  obj           := $(objtree)
  
  VPATH         := $(srctree)$(if $(KBUILD_EXTMOD),:$(KBUILD_EXTMOD))
  
- export srctree objtree VPATH TOPDIR
+ export srctree objtree VPATH
  
  
  # SUBARCH tells the usermode build what the underlying arch is.  That is set
@@@ -327,7 -325,7 +325,7 @@@ CHECKFLAGS     := -D__linux__ -Dlinux -
  MODFLAGS      = -DMODULE
  CFLAGS_MODULE   = $(MODFLAGS)
  AFLAGS_MODULE   = $(MODFLAGS)
 -LDFLAGS_MODULE  =
 +LDFLAGS_MODULE  = -T $(srctree)/scripts/module-common.lds
  CFLAGS_KERNEL =
  AFLAGS_KERNEL =
  CFLAGS_GCOV   = -fprofile-arcs -ftest-coverage
@@@ -344,7 -342,9 +342,9 @@@ KBUILD_CPPFLAGS := -D__KERNEL_
  
  KBUILD_CFLAGS   := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \
                   -fno-strict-aliasing -fno-common \
-                  -Werror-implicit-function-declaration
+                  -Werror-implicit-function-declaration \
+                  -Wno-format-security \
+                  -fno-delete-null-pointer-checks
  KBUILD_AFLAGS   := -D__ASSEMBLY__
  
  # Read KERNELRELEASE from include/config/kernel.release (if it exists)
@@@ -566,7 -566,7 +566,7 @@@ KBUILD_CFLAGS += $(call cc-option,-Wdec
  KBUILD_CFLAGS += $(call cc-option,-Wno-pointer-sign,)
  
  # disable invalid "can't wrap" optimizations for signed / pointers
- KBUILD_CFLAGS += $(call cc-option,-fwrapv)
+ KBUILD_CFLAGS += $(call cc-option,-fno-strict-overflow)
  
  # revert to pre-gcc-4.4 behaviour of .eh_frame
  KBUILD_CFLAGS += $(call cc-option,-fno-dwarf2-cfi-asm)
@@@ -61,7 -61,7 +61,7 @@@ SECTION
        _edata = .;             /* End of data section */
    }
  
-   .data.init_task : { INIT_TASK(THREAD_SIZE); }
+   .data.init_task : { INIT_TASK_DATA(THREAD_SIZE); }
  
    /* might get freed after init */
    . = ALIGN(PAGE_SIZE);
    __init_end = .;
    /* freed after init ends here */
  
-   BSS(4)
+   BSS_SECTION(0, PAGE_SIZE, 4)
  
    _end = . ;
  
    . = ALIGN(PAGE_SIZE);
    pg0 = .;
  
 -  /* Sections to be discarded */
 -  /DISCARD/ : {
 -      EXIT_CALL
 -      }
 -
    STABS_DEBUG
  
    DWARF_DEBUG
 +
 +  /* Sections to be discarded */
 +  DISCARDS
  }
@@@ -1415,6 -1415,19 +1415,6 @@@ static void * __init pcpu_alloc_bootmem
  #endif
  }
  
 -static size_t pcpur_size __initdata;
 -static void **pcpur_ptrs __initdata;
 -
 -static struct page * __init pcpur_get_page(unsigned int cpu, int pageno)
 -{
 -      size_t off = (size_t)pageno << PAGE_SHIFT;
 -
 -      if (off >= pcpur_size)
 -              return NULL;
 -
 -      return virt_to_page(pcpur_ptrs[cpu] + off);
 -}
 -
  #define PCPU_CHUNK_SIZE (4UL * 1024UL * 1024UL)
  
  static void __init pcpu_map_range(unsigned long start, unsigned long end,
@@@ -1478,31 -1491,30 +1478,31 @@@ void __init setup_per_cpu_areas(void
        size_t dyn_size, static_size = __per_cpu_end - __per_cpu_start;
        static struct vm_struct vm;
        unsigned long delta, cpu;
 -      size_t pcpu_unit_size;
 +      size_t size_sum, pcpu_unit_size;
        size_t ptrs_size;
 +      void **ptrs;
  
 -      pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
 -                             PERCPU_DYNAMIC_RESERVE);
 -      dyn_size = pcpur_size - static_size - PERCPU_MODULE_RESERVE;
 +      size_sum = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
 +                           PERCPU_DYNAMIC_RESERVE);
 +      dyn_size = size_sum - static_size - PERCPU_MODULE_RESERVE;
  
  
-       ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(ptrs[0]));
 -      ptrs_size = PFN_ALIGN(nr_cpu_ids * sizeof(pcpur_ptrs[0]));
 -      pcpur_ptrs = alloc_bootmem(ptrs_size);
++      ptrs_size = PFN_ALIGN(nr_cpu_ids * sizeof(ptrs[0]));
 +      ptrs = alloc_bootmem(ptrs_size);
  
        for_each_possible_cpu(cpu) {
 -              pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PCPU_CHUNK_SIZE,
 -                                                   PCPU_CHUNK_SIZE);
 +              ptrs[cpu] = pcpu_alloc_bootmem(cpu, PCPU_CHUNK_SIZE,
 +                                             PCPU_CHUNK_SIZE);
  
 -              free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size),
 -                           PCPU_CHUNK_SIZE - pcpur_size);
 +              free_bootmem(__pa(ptrs[cpu] + size_sum),
 +                           PCPU_CHUNK_SIZE - size_sum);
  
 -              memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size);
 +              memcpy(ptrs[cpu], __per_cpu_load, static_size);
        }
  
        /* allocate address and map */
        vm.flags = VM_ALLOC;
-       vm.size = num_possible_cpus() * PCPU_CHUNK_SIZE;
+       vm.size = nr_cpu_ids * PCPU_CHUNK_SIZE;
        vm_area_register_early(&vm, PCPU_CHUNK_SIZE);
  
        for_each_possible_cpu(cpu) {
  
                start += cpu * PCPU_CHUNK_SIZE;
                end = start + PCPU_CHUNK_SIZE;
 -              pcpu_map_range(start, end, virt_to_page(pcpur_ptrs[cpu]));
 +              pcpu_map_range(start, end, virt_to_page(ptrs[cpu]));
        }
  
 -      pcpu_unit_size = pcpu_setup_first_chunk(pcpur_get_page, static_size,
 +      pcpu_unit_size = pcpu_setup_first_chunk(static_size,
                                                PERCPU_MODULE_RESERVE, dyn_size,
                                                PCPU_CHUNK_SIZE, vm.addr, NULL);
  
 -      free_bootmem(__pa(pcpur_ptrs), ptrs_size);
 +      free_bootmem(__pa(ptrs), ptrs_size);
  
        delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
        for_each_possible_cpu(cpu) {
diff --combined arch/x86/Kconfig
@@@ -24,6 -24,7 +24,7 @@@ config X8
        select HAVE_UNSTABLE_SCHED_CLOCK
        select HAVE_IDE
        select HAVE_OPROFILE
+       select HAVE_PERF_COUNTERS if (!M386 && !M486)
        select HAVE_IOREMAP_PROT
        select HAVE_KPROBES
        select ARCH_WANT_OPTIONAL_GPIOLIB
@@@ -149,6 -150,9 +150,6 @@@ config ARCH_HAS_CACHE_LINE_SIZ
  config HAVE_SETUP_PER_CPU_AREA
        def_bool y
  
 -config HAVE_DYNAMIC_PER_CPU_AREA
 -      def_bool y
 -
  config HAVE_CPUMASK_OF_CPU_MAP
        def_bool X86_64_SMP
  
@@@ -739,7 -743,6 +740,6 @@@ config X86_UP_IOAPI
  config X86_LOCAL_APIC
        def_bool y
        depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC
-       select HAVE_PERF_COUNTERS if (!M386 && !M486)
  
  config X86_IO_APIC
        def_bool y
@@@ -1910,6 -1913,18 +1910,18 @@@ config DMAR_DEFAULT_O
          recommended you say N here while the DMAR code remains
          experimental.
  
+ config DMAR_BROKEN_GFX_WA
+       def_bool n
+       prompt "Workaround broken graphics drivers (going away soon)"
+       depends on DMAR
+       ---help---
+         Current Graphics drivers tend to use physical address
+         for DMA and avoid using DMA APIs. Setting this config
+         option permits the IOMMU driver to set a unity map for
+         all the OS-visible memory. Hence the driver can continue
+         to use physical addresses for DMA, at least until this
+         option is removed in the 2.6.32 kernel.
  config DMAR_FLOPPY_WA
        def_bool y
        depends on DMAR
@@@ -194,14 -194,14 +194,14 @@@ static void print_mce(struct mce *m
                       m->cs, m->ip);
                if (m->cs == __KERNEL_CS)
                        print_symbol("{%s}", m->ip);
-               printk("\n");
+               printk(KERN_CONT "\n");
        }
        printk(KERN_EMERG "TSC %llx ", m->tsc);
        if (m->addr)
-               printk("ADDR %llx ", m->addr);
+               printk(KERN_CONT "ADDR %llx ", m->addr);
        if (m->misc)
-               printk("MISC %llx ", m->misc);
-       printk("\n");
+               printk(KERN_CONT "MISC %llx ", m->misc);
+       printk(KERN_CONT "\n");
        printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
                        m->cpuvendor, m->cpuid, m->time, m->socketid,
                        m->apicid);
  
  static void print_mce_head(void)
  {
-       printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n");
+       printk(KERN_EMERG "\nHARDWARE ERROR\n");
  }
  
  static void print_mce_tail(void)
  {
        printk(KERN_EMERG "This is not a software problem!\n"
-              KERN_EMERG "Run through mcelog --ascii to decode and contact your hardware vendor\n");
+              "Run through mcelog --ascii to decode and contact your hardware vendor\n");
  }
  
  #define PANIC_TIMEOUT 5 /* 5 seconds */
@@@ -1091,7 -1091,7 +1091,7 @@@ void mce_log_therm_throt_event(__u64 st
   */
  static int check_interval = 5 * 60; /* 5 minutes */
  
 -static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
 +static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
  static DEFINE_PER_CPU(struct timer_list, mce_timer);
  
  static void mcheck_timer(unsigned long data)
         * Alert userspace if needed.  If we logged an MCE, reduce the
         * polling interval, otherwise increase the polling interval.
         */
 -      n = &__get_cpu_var(next_interval);
 +      n = &__get_cpu_var(mce_next_interval);
        if (mce_notify_irq())
                *n = max(*n/2, HZ/100);
        else
@@@ -1311,7 -1311,7 +1311,7 @@@ static void mce_cpu_features(struct cpu
  static void mce_init_timer(void)
  {
        struct timer_list *t = &__get_cpu_var(mce_timer);
 -      int *n = &__get_cpu_var(next_interval);
 +      int *n = &__get_cpu_var(mce_next_interval);
  
        if (mce_ignore_ce)
                return;
@@@ -1692,17 -1692,15 +1692,15 @@@ static ssize_t set_trigger(struct sys_d
                                const char *buf, size_t siz)
  {
        char *p;
-       int len;
  
        strncpy(mce_helper, buf, sizeof(mce_helper));
        mce_helper[sizeof(mce_helper)-1] = 0;
-       len = strlen(mce_helper);
        p = strchr(mce_helper, '\n');
  
-       if (*p)
+       if (p)
                *p = 0;
  
-       return len;
+       return strlen(mce_helper) + !!p;
  }
  
  static ssize_t set_ignore_ce(struct sys_device *s,
@@@ -1914,7 -1912,7 +1912,7 @@@ mce_cpu_callback(struct notifier_block 
        case CPU_DOWN_FAILED:
        case CPU_DOWN_FAILED_FROZEN:
                t->expires = round_jiffies(jiffies +
 -                                              __get_cpu_var(next_interval));
 +                                         __get_cpu_var(mce_next_interval));
                add_timer_on(t, cpu);
                smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
                break;
@@@ -55,6 -55,7 +55,7 @@@ struct x86_pmu 
        int             num_counters_fixed;
        int             counter_bits;
        u64             counter_mask;
+       int             apic;
        u64             max_period;
        u64             intel_ctrl;
  };
@@@ -66,6 -67,52 +67,52 @@@ static DEFINE_PER_CPU(struct cpu_hw_cou
  };
  
  /*
+  * Not sure about some of these
+  */
+ static const u64 p6_perfmon_event_map[] =
+ {
+   [PERF_COUNT_HW_CPU_CYCLES]          = 0x0079,
+   [PERF_COUNT_HW_INSTRUCTIONS]                = 0x00c0,
+   [PERF_COUNT_HW_CACHE_REFERENCES]    = 0x0f2e,
+   [PERF_COUNT_HW_CACHE_MISSES]                = 0x012e,
+   [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
+   [PERF_COUNT_HW_BRANCH_MISSES]               = 0x00c5,
+   [PERF_COUNT_HW_BUS_CYCLES]          = 0x0062,
+ };
+ static u64 p6_pmu_event_map(int event)
+ {
+       return p6_perfmon_event_map[event];
+ }
+ /*
+  * Counter setting that is specified not to count anything.
+  * We use this to effectively disable a counter.
+  *
+  * L2_RQSTS with 0 MESI unit mask.
+  */
+ #define P6_NOP_COUNTER                        0x0000002EULL
+ static u64 p6_pmu_raw_event(u64 event)
+ {
+ #define P6_EVNTSEL_EVENT_MASK         0x000000FFULL
+ #define P6_EVNTSEL_UNIT_MASK          0x0000FF00ULL
+ #define P6_EVNTSEL_EDGE_MASK          0x00040000ULL
+ #define P6_EVNTSEL_INV_MASK           0x00800000ULL
+ #define P6_EVNTSEL_COUNTER_MASK               0xFF000000ULL
+ #define P6_EVNTSEL_MASK                       \
+       (P6_EVNTSEL_EVENT_MASK |        \
+        P6_EVNTSEL_UNIT_MASK  |        \
+        P6_EVNTSEL_EDGE_MASK  |        \
+        P6_EVNTSEL_INV_MASK   |        \
+        P6_EVNTSEL_COUNTER_MASK)
+       return event & P6_EVNTSEL_MASK;
+ }
+ /*
   * Intel PerfMon v3. Used on Core2 and later.
   */
  static const u64 intel_perfmon_event_map[] =
@@@ -567,6 -614,7 +614,7 @@@ static DEFINE_MUTEX(pmc_reserve_mutex)
  
  static bool reserve_pmc_hardware(void)
  {
+ #ifdef CONFIG_X86_LOCAL_APIC
        int i;
  
        if (nmi_watchdog == NMI_LOCAL_APIC)
                if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
                        goto eventsel_fail;
        }
+ #endif
  
        return true;
  
+ #ifdef CONFIG_X86_LOCAL_APIC
  eventsel_fail:
        for (i--; i >= 0; i--)
                release_evntsel_nmi(x86_pmu.eventsel + i);
@@@ -598,10 -648,12 +648,12 @@@ perfctr_fail
                enable_lapic_nmi_watchdog();
  
        return false;
+ #endif
  }
  
  static void release_pmc_hardware(void)
  {
+ #ifdef CONFIG_X86_LOCAL_APIC
        int i;
  
        for (i = 0; i < x86_pmu.num_counters; i++) {
  
        if (nmi_watchdog == NMI_LOCAL_APIC)
                enable_lapic_nmi_watchdog();
+ #endif
  }
  
  static void hw_perf_counter_destroy(struct perf_counter *counter)
@@@ -666,6 -719,7 +719,7 @@@ static int __hw_perf_counter_init(struc
  {
        struct perf_counter_attr *attr = &counter->attr;
        struct hw_perf_counter *hwc = &counter->hw;
+       u64 config;
        int err;
  
        if (!x86_pmu_initialized())
                hwc->sample_period = x86_pmu.max_period;
                hwc->last_period = hwc->sample_period;
                atomic64_set(&hwc->period_left, hwc->sample_period);
+       } else {
+               /*
+                * If we have a PMU initialized but no APIC
+                * interrupts, we cannot sample hardware
+                * counters (user-space has to fall back and
+                * sample via a hrtimer based software counter):
+                */
+               if (!x86_pmu.apic)
+                       return -EOPNOTSUPP;
        }
  
        counter->destroy = hw_perf_counter_destroy;
  
        if (attr->config >= x86_pmu.max_events)
                return -EINVAL;
        /*
         * The generic map:
         */
-       hwc->config |= x86_pmu.event_map(attr->config);
+       config = x86_pmu.event_map(attr->config);
+       if (config == 0)
+               return -ENOENT;
+       if (config == -1LL)
+               return -EINVAL;
+       hwc->config |= config;
  
        return 0;
  }
  
+ static void p6_pmu_disable_all(void)
+ {
+       struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+       u64 val;
+       if (!cpuc->enabled)
+               return;
+       cpuc->enabled = 0;
+       barrier();
+       /* p6 only has one enable register */
+       rdmsrl(MSR_P6_EVNTSEL0, val);
+       val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
+       wrmsrl(MSR_P6_EVNTSEL0, val);
+ }
  static void intel_pmu_disable_all(void)
  {
        wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
@@@ -767,6 -856,23 +856,23 @@@ void hw_perf_disable(void
        return x86_pmu.disable_all();
  }
  
+ static void p6_pmu_enable_all(void)
+ {
+       struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+       unsigned long val;
+       if (cpuc->enabled)
+               return;
+       cpuc->enabled = 1;
+       barrier();
+       /* p6 only has one enable register */
+       rdmsrl(MSR_P6_EVNTSEL0, val);
+       val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+       wrmsrl(MSR_P6_EVNTSEL0, val);
+ }
  static void intel_pmu_enable_all(void)
  {
        wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
@@@ -784,13 -890,13 +890,13 @@@ static void amd_pmu_enable_all(void
        barrier();
  
        for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+               struct perf_counter *counter = cpuc->counters[idx];
                u64 val;
  
                if (!test_bit(idx, cpuc->active_mask))
                        continue;
-               rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
-               if (val & ARCH_PERFMON_EVENTSEL0_ENABLE)
-                       continue;
+               val = counter->hw.config;
                val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
                wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
        }
@@@ -819,16 -925,13 +925,13 @@@ static inline void intel_pmu_ack_status
  
  static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
  {
-       int err;
-       err = checking_wrmsrl(hwc->config_base + idx,
+       (void)checking_wrmsrl(hwc->config_base + idx,
                              hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
  }
  
  static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
  {
-       int err;
-       err = checking_wrmsrl(hwc->config_base + idx,
-                             hwc->config);
+       (void)checking_wrmsrl(hwc->config_base + idx, hwc->config);
  }
  
  static inline void
@@@ -836,13 -939,24 +939,24 @@@ intel_pmu_disable_fixed(struct hw_perf_
  {
        int idx = __idx - X86_PMC_IDX_FIXED;
        u64 ctrl_val, mask;
-       int err;
  
        mask = 0xfULL << (idx * 4);
  
        rdmsrl(hwc->config_base, ctrl_val);
        ctrl_val &= ~mask;
-       err = checking_wrmsrl(hwc->config_base, ctrl_val);
+       (void)checking_wrmsrl(hwc->config_base, ctrl_val);
+ }
+ static inline void
+ p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
+ {
+       struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+       u64 val = P6_NOP_COUNTER;
+       if (cpuc->enabled)
+               val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+       (void)checking_wrmsrl(hwc->config_base + idx, val);
  }
  
  static inline void
@@@ -862,7 -976,7 +976,7 @@@ amd_pmu_disable_counter(struct hw_perf_
        x86_pmu_disable_counter(hwc, idx);
  }
  
 -static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
 +static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
  
  /*
   * Set the next IRQ period, based on the hwc->period_left value.
@@@ -901,7 -1015,7 +1015,7 @@@ x86_perf_counter_set_period(struct perf
        if (left > x86_pmu.max_period)
                left = x86_pmu.max_period;
  
 -      per_cpu(prev_left[idx], smp_processor_id()) = left;
 +      per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
  
        /*
         * The hw counter starts counting from this counter offset,
@@@ -943,6 -1057,19 +1057,19 @@@ intel_pmu_enable_fixed(struct hw_perf_c
        err = checking_wrmsrl(hwc->config_base, ctrl_val);
  }
  
+ static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
+ {
+       struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+       u64 val;
+       val = hwc->config;
+       if (cpuc->enabled)
+               val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+       (void)checking_wrmsrl(hwc->config_base + idx, val);
+ }
  static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
  {
        if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
@@@ -959,8 -1086,6 +1086,6 @@@ static void amd_pmu_enable_counter(stru
  
        if (cpuc->enabled)
                x86_pmu_enable_counter(hwc, idx);
-       else
-               x86_pmu_disable_counter(hwc, idx);
  }
  
  static int
@@@ -1086,7 -1211,7 +1211,7 @@@ void perf_counter_print_debug(void
                rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
                rdmsrl(x86_pmu.perfctr  + idx, pmc_count);
  
 -              prev_left = per_cpu(prev_left[idx], cpu);
 +              prev_left = per_cpu(pmc_prev_left[idx], cpu);
  
                pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
                        cpu, idx, pmc_ctrl);
@@@ -1176,6 -1301,49 +1301,49 @@@ static void intel_pmu_reset(void
        local_irq_restore(flags);
  }
  
+ static int p6_pmu_handle_irq(struct pt_regs *regs)
+ {
+       struct perf_sample_data data;
+       struct cpu_hw_counters *cpuc;
+       struct perf_counter *counter;
+       struct hw_perf_counter *hwc;
+       int idx, handled = 0;
+       u64 val;
+       data.regs = regs;
+       data.addr = 0;
+       cpuc = &__get_cpu_var(cpu_hw_counters);
+       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+               if (!test_bit(idx, cpuc->active_mask))
+                       continue;
+               counter = cpuc->counters[idx];
+               hwc = &counter->hw;
+               val = x86_perf_counter_update(counter, hwc, idx);
+               if (val & (1ULL << (x86_pmu.counter_bits - 1)))
+                       continue;
+               /*
+                * counter overflow
+                */
+               handled         = 1;
+               data.period     = counter->hw.last_period;
+               if (!x86_perf_counter_set_period(counter, hwc, idx))
+                       continue;
+               if (perf_counter_overflow(counter, 1, &data))
+                       p6_pmu_disable_counter(hwc, idx);
+       }
+       if (handled)
+               inc_irq_stat(apic_perf_irqs);
+       return handled;
+ }
  
  /*
   * This handler is triggered by the local APIC, so the APIC IRQ handling
@@@ -1185,14 -1353,13 +1353,13 @@@ static int intel_pmu_handle_irq(struct 
  {
        struct perf_sample_data data;
        struct cpu_hw_counters *cpuc;
-       int bit, cpu, loops;
+       int bit, loops;
        u64 ack, status;
  
        data.regs = regs;
        data.addr = 0;
  
-       cpu = smp_processor_id();
-       cpuc = &per_cpu(cpu_hw_counters, cpu);
+       cpuc = &__get_cpu_var(cpu_hw_counters);
  
        perf_disable();
        status = intel_pmu_get_status();
@@@ -1249,14 -1416,13 +1416,13 @@@ static int amd_pmu_handle_irq(struct pt
        struct cpu_hw_counters *cpuc;
        struct perf_counter *counter;
        struct hw_perf_counter *hwc;
-       int cpu, idx, handled = 0;
+       int idx, handled = 0;
        u64 val;
  
        data.regs = regs;
        data.addr = 0;
  
-       cpu = smp_processor_id();
-       cpuc = &per_cpu(cpu_hw_counters, cpu);
+       cpuc = &__get_cpu_var(cpu_hw_counters);
  
        for (idx = 0; idx < x86_pmu.num_counters; idx++) {
                if (!test_bit(idx, cpuc->active_mask))
@@@ -1299,18 -1465,22 +1465,22 @@@ void smp_perf_pending_interrupt(struct 
  
  void set_perf_counter_pending(void)
  {
+ #ifdef CONFIG_X86_LOCAL_APIC
        apic->send_IPI_self(LOCAL_PENDING_VECTOR);
+ #endif
  }
  
  void perf_counters_lapic_init(void)
  {
-       if (!x86_pmu_initialized())
+ #ifdef CONFIG_X86_LOCAL_APIC
+       if (!x86_pmu.apic || !x86_pmu_initialized())
                return;
  
        /*
         * Always use NMI for PMU
         */
        apic_write(APIC_LVTPC, APIC_DM_NMI);
+ #endif
  }
  
  static int __kprobes
@@@ -1334,7 -1504,9 +1504,9 @@@ perf_counter_nmi_handler(struct notifie
  
        regs = args->regs;
  
+ #ifdef CONFIG_X86_LOCAL_APIC
        apic_write(APIC_LVTPC, APIC_DM_NMI);
+ #endif
        /*
         * Can't rely on the handled return value to say it was our NMI, two
         * counters could trigger 'simultaneously' raising two back-to-back NMIs.
@@@ -1353,6 -1525,33 +1525,33 @@@ static __read_mostly struct notifier_bl
        .priority               = 1
  };
  
+ static struct x86_pmu p6_pmu = {
+       .name                   = "p6",
+       .handle_irq             = p6_pmu_handle_irq,
+       .disable_all            = p6_pmu_disable_all,
+       .enable_all             = p6_pmu_enable_all,
+       .enable                 = p6_pmu_enable_counter,
+       .disable                = p6_pmu_disable_counter,
+       .eventsel               = MSR_P6_EVNTSEL0,
+       .perfctr                = MSR_P6_PERFCTR0,
+       .event_map              = p6_pmu_event_map,
+       .raw_event              = p6_pmu_raw_event,
+       .max_events             = ARRAY_SIZE(p6_perfmon_event_map),
+       .apic                   = 1,
+       .max_period             = (1ULL << 31) - 1,
+       .version                = 0,
+       .num_counters           = 2,
+       /*
+        * Counters have 40 bits implemented. However they are designed such
+        * that bits [32-39] are sign extensions of bit 31. As such the
+        * effective width of a counter for P6-like PMU is 32 bits only.
+        *
+        * See IA-32 Intel Architecture Software developer manual Vol 3B
+        */
+       .counter_bits           = 32,
+       .counter_mask           = (1ULL << 32) - 1,
+ };
  static struct x86_pmu intel_pmu = {
        .name                   = "Intel",
        .handle_irq             = intel_pmu_handle_irq,
        .event_map              = intel_pmu_event_map,
        .raw_event              = intel_pmu_raw_event,
        .max_events             = ARRAY_SIZE(intel_perfmon_event_map),
+       .apic                   = 1,
        /*
         * Intel PMCs cannot be accessed sanely above 32 bit width,
         * so we install an artificial 1<<31 period regardless of
@@@ -1388,10 -1588,43 +1588,43 @@@ static struct x86_pmu amd_pmu = 
        .num_counters           = 4,
        .counter_bits           = 48,
        .counter_mask           = (1ULL << 48) - 1,
+       .apic                   = 1,
        /* use highest bit to detect overflow */
        .max_period             = (1ULL << 47) - 1,
  };
  
+ static int p6_pmu_init(void)
+ {
+       switch (boot_cpu_data.x86_model) {
+       case 1:
+       case 3:  /* Pentium Pro */
+       case 5:
+       case 6:  /* Pentium II */
+       case 7:
+       case 8:
+       case 11: /* Pentium III */
+               break;
+       case 9:
+       case 13:
+               /* Pentium M */
+               break;
+       default:
+               pr_cont("unsupported p6 CPU model %d ",
+                       boot_cpu_data.x86_model);
+               return -ENODEV;
+       }
+       x86_pmu = p6_pmu;
+       if (!cpu_has_apic) {
+               pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
+               pr_info("no hardware sampling interrupt available.\n");
+               x86_pmu.apic = 0;
+       }
+       return 0;
+ }
  static int intel_pmu_init(void)
  {
        union cpuid10_edx edx;
        unsigned int ebx;
        int version;
  
-       if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+       if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
+               /* check for P6 processor family */
+          if (boot_cpu_data.x86 == 6) {
+               return p6_pmu_init();
+          } else {
                return -ENODEV;
+          }
+       }
  
        /*
         * Check whether the Architectural PerfMon supports
@@@ -1559,8 -1798,9 +1798,9 @@@ void callchain_store(struct perf_callch
                entry->ip[entry->nr++] = ip;
  }
  
 -static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
 -static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);
 +static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
 +static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
+ static DEFINE_PER_CPU(int, in_nmi_frame);
  
  
  static void
@@@ -1576,7 -1816,9 +1816,9 @@@ static void backtrace_warning(void *dat
  
  static int backtrace_stack(void *data, char *name)
  {
-       /* Process all stacks: */
+       per_cpu(in_nmi_frame, smp_processor_id()) =
+                       x86_is_stack_id(NMI_STACK, name);
        return 0;
  }
  
@@@ -1584,6 -1826,9 +1826,9 @@@ static void backtrace_address(void *dat
  {
        struct perf_callchain_entry *entry = data;
  
+       if (per_cpu(in_nmi_frame, smp_processor_id()))
+               return;
        if (reliable)
                callchain_store(entry, addr);
  }
@@@ -1707,9 -1952,9 +1952,9 @@@ struct perf_callchain_entry *perf_callc
        struct perf_callchain_entry *entry;
  
        if (in_nmi())
 -              entry = &__get_cpu_var(nmi_entry);
 +              entry = &__get_cpu_var(pmc_nmi_entry);
        else
 -              entry = &__get_cpu_var(irq_entry);
 +              entry = &__get_cpu_var(pmc_irq_entry);
  
        entry->nr = 0;
  
@@@ -124,51 -124,60 +124,51 @@@ static void * __init pcpu_alloc_bootmem
  }
  
  /*
 - * Large page remap allocator
 - *
 - * This allocator uses PMD page as unit.  A PMD page is allocated for
 - * each cpu and each is remapped into vmalloc area using PMD mapping.
 - * As PMD page is quite large, only part of it is used for the first
 - * chunk.  Unused part is returned to the bootmem allocator.
 - *
 - * So, the PMD pages are mapped twice - once to the physical mapping
 - * and to the vmalloc area for the first percpu chunk.  The double
 - * mapping does add one more PMD TLB entry pressure but still is much
 - * better than only using 4k mappings while still being NUMA friendly.
 + * Helpers for first chunk memory allocation
   */
 -#ifdef CONFIG_NEED_MULTIPLE_NODES
 -struct pcpul_ent {
 -      unsigned int    cpu;
 -      void            *ptr;
 -};
 +static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size)
 +{
 +      return pcpu_alloc_bootmem(cpu, size, size);
 +}
  
 -static size_t pcpul_size;
 -static struct pcpul_ent *pcpul_map;
 -static struct vm_struct pcpul_vm;
 +static void __init pcpu_fc_free(void *ptr, size_t size)
 +{
 +      free_bootmem(__pa(ptr), size);
 +}
  
 -static struct page * __init pcpul_get_page(unsigned int cpu, int pageno)
 +/*
 + * Large page remapping allocator
 + */
 +#ifdef CONFIG_NEED_MULTIPLE_NODES
 +static void __init pcpul_map(void *ptr, size_t size, void *addr)
  {
 -      size_t off = (size_t)pageno << PAGE_SHIFT;
 +      pmd_t *pmd, pmd_v;
  
 -      if (off >= pcpul_size)
 -              return NULL;
 +      pmd = populate_extra_pmd((unsigned long)addr);
 +      pmd_v = pfn_pmd(page_to_pfn(virt_to_page(ptr)), PAGE_KERNEL_LARGE);
 +      set_pmd(pmd, pmd_v);
 +}
  
 -      return virt_to_page(pcpul_map[cpu].ptr + off);
 +static int pcpu_lpage_cpu_distance(unsigned int from, unsigned int to)
 +{
 +      if (early_cpu_to_node(from) == early_cpu_to_node(to))
 +              return LOCAL_DISTANCE;
 +      else
 +              return REMOTE_DISTANCE;
  }
  
  static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
  {
 -      size_t map_size, dyn_size;
 -      unsigned int cpu;
 -      int i, j;
 +      size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
 +      size_t dyn_size = reserve - PERCPU_FIRST_CHUNK_RESERVE;
 +      size_t unit_map_size, unit_size;
 +      int *unit_map;
 +      int nr_units;
        ssize_t ret;
  
 -      if (!chosen) {
 -              size_t vm_size = VMALLOC_END - VMALLOC_START;
 -              size_t tot_size = nr_cpu_ids * PMD_SIZE;
 -
 -              /* on non-NUMA, embedding is better */
 -              if (!pcpu_need_numa())
 -                      return -EINVAL;
 -
 -              /* don't consume more than 20% of vmalloc area */
 -              if (tot_size > vm_size / 5) {
 -                      pr_info("PERCPU: too large chunk size %zuMB for "
 -                              "large page remap\n", tot_size >> 20);
 -                      return -EINVAL;
 -              }
 -      }
 +      /* on non-NUMA, embedding is better */
 +      if (!chosen && !pcpu_need_numa())
 +              return -EINVAL;
  
        /* need PSE */
        if (!cpu_has_pse) {
                return -EINVAL;
        }
  
 -      /*
 -       * Currently supports only single page.  Supporting multiple
 -       * pages won't be too difficult if it ever becomes necessary.
 -       */
 -      pcpul_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
 -                             PERCPU_DYNAMIC_RESERVE);
 -      if (pcpul_size > PMD_SIZE) {
 -              pr_warning("PERCPU: static data is larger than large page, "
 -                         "can't use large page\n");
 -              return -EINVAL;
 -      }
 -      dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
 -
 -      /* allocate pointer array and alloc large pages */
 -      map_size = PFN_ALIGN(nr_cpu_ids * sizeof(pcpul_map[0]));
 -      pcpul_map = alloc_bootmem(map_size);
 -
 -      for_each_possible_cpu(cpu) {
 -              pcpul_map[cpu].cpu = cpu;
 -              pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE,
 -                                                      PMD_SIZE);
 -              if (!pcpul_map[cpu].ptr) {
 -                      pr_warning("PERCPU: failed to allocate large page "
 -                                 "for cpu%u\n", cpu);
 -                      goto enomem;
 -              }
 -
 -              /*
 -               * Only use pcpul_size bytes and give back the rest.
 -               *
 -               * Ingo: The 2MB up-rounding bootmem is needed to make
 -               * sure the partial 2MB page is still fully RAM - it's
 -               * not well-specified to have a PAT-incompatible area
 -               * (unmapped RAM, device memory, etc.) in that hole.
 -               */
 -              free_bootmem(__pa(pcpul_map[cpu].ptr + pcpul_size),
 -                           PMD_SIZE - pcpul_size);
 -
 -              memcpy(pcpul_map[cpu].ptr, __per_cpu_load, static_size);
 +      /* allocate and build unit_map */
-       unit_map_size = num_possible_cpus() * sizeof(int);
++      unit_map_size = nr_cpu_ids * sizeof(int);
 +      unit_map = alloc_bootmem_nopanic(unit_map_size);
 +      if (!unit_map) {
 +              pr_warning("PERCPU: failed to allocate unit_map\n");
 +              return -ENOMEM;
        }
  
 -      /* allocate address and map */
 -      pcpul_vm.flags = VM_ALLOC;
 -      pcpul_vm.size = nr_cpu_ids * PMD_SIZE;
 -      vm_area_register_early(&pcpul_vm, PMD_SIZE);
 -
 -      for_each_possible_cpu(cpu) {
 -              pmd_t *pmd, pmd_v;
 -
 -              pmd = populate_extra_pmd((unsigned long)pcpul_vm.addr +
 -                                       cpu * PMD_SIZE);
 -              pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpul_map[cpu].ptr)),
 -                              PAGE_KERNEL_LARGE);
 -              set_pmd(pmd, pmd_v);
 +      ret = pcpu_lpage_build_unit_map(static_size,
 +                                      PERCPU_FIRST_CHUNK_RESERVE,
 +                                      &dyn_size, &unit_size, PMD_SIZE,
 +                                      unit_map, pcpu_lpage_cpu_distance);
 +      if (ret < 0) {
 +              pr_warning("PERCPU: failed to build unit_map\n");
 +              goto out_free;
        }
 +      nr_units = ret;
  
 -      /* we're ready, commit */
 -      pr_info("PERCPU: Remapped at %p with large pages, static data "
 -              "%zu bytes\n", pcpul_vm.addr, static_size);
 -
 -      ret = pcpu_setup_first_chunk(pcpul_get_page, static_size,
 -                                   PERCPU_FIRST_CHUNK_RESERVE, dyn_size,
 -                                   PMD_SIZE, pcpul_vm.addr, NULL);
 -
 -      /* sort pcpul_map array for pcpu_lpage_remapped() */
 -      for (i = 0; i < nr_cpu_ids - 1; i++)
 -              for (j = i + 1; j < nr_cpu_ids; j++)
 -                      if (pcpul_map[i].ptr > pcpul_map[j].ptr) {
 -                              struct pcpul_ent tmp = pcpul_map[i];
 -                              pcpul_map[i] = pcpul_map[j];
 -                              pcpul_map[j] = tmp;
 -                      }
 -
 -      return ret;
 -
 -enomem:
 -      for_each_possible_cpu(cpu)
 -              if (pcpul_map[cpu].ptr)
 -                      free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size);
 -      free_bootmem(__pa(pcpul_map), map_size);
 -      return -ENOMEM;
 -}
 +      /* do the parameters look okay? */
 +      if (!chosen) {
 +              size_t vm_size = VMALLOC_END - VMALLOC_START;
 +              size_t tot_size = nr_units * unit_size;
  
 -/**
 - * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area
 - * @kaddr: the kernel address in question
 - *
 - * Determine whether @kaddr falls in the pcpul recycled area.  This is
 - * used by pageattr to detect VM aliases and break up the pcpu PMD
 - * mapping such that the same physical page is not mapped under
 - * different attributes.
 - *
 - * The recycled area is always at the tail of a partially used PMD
 - * page.
 - *
 - * RETURNS:
 - * Address of corresponding remapped pcpu address if match is found;
 - * otherwise, NULL.
 - */
 -void *pcpu_lpage_remapped(void *kaddr)
 -{
 -      void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK);
 -      unsigned long offset = (unsigned long)kaddr & ~PMD_MASK;
 -      int left = 0, right = nr_cpu_ids - 1;
 -      int pos;
 -
 -      /* pcpul in use at all? */
 -      if (!pcpul_map)
 -              return NULL;
 -
 -      /* okay, perform binary search */
 -      while (left <= right) {
 -              pos = (left + right) / 2;
 -
 -              if (pcpul_map[pos].ptr < pmd_addr)
 -                      left = pos + 1;
 -              else if (pcpul_map[pos].ptr > pmd_addr)
 -                      right = pos - 1;
 -              else {
 -                      /* it shouldn't be in the area for the first chunk */
 -                      WARN_ON(offset < pcpul_size);
 -
 -                      return pcpul_vm.addr +
 -                              pcpul_map[pos].cpu * PMD_SIZE + offset;
 +              /* don't consume more than 20% of vmalloc area */
 +              if (tot_size > vm_size / 5) {
 +                      pr_info("PERCPU: too large chunk size %zuMB for "
 +                              "large page remap\n", tot_size >> 20);
 +                      ret = -EINVAL;
 +                      goto out_free;
                }
        }
  
 -      return NULL;
 +      ret = pcpu_lpage_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
 +                                   dyn_size, unit_size, PMD_SIZE,
 +                                   unit_map, nr_units,
 +                                   pcpu_fc_alloc, pcpu_fc_free, pcpul_map);
 +out_free:
 +      if (ret < 0)
 +              free_bootmem(__pa(unit_map), unit_map_size);
 +      return ret;
  }
  #else
  static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
@@@ -245,15 -342,26 +245,15 @@@ static ssize_t __init setup_pcpu_embed(
                return -EINVAL;
  
        return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
 -                                    reserve - PERCPU_FIRST_CHUNK_RESERVE, -1);
 +                                    reserve - PERCPU_FIRST_CHUNK_RESERVE);
  }
  
  /*
 - * 4k page allocator
 + * 4k allocator
   *
 - * This is the basic allocator.  Static percpu area is allocated
 - * page-by-page and most of initialization is done by the generic
 - * setup function.
 + * Boring fallback 4k allocator.  This allocator puts more pressure on
 + * PTE TLBs but other than that behaves nicely on both UMA and NUMA.
   */
 -static struct page **pcpu4k_pages __initdata;
 -static int pcpu4k_nr_static_pages __initdata;
 -
 -static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
 -{
 -      if (pageno < pcpu4k_nr_static_pages)
 -              return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno];
 -      return NULL;
 -}
 -
  static void __init pcpu4k_populate_pte(unsigned long addr)
  {
        populate_extra_pte(addr);
  
  static ssize_t __init setup_pcpu_4k(size_t static_size)
  {
 -      size_t pages_size;
 -      unsigned int cpu;
 -      int i, j;
 -      ssize_t ret;
 -
 -      pcpu4k_nr_static_pages = PFN_UP(static_size);
 -
 -      /* unaligned allocations can't be freed, round up to page size */
 -      pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * nr_cpu_ids
 -                             * sizeof(pcpu4k_pages[0]));
 -      pcpu4k_pages = alloc_bootmem(pages_size);
 -
 -      /* allocate and copy */
 -      j = 0;
 -      for_each_possible_cpu(cpu)
 -              for (i = 0; i < pcpu4k_nr_static_pages; i++) {
 -                      void *ptr;
 -
 -                      ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE);
 -                      if (!ptr) {
 -                              pr_warning("PERCPU: failed to allocate "
 -                                         "4k page for cpu%u\n", cpu);
 -                              goto enomem;
 -                      }
 -
 -                      memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE);
 -                      pcpu4k_pages[j++] = virt_to_page(ptr);
 -              }
 -
 -      /* we're ready, commit */
 -      pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
 -              pcpu4k_nr_static_pages, static_size);
 -
 -      ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size,
 -                                   PERCPU_FIRST_CHUNK_RESERVE, -1,
 -                                   -1, NULL, pcpu4k_populate_pte);
 -      goto out_free_ar;
 -
 -enomem:
 -      while (--j >= 0)
 -              free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE);
 -      ret = -ENOMEM;
 -out_free_ar:
 -      free_bootmem(__pa(pcpu4k_pages), pages_size);
 -      return ret;
 +      return pcpu_4k_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
 +                                 pcpu_fc_alloc, pcpu_fc_free,
 +                                 pcpu4k_populate_pte);
  }
  
  /* for explicit first chunk allocator selection */
@@@ -336,8 -486,7 +336,8 @@@ void __init setup_per_cpu_areas(void
        /* alrighty, percpu areas up and running */
        delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
        for_each_possible_cpu(cpu) {
 -              per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size;
 +              per_cpu_offset(cpu) =
 +                      delta + pcpu_unit_map[cpu] * pcpu_unit_size;
                per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
                per_cpu(cpu_number, cpu) = cpu;
                setup_percpu_segment(cpu);
@@@ -112,11 -112,6 +112,6 @@@ SECTION
                _sdata = .;
                DATA_DATA
                CONSTRUCTORS
- #ifdef CONFIG_X86_64
-               /* End of data section */
-               _edata = .;
- #endif
        } :data
  
  #ifdef CONFIG_X86_32
        .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
                *(.data.read_mostly)
  
- #ifdef CONFIG_X86_32
                /* End of data section */
                _edata = .;
- #endif
        }
  
  #ifdef CONFIG_X86_64
                _end = .;
        }
  
 -      /* Sections to be discarded */
 -      /DISCARD/ : {
 -              *(.exitcall.exit)
 -              *(.eh_frame)
 -              *(.discard)
 -      }
 -
          STABS_DEBUG
          DWARF_DEBUG
 +
 +      /* Sections to be discarded */
 +      DISCARDS
 +      /DISCARD/ : { *(.eh_frame) }
  }
  
  
  #ifdef CONFIG_X86_32
- ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
-         "kernel image bigger than KERNEL_IMAGE_SIZE")
+ . = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
+          "kernel image bigger than KERNEL_IMAGE_SIZE");
  #else
  /*
   * Per-cpu symbols which need to be offset from __per_cpu_load
@@@ -411,12 -407,12 +404,12 @@@ INIT_PER_CPU(irq_stack_union)
  /*
   * Build-time check on the image size:
   */
- ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
-       "kernel image bigger than KERNEL_IMAGE_SIZE")
+ . = ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
+          "kernel image bigger than KERNEL_IMAGE_SIZE");
  
  #ifdef CONFIG_SMP
- ASSERT((per_cpu__irq_stack_union == 0),
-         "irq_stack_union is not at start of per-cpu area");
+ . = ASSERT((per_cpu__irq_stack_union == 0),
+            "irq_stack_union is not at start of per-cpu area");
  #endif
  
  #endif /* CONFIG_X86_32 */
  #ifdef CONFIG_KEXEC
  #include <asm/kexec.h>
  
- ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
-        "kexec control code size is too big")
+ . = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
+            "kexec control code size is too big");
  #endif
  
diff --combined arch/x86/mm/pageattr.c
@@@ -12,7 -12,6 +12,7 @@@
  #include <linux/seq_file.h>
  #include <linux/debugfs.h>
  #include <linux/pfn.h>
 +#include <linux/percpu.h>
  
  #include <asm/e820.h>
  #include <asm/processor.h>
@@@ -592,9 -591,12 +592,12 @@@ static int __change_page_attr(struct cp
        unsigned int level;
        pte_t *kpte, old_pte;
  
-       if (cpa->flags & CPA_PAGES_ARRAY)
-               address = (unsigned long)page_address(cpa->pages[cpa->curpage]);
-       else if (cpa->flags & CPA_ARRAY)
+       if (cpa->flags & CPA_PAGES_ARRAY) {
+               struct page *page = cpa->pages[cpa->curpage];
+               if (unlikely(PageHighMem(page)))
+                       return 0;
+               address = (unsigned long)page_address(page);
+       } else if (cpa->flags & CPA_ARRAY)
                address = cpa->vaddr[cpa->curpage];
        else
                address = *cpa->vaddr;
@@@ -698,9 -700,12 +701,12 @@@ static int cpa_process_alias(struct cpa
         * No need to redo, when the primary call touched the direct
         * mapping already:
         */
-       if (cpa->flags & CPA_PAGES_ARRAY)
-               vaddr = (unsigned long)page_address(cpa->pages[cpa->curpage]);
-       else if (cpa->flags & CPA_ARRAY)
+       if (cpa->flags & CPA_PAGES_ARRAY) {
+               struct page *page = cpa->pages[cpa->curpage];
+               if (unlikely(PageHighMem(page)))
+                       return 0;
+               vaddr = (unsigned long)page_address(page);
+       } else if (cpa->flags & CPA_ARRAY)
                vaddr = cpa->vaddr[cpa->curpage];
        else
                vaddr = *cpa->vaddr;
@@@ -998,12 -1003,15 +1004,15 @@@ EXPORT_SYMBOL(set_memory_array_uc)
  int _set_memory_wc(unsigned long addr, int numpages)
  {
        int ret;
+       unsigned long addr_copy = addr;
        ret = change_page_attr_set(&addr, numpages,
                                    __pgprot(_PAGE_CACHE_UC_MINUS), 0);
        if (!ret) {
-               ret = change_page_attr_set(&addr, numpages,
-                                   __pgprot(_PAGE_CACHE_WC), 0);
+               ret = change_page_attr_set_clr(&addr_copy, numpages,
+                                              __pgprot(_PAGE_CACHE_WC),
+                                              __pgprot(_PAGE_CACHE_MASK),
+                                              0, 0, NULL);
        }
        return ret;
  }
@@@ -1120,7 -1128,9 +1129,9 @@@ int set_pages_array_uc(struct page **pa
        int free_idx;
  
        for (i = 0; i < addrinarray; i++) {
-               start = (unsigned long)page_address(pages[i]);
+               if (PageHighMem(pages[i]))
+                       continue;
+               start = page_to_pfn(pages[i]) << PAGE_SHIFT;
                end = start + PAGE_SIZE;
                if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL))
                        goto err_out;
  err_out:
        free_idx = i;
        for (i = 0; i < free_idx; i++) {
-               start = (unsigned long)page_address(pages[i]);
+               if (PageHighMem(pages[i]))
+                       continue;
+               start = page_to_pfn(pages[i]) << PAGE_SHIFT;
                end = start + PAGE_SIZE;
                free_memtype(start, end);
        }
@@@ -1162,7 -1174,9 +1175,9 @@@ int set_pages_array_wb(struct page **pa
                return retval;
  
        for (i = 0; i < addrinarray; i++) {
-               start = (unsigned long)page_address(pages[i]);
+               if (PageHighMem(pages[i]))
+                       continue;
+               start = page_to_pfn(pages[i]) << PAGE_SHIFT;
                end = start + PAGE_SIZE;
                free_memtype(start, end);
        }
diff --combined block/cfq-iosched.c
@@@ -48,7 -48,7 +48,7 @@@ static int cfq_slice_idle = HZ / 125
  static struct kmem_cache *cfq_pool;
  static struct kmem_cache *cfq_ioc_pool;
  
 -static DEFINE_PER_CPU(unsigned long, ioc_count);
 +static DEFINE_PER_CPU(unsigned long, cfq_ioc_count);
  static struct completion *ioc_gone;
  static DEFINE_SPINLOCK(ioc_gone_lock);
  
@@@ -1427,7 -1427,7 +1427,7 @@@ static void cfq_cic_free_rcu(struct rcu
        cic = container_of(head, struct cfq_io_context, rcu_head);
  
        kmem_cache_free(cfq_ioc_pool, cic);
 -      elv_ioc_count_dec(ioc_count);
 +      elv_ioc_count_dec(cfq_ioc_count);
  
        if (ioc_gone) {
                /*
                 * complete ioc_gone and set it back to NULL
                 */
                spin_lock(&ioc_gone_lock);
 -              if (ioc_gone && !elv_ioc_count_read(ioc_count)) {
 +              if (ioc_gone && !elv_ioc_count_read(cfq_ioc_count)) {
                        complete(ioc_gone);
                        ioc_gone = NULL;
                }
@@@ -1562,7 -1562,7 +1562,7 @@@ cfq_alloc_io_context(struct cfq_data *c
                INIT_HLIST_NODE(&cic->cic_list);
                cic->dtor = cfq_free_io_context;
                cic->exit = cfq_exit_io_context;
 -              elv_ioc_count_inc(ioc_count);
 +              elv_ioc_count_inc(cfq_ioc_count);
        }
  
        return cic;
@@@ -2311,7 -2311,7 +2311,7 @@@ cfq_set_request(struct request_queue *q
                goto queue_fail;
  
        cfqq = cic_to_cfqq(cic, is_sync);
-       if (!cfqq) {
+       if (!cfqq || cfqq == &cfqd->oom_cfqq) {
                cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask);
                cic_set_cfqq(cic, cfqq, is_sync);
        }
@@@ -2668,7 -2668,7 +2668,7 @@@ static void __exit cfq_exit(void
         * this also protects us from entering cfq_slab_kill() with
         * pending RCU callbacks
         */
 -      if (elv_ioc_count_read(ioc_count))
 +      if (elv_ioc_count_read(cfq_ioc_count))
                wait_for_completion(&all_gone);
        cfq_slab_kill();
  }
@@@ -64,21 -64,20 +64,20 @@@ struct cpu_dbs_info_s 
        unsigned int requested_freq;
        int cpu;
        unsigned int enable:1;
+       /*
+        * percpu mutex that serializes governor limit change with
+        * do_dbs_timer invocation. We do not want do_dbs_timer to run
+        * when user is changing the governor or limits.
+        */
+       struct mutex timer_mutex;
  };
 -static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info);
 +static DEFINE_PER_CPU(struct cpu_dbs_info_s, cs_cpu_dbs_info);
  
  static unsigned int dbs_enable;       /* number of CPUs using this policy */
  
  /*
-  * DEADLOCK ALERT! There is a ordering requirement between cpu_hotplug
-  * lock and dbs_mutex. cpu_hotplug lock should always be held before
-  * dbs_mutex. If any function that can potentially take cpu_hotplug lock
-  * (like __cpufreq_driver_target()) is being called with dbs_mutex taken, then
-  * cpu_hotplug lock should be taken before that. Note that cpu_hotplug lock
-  * is recursive for the same process. -Venki
-  * DEADLOCK ALERT! (2) : do_dbs_timer() must not take the dbs_mutex, because it
-  * would deadlock with cancel_delayed_work_sync(), which is needed for proper
-  * raceless workqueue teardown.
+  * dbs_mutex protects data in dbs_tuners_ins from concurrent changes on
+  * different CPUs. It protects dbs_enable in governor start/stop.
   */
  static DEFINE_MUTEX(dbs_mutex);
  
@@@ -138,7 -137,7 +137,7 @@@ dbs_cpufreq_notifier(struct notifier_bl
                     void *data)
  {
        struct cpufreq_freqs *freq = data;
 -      struct cpu_dbs_info_s *this_dbs_info = &per_cpu(cpu_dbs_info,
 +      struct cpu_dbs_info_s *this_dbs_info = &per_cpu(cs_cpu_dbs_info,
                                                        freq->cpu);
  
        struct cpufreq_policy *policy;
@@@ -298,7 -297,7 +297,7 @@@ static ssize_t store_ignore_nice_load(s
        /* we need to re-evaluate prev_cpu_idle */
        for_each_online_cpu(j) {
                struct cpu_dbs_info_s *dbs_info;
 -              dbs_info = &per_cpu(cpu_dbs_info, j);
 +              dbs_info = &per_cpu(cs_cpu_dbs_info, j);
                dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
                                                &dbs_info->prev_cpu_wall);
                if (dbs_tuners_ins.ignore_nice)
@@@ -388,7 -387,7 +387,7 @@@ static void dbs_check_cpu(struct cpu_db
                cputime64_t cur_wall_time, cur_idle_time;
                unsigned int idle_time, wall_time;
  
 -              j_dbs_info = &per_cpu(cpu_dbs_info, j);
 +              j_dbs_info = &per_cpu(cs_cpu_dbs_info, j);
  
                cur_idle_time = get_cpu_idle_time(j, &cur_wall_time);
  
@@@ -488,18 -487,12 +487,12 @@@ static void do_dbs_timer(struct work_st
  
        delay -= jiffies % delay;
  
-       if (lock_policy_rwsem_write(cpu) < 0)
-               return;
-       if (!dbs_info->enable) {
-               unlock_policy_rwsem_write(cpu);
-               return;
-       }
+       mutex_lock(&dbs_info->timer_mutex);
  
        dbs_check_cpu(dbs_info);
  
        queue_delayed_work_on(cpu, kconservative_wq, &dbs_info->work, delay);
-       unlock_policy_rwsem_write(cpu);
+       mutex_unlock(&dbs_info->timer_mutex);
  }
  
  static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info)
@@@ -528,16 -521,13 +521,13 @@@ static int cpufreq_governor_dbs(struct 
        unsigned int j;
        int rc;
  
 -      this_dbs_info = &per_cpu(cpu_dbs_info, cpu);
 +      this_dbs_info = &per_cpu(cs_cpu_dbs_info, cpu);
  
        switch (event) {
        case CPUFREQ_GOV_START:
                if ((!cpu_online(cpu)) || (!policy->cur))
                        return -EINVAL;
  
-               if (this_dbs_info->enable) /* Already enabled */
-                       break;
                mutex_lock(&dbs_mutex);
  
                rc = sysfs_create_group(&policy->kobj, &dbs_attr_group);
  
                for_each_cpu(j, policy->cpus) {
                        struct cpu_dbs_info_s *j_dbs_info;
 -                      j_dbs_info = &per_cpu(cpu_dbs_info, j);
 +                      j_dbs_info = &per_cpu(cs_cpu_dbs_info, j);
                        j_dbs_info->cur_policy = policy;
  
                        j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
                this_dbs_info->down_skip = 0;
                this_dbs_info->requested_freq = policy->cur;
  
+               mutex_init(&this_dbs_info->timer_mutex);
                dbs_enable++;
                /*
                 * Start the timerschedule work, when this governor
                                        &dbs_cpufreq_notifier_block,
                                        CPUFREQ_TRANSITION_NOTIFIER);
                }
-               dbs_timer_init(this_dbs_info);
                mutex_unlock(&dbs_mutex);
  
+               dbs_timer_init(this_dbs_info);
                break;
  
        case CPUFREQ_GOV_STOP:
-               mutex_lock(&dbs_mutex);
                dbs_timer_exit(this_dbs_info);
+               mutex_lock(&dbs_mutex);
                sysfs_remove_group(&policy->kobj, &dbs_attr_group);
                dbs_enable--;
+               mutex_destroy(&this_dbs_info->timer_mutex);
  
                /*
                 * Stop the timerschedule work, when this governor
                break;
  
        case CPUFREQ_GOV_LIMITS:
-               mutex_lock(&dbs_mutex);
+               mutex_lock(&this_dbs_info->timer_mutex);
                if (policy->max < this_dbs_info->cur_policy->cur)
                        __cpufreq_driver_target(
                                        this_dbs_info->cur_policy,
                        __cpufreq_driver_target(
                                        this_dbs_info->cur_policy,
                                        policy->min, CPUFREQ_RELATION_L);
-               mutex_unlock(&dbs_mutex);
+               mutex_unlock(&this_dbs_info->timer_mutex);
  
                break;
        }
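
The conservative governor hunks above replace the per-policy rwsem and the dbs_info->enable flag with a per-CPU timer_mutex, and cancel the delayed work before taking dbs_mutex on the stop path. A minimal sketch of that pattern follows; the names sample_state, cpu_sample_state, do_sample, governor_stop and global_mutex are invented for illustration and do not exist in the kernel.

#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/workqueue.h>

struct sample_state {
	struct mutex timer_mutex;	/* timer vs. governor/limit changes */
	struct delayed_work work;
};
static DEFINE_PER_CPU(struct sample_state, cpu_sample_state);
static DEFINE_MUTEX(global_mutex);	/* tunables and enable count only */

static void do_sample(struct work_struct *work)
{
	struct sample_state *s =
		container_of(work, struct sample_state, work.work);

	mutex_lock(&s->timer_mutex);
	/* sample load, adjust the frequency, requeue the delayed work */
	mutex_unlock(&s->timer_mutex);
}

static void governor_stop(int cpu)
{
	struct sample_state *s = &per_cpu(cpu_sample_state, cpu);

	/* cancel first, outside global_mutex, as in the GOV_STOP hunk */
	cancel_delayed_work_sync(&s->work);

	mutex_lock(&global_mutex);
	/* remove the sysfs group, drop the enable count */
	mutex_unlock(&global_mutex);
	mutex_destroy(&s->timer_mutex);
}

Because the work function only ever takes the per-CPU timer_mutex, cancel_delayed_work_sync() can run without the deadlock against dbs_mutex that the removed DEADLOCK ALERT comment further down warns about; dbs_timer_init() is likewise moved outside dbs_mutex on the start path.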
diff --combined drivers/cpufreq/cpufreq_ondemand.c
@@@ -70,23 -70,21 +70,21 @@@ struct cpu_dbs_info_s 
        unsigned int freq_lo_jiffies;
        unsigned int freq_hi_jiffies;
        int cpu;
-       unsigned int enable:1,
-               sample_type:1;
+       unsigned int sample_type:1;
+       /*
+        * percpu mutex that serializes governor limit change with
+        * do_dbs_timer invocation. We do not want do_dbs_timer to run
+        * when user is changing the governor or limits.
+        */
+       struct mutex timer_mutex;
  };
 -static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info);
 +static DEFINE_PER_CPU(struct cpu_dbs_info_s, od_cpu_dbs_info);
  
  static unsigned int dbs_enable;       /* number of CPUs using this policy */
  
  /*
-  * DEADLOCK ALERT! There is a ordering requirement between cpu_hotplug
-  * lock and dbs_mutex. cpu_hotplug lock should always be held before
-  * dbs_mutex. If any function that can potentially take cpu_hotplug lock
-  * (like __cpufreq_driver_target()) is being called with dbs_mutex taken, then
-  * cpu_hotplug lock should be taken before that. Note that cpu_hotplug lock
-  * is recursive for the same process. -Venki
-  * DEADLOCK ALERT! (2) : do_dbs_timer() must not take the dbs_mutex, because it
-  * would deadlock with cancel_delayed_work_sync(), which is needed for proper
-  * raceless workqueue teardown.
+  * dbs_mutex protects data in dbs_tuners_ins from concurrent changes on
+  * different CPUs. It protects dbs_enable in governor start/stop.
   */
  static DEFINE_MUTEX(dbs_mutex);
  
@@@ -151,8 -149,7 +149,8 @@@ static unsigned int powersave_bias_targ
        unsigned int freq_hi, freq_lo;
        unsigned int index = 0;
        unsigned int jiffies_total, jiffies_hi, jiffies_lo;
 -      struct cpu_dbs_info_s *dbs_info = &per_cpu(cpu_dbs_info, policy->cpu);
 +      struct cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info,
 +                                                 policy->cpu);
  
        if (!dbs_info->freq_table) {
                dbs_info->freq_lo = 0;
        return freq_hi;
  }
  
 -      struct cpu_dbs_info_s *dbs_info = &per_cpu(cpu_dbs_info, cpu);
+ static void ondemand_powersave_bias_init_cpu(int cpu)
+ {
++      struct cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu);
+       dbs_info->freq_table = cpufreq_frequency_get_table(cpu);
+       dbs_info->freq_lo = 0;
+ }
  static void ondemand_powersave_bias_init(void)
  {
        int i;
        for_each_online_cpu(i) {
-               struct cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, i);
-               dbs_info->freq_table = cpufreq_frequency_get_table(i);
-               dbs_info->freq_lo = 0;
+               ondemand_powersave_bias_init_cpu(i);
        }
  }
  
@@@ -241,12 -243,10 +244,10 @@@ static ssize_t store_sampling_rate(stru
        unsigned int input;
        int ret;
        ret = sscanf(buf, "%u", &input);
+       if (ret != 1)
+               return -EINVAL;
  
        mutex_lock(&dbs_mutex);
-       if (ret != 1) {
-               mutex_unlock(&dbs_mutex);
-               return -EINVAL;
-       }
        dbs_tuners_ins.sampling_rate = max(input, min_sampling_rate);
        mutex_unlock(&dbs_mutex);
  
@@@ -260,13 -260,12 +261,12 @@@ static ssize_t store_up_threshold(struc
        int ret;
        ret = sscanf(buf, "%u", &input);
  
-       mutex_lock(&dbs_mutex);
        if (ret != 1 || input > MAX_FREQUENCY_UP_THRESHOLD ||
                        input < MIN_FREQUENCY_UP_THRESHOLD) {
-               mutex_unlock(&dbs_mutex);
                return -EINVAL;
        }
  
+       mutex_lock(&dbs_mutex);
        dbs_tuners_ins.up_threshold = input;
        mutex_unlock(&dbs_mutex);
  
@@@ -298,7 -297,7 +298,7 @@@ static ssize_t store_ignore_nice_load(s
        /* we need to re-evaluate prev_cpu_idle */
        for_each_online_cpu(j) {
                struct cpu_dbs_info_s *dbs_info;
 -              dbs_info = &per_cpu(cpu_dbs_info, j);
 +              dbs_info = &per_cpu(od_cpu_dbs_info, j);
                dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
                                                &dbs_info->prev_cpu_wall);
                if (dbs_tuners_ins.ignore_nice)
@@@ -364,9 -363,6 +364,6 @@@ static void dbs_check_cpu(struct cpu_db
        struct cpufreq_policy *policy;
        unsigned int j;
  
-       if (!this_dbs_info->enable)
-               return;
        this_dbs_info->freq_lo = 0;
        policy = this_dbs_info->cur_policy;
  
                unsigned int load, load_freq;
                int freq_avg;
  
 -              j_dbs_info = &per_cpu(cpu_dbs_info, j);
 +              j_dbs_info = &per_cpu(od_cpu_dbs_info, j);
  
                cur_idle_time = get_cpu_idle_time(j, &cur_wall_time);
  
@@@ -494,14 -490,7 +491,7 @@@ static void do_dbs_timer(struct work_st
        int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate);
  
        delay -= jiffies % delay;
-       if (lock_policy_rwsem_write(cpu) < 0)
-               return;
-       if (!dbs_info->enable) {
-               unlock_policy_rwsem_write(cpu);
-               return;
-       }
+       mutex_lock(&dbs_info->timer_mutex);
  
        /* Common NORMAL_SAMPLE setup */
        dbs_info->sample_type = DBS_NORMAL_SAMPLE;
                        dbs_info->freq_lo, CPUFREQ_RELATION_H);
        }
        queue_delayed_work_on(cpu, kondemand_wq, &dbs_info->work, delay);
-       unlock_policy_rwsem_write(cpu);
+       mutex_unlock(&dbs_info->timer_mutex);
  }
  
  static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info)
        int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate);
        delay -= jiffies % delay;
  
-       dbs_info->enable = 1;
-       ondemand_powersave_bias_init();
        dbs_info->sample_type = DBS_NORMAL_SAMPLE;
        INIT_DELAYED_WORK_DEFERRABLE(&dbs_info->work, do_dbs_timer);
        queue_delayed_work_on(dbs_info->cpu, kondemand_wq, &dbs_info->work,
  
  static inline void dbs_timer_exit(struct cpu_dbs_info_s *dbs_info)
  {
-       dbs_info->enable = 0;
        cancel_delayed_work_sync(&dbs_info->work);
  }
  
@@@ -549,29 -535,25 +536,25 @@@ static int cpufreq_governor_dbs(struct 
        unsigned int j;
        int rc;
  
 -      this_dbs_info = &per_cpu(cpu_dbs_info, cpu);
 +      this_dbs_info = &per_cpu(od_cpu_dbs_info, cpu);
  
        switch (event) {
        case CPUFREQ_GOV_START:
                if ((!cpu_online(cpu)) || (!policy->cur))
                        return -EINVAL;
  
-               if (this_dbs_info->enable) /* Already enabled */
-                       break;
                mutex_lock(&dbs_mutex);
-               dbs_enable++;
  
                rc = sysfs_create_group(&policy->kobj, &dbs_attr_group);
                if (rc) {
-                       dbs_enable--;
                        mutex_unlock(&dbs_mutex);
                        return rc;
                }
  
+               dbs_enable++;
                for_each_cpu(j, policy->cpus) {
                        struct cpu_dbs_info_s *j_dbs_info;
 -                      j_dbs_info = &per_cpu(cpu_dbs_info, j);
 +                      j_dbs_info = &per_cpu(od_cpu_dbs_info, j);
                        j_dbs_info->cur_policy = policy;
  
                        j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
                        }
                }
                this_dbs_info->cpu = cpu;
+               ondemand_powersave_bias_init_cpu(cpu);
+               mutex_init(&this_dbs_info->timer_mutex);
                /*
                 * Start the timerschedule work, when this governor
                 * is used for first time
                                max(min_sampling_rate,
                                    latency * LATENCY_MULTIPLIER);
                }
-               dbs_timer_init(this_dbs_info);
                mutex_unlock(&dbs_mutex);
+               dbs_timer_init(this_dbs_info);
                break;
  
        case CPUFREQ_GOV_STOP:
-               mutex_lock(&dbs_mutex);
                dbs_timer_exit(this_dbs_info);
+               mutex_lock(&dbs_mutex);
                sysfs_remove_group(&policy->kobj, &dbs_attr_group);
+               mutex_destroy(&this_dbs_info->timer_mutex);
                dbs_enable--;
                mutex_unlock(&dbs_mutex);
  
                break;
  
        case CPUFREQ_GOV_LIMITS:
-               mutex_lock(&dbs_mutex);
+               mutex_lock(&this_dbs_info->timer_mutex);
                if (policy->max < this_dbs_info->cur_policy->cur)
                        __cpufreq_driver_target(this_dbs_info->cur_policy,
                                policy->max, CPUFREQ_RELATION_H);
                else if (policy->min > this_dbs_info->cur_policy->cur)
                        __cpufreq_driver_target(this_dbs_info->cur_policy,
                                policy->min, CPUFREQ_RELATION_L);
-               mutex_unlock(&dbs_mutex);
+               mutex_unlock(&this_dbs_info->timer_mutex);
                break;
        }
        return 0;
diff --combined drivers/xen/events.c
  static DEFINE_SPINLOCK(irq_mapping_update_lock);
  
  /* IRQ <-> VIRQ mapping. */
 -static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1};
 +static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1};
  
  /* IRQ <-> IPI mapping */
 -static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1};
 +static DEFINE_PER_CPU(int [XEN_NR_IPIS], ipi_to_irq) = {[0 ... XEN_NR_IPIS-1] = -1};
  
  /* Interrupt types. */
  enum xen_irq_type {
@@@ -602,8 -602,6 +602,8 @@@ irqreturn_t xen_debug_interrupt(int irq
        return IRQ_HANDLED;
  }
  
 +static DEFINE_PER_CPU(unsigned, xed_nesting_count);
 +
  /*
   * Search the CPUs pending events bitmasks.  For each one found, map
   * the event number to an irq, and feed it into do_IRQ() for
@@@ -619,6 -617,7 +619,6 @@@ void xen_evtchn_do_upcall(struct pt_reg
        struct pt_regs *old_regs = set_irq_regs(regs);
        struct shared_info *s = HYPERVISOR_shared_info;
        struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
 -      static DEFINE_PER_CPU(unsigned, nesting_count);
        unsigned count;
  
        exit_idle();
  
                vcpu_info->evtchn_upcall_pending = 0;
  
 -              if (__get_cpu_var(nesting_count)++)
 +              if (__get_cpu_var(xed_nesting_count)++)
                        goto out;
  
  #ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */
  
                BUG_ON(!irqs_disabled());
  
 -              count = __get_cpu_var(nesting_count);
 -              __get_cpu_var(nesting_count) = 0;
 +              count = __get_cpu_var(xed_nesting_count);
 +              __get_cpu_var(xed_nesting_count) = 0;
        } while(count != 1);
  
  out:
@@@ -928,9 -927,9 +928,9 @@@ static struct irq_chip xen_dynamic_chi
  void __init xen_init_IRQ(void)
  {
        int i;
-       size_t size = nr_cpu_ids * sizeof(struct cpu_evtchn_s);
  
-       cpu_evtchn_mask_p = alloc_bootmem(size);
+       cpu_evtchn_mask_p = kcalloc(nr_cpu_ids, sizeof(struct cpu_evtchn_s),
+                                   GFP_KERNEL);
        BUG_ON(cpu_evtchn_mask_p == NULL);
  
        init_evtchn_cpu_bindings();
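
Two per-CPU definition conventions recur in this merge: formerly generic static names gain a subsystem prefix (cs_cpu_dbs_info, od_cpu_dbs_info, xed_nesting_count, perf_disable_count), apparently because static per-CPU symbols now need tree-wide unique names, and per-CPU arrays are declared with the array type first and a plain identifier as the name, as in the virq_to_irq and ipi_to_irq lines above. A small illustration of both forms, using invented names (foo_state, foo_slots, FOO_NR_SLOTS):

#include <linux/percpu.h>

#define FOO_NR_SLOTS	8

/* file-local per-CPU scalar with a subsystem-specific name */
static DEFINE_PER_CPU(unsigned int, foo_state);

/*
 * per-CPU array: the element type and bound come first,
 * the variable name stays a plain identifier
 */
static DEFINE_PER_CPU(int [FOO_NR_SLOTS], foo_slots) = {
	[0 ... FOO_NR_SLOTS - 1] = -1
};

static void foo_touch(void)
{
	__get_cpu_var(foo_state)++;
	__get_cpu_var(foo_slots)[0] = 0;
}

Presumably for the same reason, the function-local static DEFINE_PER_CPU in xen_evtchn_do_upcall() moves to file scope under the xed_ prefix.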
diff --combined include/asm-generic/vmlinux.lds.h
   *    EXCEPTION_TABLE(...)
   *    NOTES
   *
-  *    __bss_start = .;
-  *    BSS_SECTION(0, 0)
-  *    __bss_stop = .;
+  *    BSS_SECTION(0, 0, 0)
   *    _end = .;
   *
 - *    /DISCARD/ : {
 - *            EXIT_TEXT
 - *            EXIT_DATA
 - *            EXIT_CALL
 - *    }
   *    STABS_DEBUG
   *    DWARF_DEBUG
 + *
 + *    DISCARDS                // must be the last
   * }
   *
   * [__init_begin, __init_end] is the init section that may be freed after init
        . = ALIGN(align);                                               \
        *(.data.cacheline_aligned)
  
- #define INIT_TASK(align)                                              \
+ #define INIT_TASK_DATA(align)                                         \
        . = ALIGN(align);                                               \
        *(.data.init_task)
  
  /*
   * Init task
   */
- #define INIT_TASK_DATA(align)                                         \
+ #define INIT_TASK_DATA_SECTION(align)                                 \
        . = ALIGN(align);                                               \
        .data.init_task : {                                             \
-               INIT_TASK                                               \
+               INIT_TASK_DATA(align)                                   \
        }
  
  #ifdef CONFIG_CONSTRUCTORS
   * bss (Block Started by Symbol) - uninitialized data
   * zeroed during startup
   */
- #define SBSS                                                          \
+ #define SBSS(sbss_align)                                              \
+       . = ALIGN(sbss_align);                                          \
        .sbss : AT(ADDR(.sbss) - LOAD_OFFSET) {                         \
                *(.sbss)                                                \
                *(.scommon)                                             \
  #define BSS(bss_align)                                                        \
        . = ALIGN(bss_align);                                           \
        .bss : AT(ADDR(.bss) - LOAD_OFFSET) {                           \
-               VMLINUX_SYMBOL(__bss_start) = .;                        \
                *(.bss.page_aligned)                                    \
                *(.dynbss)                                              \
                *(.bss)                                                 \
                *(COMMON)                                               \
-               VMLINUX_SYMBOL(__bss_stop) = .;                         \
        }
  
  /*
  #define INIT_RAM_FS
  #endif
  
 +/*
 + * Default discarded sections.
 + *
 + * Some archs want to discard exit text/data at runtime rather than
 + * link time due to cross-section references such as alt instructions,
 + * bug table, eh_frame, etc.  DISCARDS must be the last of output
 + * section definitions so that such archs put those in earlier section
 + * definitions.
 + */
 +#define DISCARDS                                                      \
 +      /DISCARD/ : {                                                   \
 +      EXIT_TEXT                                                       \
 +      EXIT_DATA                                                       \
 +      EXIT_CALL                                                       \
 +      *(.discard)                                                     \
 +      }
 +
  /**
   * PERCPU_VADDR - define output section for percpu area
   * @vaddr: explicit base address (optional)
   * matches the requirment of PAGE_ALIGNED_DATA.
   *
   * use 0 as page_align if page_aligned data is not used */
- #define RW_DATA_SECTION(cacheline, nosave, pagealigned, inittask)     \
+ #define RW_DATA_SECTION(cacheline, pagealigned, inittask)             \
        . = ALIGN(PAGE_SIZE);                                           \
        .data : AT(ADDR(.data) - LOAD_OFFSET) {                         \
-               INIT_TASK(inittask)                                     \
+               INIT_TASK_DATA(inittask)                                \
                CACHELINE_ALIGNED_DATA(cacheline)                       \
                READ_MOSTLY_DATA(cacheline)                             \
                DATA_DATA                                               \
                CONSTRUCTORS                                            \
-               NOSAVE_DATA(nosave)                                     \
+               NOSAVE_DATA                                             \
                PAGE_ALIGNED_DATA(pagealigned)                          \
        }
  
                INIT_RAM_FS                                             \
        }
  
- #define BSS_SECTION(sbss_align, bss_align)                            \
-       SBSS                                                            \
+ #define BSS_SECTION(sbss_align, bss_align, stop_align)                        \
+       . = ALIGN(sbss_align);                                          \
+       VMLINUX_SYMBOL(__bss_start) = .;                                \
+       SBSS(sbss_align)                                                \
        BSS(bss_align)                                                  \
-       . = ALIGN(4);
+       . = ALIGN(stop_align);                                          \
+       VMLINUX_SYMBOL(__bss_stop) = .;
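
The linker-script helpers above change shape: INIT_TASK becomes INIT_TASK_DATA (with INIT_TASK_DATA_SECTION as the standalone section form), RW_DATA_SECTION loses its nosave argument, SBSS gains an alignment argument, BSS_SECTION gains a stop-alignment argument and now emits __bss_start/__bss_stop itself, and the new DISCARDS macro must be the last output section. Consistent with the file's own example layout, an arch vmlinux.lds.S would use them roughly as sketched below; the alignment values are placeholders rather than any particular architecture's choices.

/*
 *	SECTIONS
 *	{
 *		...
 *		RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE)
 *		...
 *		BSS_SECTION(0, 0, 0)
 *		_end = .;
 *
 *		STABS_DEBUG
 *		DWARF_DEBUG
 *
 *		DISCARDS	// must remain the last output section
 *	}
 */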
diff --combined init/main.c
@@@ -353,6 -353,7 +353,6 @@@ static void __init smp_init(void
  #define smp_init()    do { } while (0)
  #endif
  
 -static inline void setup_per_cpu_areas(void) { }
  static inline void setup_nr_cpu_ids(void) { }
  static inline void smp_prepare_cpus(unsigned int maxcpus) { }
  
@@@ -373,6 -374,29 +373,6 @@@ static void __init setup_nr_cpu_ids(voi
        nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1;
  }
  
 -#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
 -unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
 -
 -EXPORT_SYMBOL(__per_cpu_offset);
 -
 -static void __init setup_per_cpu_areas(void)
 -{
 -      unsigned long size, i;
 -      char *ptr;
 -      unsigned long nr_possible_cpus = num_possible_cpus();
 -
 -      /* Copy section for each CPU (we discard the original) */
 -      size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
 -      ptr = alloc_bootmem_pages(size * nr_possible_cpus);
 -
 -      for_each_possible_cpu(i) {
 -              __per_cpu_offset[i] = ptr - __per_cpu_start;
 -              memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
 -              ptr += size;
 -      }
 -}
 -#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
 -
  /* Called by boot processor to activate the rest. */
  static void __init smp_init(void)
  {
@@@ -560,8 -584,8 +560,8 @@@ asmlinkage void __init start_kernel(voi
        setup_arch(&command_line);
        mm_init_owner(&init_mm, &init_task);
        setup_command_line(command_line);
-       setup_per_cpu_areas();
        setup_nr_cpu_ids();
+       setup_per_cpu_areas();
        smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
  
        build_all_zonelists();
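
Two things are visible in the init/main.c hunks above: the legacy generic setup_per_cpu_areas() that copied the static per-CPU section once per possible CPU from bootmem and stored each copy's delta from __per_cpu_start in __per_cpu_offset[], and the reordering of start_kernel() so that setup_nr_cpu_ids() runs before the per-CPU areas are set up. Conceptually, the stored offset relates a per-CPU variable's link-time address to CPU i's private copy; the helper below is only a restatement of that relation, not the real per_cpu() implementation.

/* illustrative only: what the recorded __per_cpu_offset[] value means */
static inline void *per_cpu_addr_of(void *link_addr, unsigned int cpu)
{
	return (char *)link_addr + __per_cpu_offset[cpu];
}

This is exactly why the removed loop saved ptr - __per_cpu_start after memcpy()ing the original section for each possible CPU.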
diff --combined kernel/module.c
@@@ -364,7 -364,7 +364,7 @@@ EXPORT_SYMBOL_GPL(find_module)
  
  #ifdef CONFIG_SMP
  
 -#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
 +#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
  
  static void *percpu_modalloc(unsigned long size, unsigned long align,
                             const char *name)
@@@ -389,7 -389,7 +389,7 @@@ static void percpu_modfree(void *freeme
        free_percpu(freeme);
  }
  
 -#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
 +#else /* ... CONFIG_HAVE_LEGACY_PER_CPU_AREA */
  
  /* Number of blocks used and allocated. */
  static unsigned int pcpu_num_used, pcpu_num_allocated;
@@@ -535,7 -535,7 +535,7 @@@ static int percpu_modinit(void
  }
  __initcall(percpu_modinit);
  
 -#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
 +#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */
  
  static unsigned int find_pcpusec(Elf_Ehdr *hdr,
                                 Elf_Shdr *sechdrs,
@@@ -1068,7 -1068,8 +1068,8 @@@ static inline int check_modstruct_versi
  {
        const unsigned long *crc;
  
-       if (!find_symbol("module_layout", NULL, &crc, true, false))
+       if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL,
+                        &crc, true, false))
                BUG();
        return check_version(sechdrs, versindex, "module_layout", mod, crc);
  }
@@@ -2451,9 -2452,9 +2452,9 @@@ SYSCALL_DEFINE3(init_module, void __use
                return ret;
        }
        if (ret > 0) {
-               printk(KERN_WARNING "%s: '%s'->init suspiciously returned %d, "
-                                 "it should follow 0/-E convention\n"
-                      KERN_WARNING "%s: loading module anyway...\n",
+               printk(KERN_WARNING
+ "%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n"
+ "%s: loading module anyway...\n",
                       __func__, mod->name, ret,
                       __func__);
                dump_stack();
diff --combined kernel/perf_counter.c
@@@ -42,6 -42,7 +42,7 @@@ static int perf_overcommit __read_mostl
  static atomic_t nr_counters __read_mostly;
  static atomic_t nr_mmap_counters __read_mostly;
  static atomic_t nr_comm_counters __read_mostly;
+ static atomic_t nr_task_counters __read_mostly;
  
  /*
   * perf counter paranoia level:
@@@ -87,6 -88,7 +88,7 @@@ void __weak hw_perf_disable(void)             { ba
  void __weak hw_perf_enable(void)              { barrier(); }
  
  void __weak hw_perf_counter_setup(int cpu)    { barrier(); }
+ void __weak hw_perf_counter_setup_online(int cpu)     { barrier(); }
  
  int __weak
  hw_perf_group_sched_in(struct perf_counter *group_leader,
  
  void __weak perf_counter_print_debug(void)    { }
  
 -static DEFINE_PER_CPU(int, disable_count);
 +static DEFINE_PER_CPU(int, perf_disable_count);
  
  void __perf_disable(void)
  {
 -      __get_cpu_var(disable_count)++;
 +      __get_cpu_var(perf_disable_count)++;
  }
  
  bool __perf_enable(void)
  {
 -      return !--__get_cpu_var(disable_count);
 +      return !--__get_cpu_var(perf_disable_count);
  }
  
  void perf_disable(void)
@@@ -146,6 -148,28 +148,28 @@@ static void put_ctx(struct perf_counter
        }
  }
  
+ static void unclone_ctx(struct perf_counter_context *ctx)
+ {
+       if (ctx->parent_ctx) {
+               put_ctx(ctx->parent_ctx);
+               ctx->parent_ctx = NULL;
+       }
+ }
+ /*
+  * If we inherit counters we want to return the parent counter id
+  * to userspace.
+  */
+ static u64 primary_counter_id(struct perf_counter *counter)
+ {
+       u64 id = counter->id;
+       if (counter->parent)
+               id = counter->parent->id;
+       return id;
+ }
  /*
   * Get the perf_counter_context for a task and lock it.
   * This has to cope with with the fact that until it is locked,
@@@ -283,6 -307,10 +307,10 @@@ counter_sched_out(struct perf_counter *
                return;
  
        counter->state = PERF_COUNTER_STATE_INACTIVE;
+       if (counter->pending_disable) {
+               counter->pending_disable = 0;
+               counter->state = PERF_COUNTER_STATE_OFF;
+       }
        counter->tstamp_stopped = ctx->time;
        counter->pmu->disable(counter);
        counter->oncpu = -1;
@@@ -1081,7 -1109,7 +1109,7 @@@ static void perf_counter_sync_stat(stru
                __perf_counter_sync_stat(counter, next_counter);
  
                counter = list_next_entry(counter, event_entry);
-               next_counter = list_next_entry(counter, event_entry);
+               next_counter = list_next_entry(next_counter, event_entry);
        }
  }
  
@@@ -1288,7 -1316,6 +1316,6 @@@ static void perf_counter_cpu_sched_in(s
  #define MAX_INTERRUPTS (~0ULL)
  
  static void perf_log_throttle(struct perf_counter *counter, int enable);
- static void perf_log_period(struct perf_counter *counter, u64 period);
  
  static void perf_adjust_period(struct perf_counter *counter, u64 events)
  {
        if (!sample_period)
                sample_period = 1;
  
-       perf_log_period(counter, sample_period);
        hwc->sample_period = sample_period;
  }
  
@@@ -1463,10 -1488,8 +1488,8 @@@ static void perf_counter_enable_on_exec
        /*
         * Unclone this context if we enabled any counter.
         */
-       if (enabled && ctx->parent_ctx) {
-               put_ctx(ctx->parent_ctx);
-               ctx->parent_ctx = NULL;
-       }
+       if (enabled)
+               unclone_ctx(ctx);
  
        spin_unlock(&ctx->lock);
  
@@@ -1526,7 -1549,6 +1549,6 @@@ __perf_counter_init_context(struct perf
  
  static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
  {
-       struct perf_counter_context *parent_ctx;
        struct perf_counter_context *ctx;
        struct perf_cpu_context *cpuctx;
        struct task_struct *task;
   retry:
        ctx = perf_lock_task_context(task, &flags);
        if (ctx) {
-               parent_ctx = ctx->parent_ctx;
-               if (parent_ctx) {
-                       put_ctx(parent_ctx);
-                       ctx->parent_ctx = NULL;         /* no longer a clone */
-               }
+               unclone_ctx(ctx);
                spin_unlock_irqrestore(&ctx->lock, flags);
        }
  
@@@ -1642,6 -1660,8 +1660,8 @@@ static void free_counter(struct perf_co
                        atomic_dec(&nr_mmap_counters);
                if (counter->attr.comm)
                        atomic_dec(&nr_comm_counters);
+               if (counter->attr.task)
+                       atomic_dec(&nr_task_counters);
        }
  
        if (counter->destroy)
@@@ -1676,14 -1696,133 +1696,133 @@@ static int perf_release(struct inode *i
        return 0;
  }
  
+ static int perf_counter_read_size(struct perf_counter *counter)
+ {
+       int entry = sizeof(u64); /* value */
+       int size = 0;
+       int nr = 1;
+       if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+               size += sizeof(u64);
+       if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+               size += sizeof(u64);
+       if (counter->attr.read_format & PERF_FORMAT_ID)
+               entry += sizeof(u64);
+       if (counter->attr.read_format & PERF_FORMAT_GROUP) {
+               nr += counter->group_leader->nr_siblings;
+               size += sizeof(u64);
+       }
+       size += entry * nr;
+       return size;
+ }
+ static u64 perf_counter_read_value(struct perf_counter *counter)
+ {
+       struct perf_counter *child;
+       u64 total = 0;
+       total += perf_counter_read(counter);
+       list_for_each_entry(child, &counter->child_list, child_list)
+               total += perf_counter_read(child);
+       return total;
+ }
+ static int perf_counter_read_entry(struct perf_counter *counter,
+                                  u64 read_format, char __user *buf)
+ {
+       int n = 0, count = 0;
+       u64 values[2];
+       values[n++] = perf_counter_read_value(counter);
+       if (read_format & PERF_FORMAT_ID)
+               values[n++] = primary_counter_id(counter);
+       count = n * sizeof(u64);
+       if (copy_to_user(buf, values, count))
+               return -EFAULT;
+       return count;
+ }
+ static int perf_counter_read_group(struct perf_counter *counter,
+                                  u64 read_format, char __user *buf)
+ {
+       struct perf_counter *leader = counter->group_leader, *sub;
+       int n = 0, size = 0, err = -EFAULT;
+       u64 values[3];
+       values[n++] = 1 + leader->nr_siblings;
+       if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+               values[n++] = leader->total_time_enabled +
+                       atomic64_read(&leader->child_total_time_enabled);
+       }
+       if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+               values[n++] = leader->total_time_running +
+                       atomic64_read(&leader->child_total_time_running);
+       }
+       size = n * sizeof(u64);
+       if (copy_to_user(buf, values, size))
+               return -EFAULT;
+       err = perf_counter_read_entry(leader, read_format, buf + size);
+       if (err < 0)
+               return err;
+       size += err;
+       list_for_each_entry(sub, &leader->sibling_list, list_entry) {
+               err = perf_counter_read_entry(counter, read_format,
+                               buf + size);
+               if (err < 0)
+                       return err;
+               size += err;
+       }
+       return size;
+ }
+ static int perf_counter_read_one(struct perf_counter *counter,
+                                u64 read_format, char __user *buf)
+ {
+       u64 values[4];
+       int n = 0;
+       values[n++] = perf_counter_read_value(counter);
+       if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+               values[n++] = counter->total_time_enabled +
+                       atomic64_read(&counter->child_total_time_enabled);
+       }
+       if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+               values[n++] = counter->total_time_running +
+                       atomic64_read(&counter->child_total_time_running);
+       }
+       if (read_format & PERF_FORMAT_ID)
+               values[n++] = primary_counter_id(counter);
+       if (copy_to_user(buf, values, n * sizeof(u64)))
+               return -EFAULT;
+       return n * sizeof(u64);
+ }
  /*
   * Read the performance counter - simple non blocking version for now
   */
  static ssize_t
  perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
  {
-       u64 values[4];
-       int n;
+       u64 read_format = counter->attr.read_format;
+       int ret;
  
        /*
         * Return end-of-file for a read on a counter that is in
        if (counter->state == PERF_COUNTER_STATE_ERROR)
                return 0;
  
+       if (count < perf_counter_read_size(counter))
+               return -ENOSPC;
        WARN_ON_ONCE(counter->ctx->parent_ctx);
        mutex_lock(&counter->child_mutex);
-       values[0] = perf_counter_read(counter);
-       n = 1;
-       if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
-               values[n++] = counter->total_time_enabled +
-                       atomic64_read(&counter->child_total_time_enabled);
-       if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
-               values[n++] = counter->total_time_running +
-                       atomic64_read(&counter->child_total_time_running);
-       if (counter->attr.read_format & PERF_FORMAT_ID)
-               values[n++] = counter->id;
+       if (read_format & PERF_FORMAT_GROUP)
+               ret = perf_counter_read_group(counter, read_format, buf);
+       else
+               ret = perf_counter_read_one(counter, read_format, buf);
        mutex_unlock(&counter->child_mutex);
  
-       if (count < n * sizeof(u64))
-               return -EINVAL;
-       count = n * sizeof(u64);
-       if (copy_to_user(buf, values, count))
-               return -EFAULT;
-       return count;
+       return ret;
  }
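
As a concrete check of perf_counter_read_size() above: with read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING | PERF_FORMAT_ID and no PERF_FORMAT_GROUP, entry = 8 + 8 and size = 8 + 8, so a single counter needs 16 + 1 * 16 = 32 bytes; add PERF_FORMAT_GROUP with a leader that has two siblings and nr becomes 3, giving 8 + 8 + 8 + 3 * 16 = 72 bytes. The grouped buffer that perf_counter_read_group() then emits is laid out as sketched below (illustration only, not a kernel structure).

/*
 *	u64 nr;					-- 3
 *	u64 time_enabled;
 *	u64 time_running;
 *	struct { u64 value; u64 id; } cntr[3];	-- leader first, then siblings
 */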
  
  static ssize_t
@@@ -1811,8 -1940,6 +1940,6 @@@ static int perf_counter_period(struct p
  
                counter->attr.sample_freq = value;
        } else {
-               perf_log_period(counter, value);
                counter->attr.sample_period = value;
                counter->hw.sample_period = value;
        }
@@@ -2020,7 -2147,7 +2147,7 @@@ fail
  
  static void perf_mmap_free_page(unsigned long addr)
  {
-       struct page *page = virt_to_page(addr);
+       struct page *page = virt_to_page((void *)addr);
  
        page->mapping = NULL;
        __free_page(page);
@@@ -2220,7 -2347,7 +2347,7 @@@ static void perf_pending_counter(struc
  
        if (counter->pending_disable) {
                counter->pending_disable = 0;
-               perf_counter_disable(counter);
+               __perf_counter_disable(counter);
        }
  
        if (counter->pending_wakeup) {
@@@ -2605,7 -2732,80 +2732,80 @@@ static u32 perf_counter_tid(struct perf
        return task_pid_nr_ns(p, counter->ns);
  }
  
- static void perf_counter_output(struct perf_counter *counter, int nmi,
+ static void perf_output_read_one(struct perf_output_handle *handle,
+                                struct perf_counter *counter)
+ {
+       u64 read_format = counter->attr.read_format;
+       u64 values[4];
+       int n = 0;
+       values[n++] = atomic64_read(&counter->count);
+       if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+               values[n++] = counter->total_time_enabled +
+                       atomic64_read(&counter->child_total_time_enabled);
+       }
+       if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+               values[n++] = counter->total_time_running +
+                       atomic64_read(&counter->child_total_time_running);
+       }
+       if (read_format & PERF_FORMAT_ID)
+               values[n++] = primary_counter_id(counter);
+       perf_output_copy(handle, values, n * sizeof(u64));
+ }
+ /*
+  * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult.
+  */
+ static void perf_output_read_group(struct perf_output_handle *handle,
+                           struct perf_counter *counter)
+ {
+       struct perf_counter *leader = counter->group_leader, *sub;
+       u64 read_format = counter->attr.read_format;
+       u64 values[5];
+       int n = 0;
+       values[n++] = 1 + leader->nr_siblings;
+       if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+               values[n++] = leader->total_time_enabled;
+       if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+               values[n++] = leader->total_time_running;
+       if (leader != counter)
+               leader->pmu->read(leader);
+       values[n++] = atomic64_read(&leader->count);
+       if (read_format & PERF_FORMAT_ID)
+               values[n++] = primary_counter_id(leader);
+       perf_output_copy(handle, values, n * sizeof(u64));
+       list_for_each_entry(sub, &leader->sibling_list, list_entry) {
+               n = 0;
+               if (sub != counter)
+                       sub->pmu->read(sub);
+               values[n++] = atomic64_read(&sub->count);
+               if (read_format & PERF_FORMAT_ID)
+                       values[n++] = primary_counter_id(sub);
+               perf_output_copy(handle, values, n * sizeof(u64));
+       }
+ }
+ static void perf_output_read(struct perf_output_handle *handle,
+                            struct perf_counter *counter)
+ {
+       if (counter->attr.read_format & PERF_FORMAT_GROUP)
+               perf_output_read_group(handle, counter);
+       else
+               perf_output_read_one(handle, counter);
+ }
+ void perf_counter_output(struct perf_counter *counter, int nmi,
                                struct perf_sample_data *data)
  {
        int ret;
        struct {
                u32 pid, tid;
        } tid_entry;
-       struct {
-               u64 id;
-               u64 counter;
-       } group_entry;
        struct perf_callchain_entry *callchain = NULL;
        int callchain_size = 0;
        u64 time;
        if (sample_type & PERF_SAMPLE_ID)
                header.size += sizeof(u64);
  
+       if (sample_type & PERF_SAMPLE_STREAM_ID)
+               header.size += sizeof(u64);
        if (sample_type & PERF_SAMPLE_CPU) {
                header.size += sizeof(cpu_entry);
  
                cpu_entry.cpu = raw_smp_processor_id();
+               cpu_entry.reserved = 0;
        }
  
        if (sample_type & PERF_SAMPLE_PERIOD)
                header.size += sizeof(u64);
  
-       if (sample_type & PERF_SAMPLE_GROUP) {
-               header.size += sizeof(u64) +
-                       counter->nr_siblings * sizeof(group_entry);
-       }
+       if (sample_type & PERF_SAMPLE_READ)
+               header.size += perf_counter_read_size(counter);
  
        if (sample_type & PERF_SAMPLE_CALLCHAIN) {
                callchain = perf_callchain(data->regs);
                        header.size += sizeof(u64);
        }
  
+       if (sample_type & PERF_SAMPLE_RAW) {
+               int size = sizeof(u32);
+               if (data->raw)
+                       size += data->raw->size;
+               else
+                       size += sizeof(u32);
+               WARN_ON_ONCE(size & (sizeof(u64)-1));
+               header.size += size;
+       }
        ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
        if (ret)
                return;
        if (sample_type & PERF_SAMPLE_ADDR)
                perf_output_put(&handle, data->addr);
  
-       if (sample_type & PERF_SAMPLE_ID)
+       if (sample_type & PERF_SAMPLE_ID) {
+               u64 id = primary_counter_id(counter);
+               perf_output_put(&handle, id);
+       }
+       if (sample_type & PERF_SAMPLE_STREAM_ID)
                perf_output_put(&handle, counter->id);
  
        if (sample_type & PERF_SAMPLE_CPU)
        if (sample_type & PERF_SAMPLE_PERIOD)
                perf_output_put(&handle, data->period);
  
-       /*
-        * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult.
-        */
-       if (sample_type & PERF_SAMPLE_GROUP) {
-               struct perf_counter *leader, *sub;
-               u64 nr = counter->nr_siblings;
-               perf_output_put(&handle, nr);
-               leader = counter->group_leader;
-               list_for_each_entry(sub, &leader->sibling_list, list_entry) {
-                       if (sub != counter)
-                               sub->pmu->read(sub);
-                       group_entry.id = sub->id;
-                       group_entry.counter = atomic64_read(&sub->count);
-                       perf_output_put(&handle, group_entry);
-               }
-       }
+       if (sample_type & PERF_SAMPLE_READ)
+               perf_output_read(&handle, counter);
  
        if (sample_type & PERF_SAMPLE_CALLCHAIN) {
                if (callchain)
                }
        }
  
+       if (sample_type & PERF_SAMPLE_RAW) {
+               if (data->raw) {
+                       perf_output_put(&handle, data->raw->size);
+                       perf_output_copy(&handle, data->raw->data, data->raw->size);
+               } else {
+                       struct {
+                               u32     size;
+                               u32     data;
+                       } raw = {
+                               .size = sizeof(u32),
+                               .data = 0,
+                       };
+                       perf_output_put(&handle, raw);
+               }
+       }
        perf_output_end(&handle);
  }
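
One user-visible detail of the sample output path above: with inherited counters, PERF_SAMPLE_ID is now resolved through primary_counter_id(), so a child reports its parent's id, while the new PERF_SAMPLE_STREAM_ID keeps reporting the individual counter->id. Restated side by side (this is just the two branches from the hunk above, not new code):

	if (sample_type & PERF_SAMPLE_ID)		/* stable across inheritance */
		perf_output_put(&handle, primary_counter_id(counter));

	if (sample_type & PERF_SAMPLE_STREAM_ID)	/* this particular instance */
		perf_output_put(&handle, counter->id);

which presumably lets tools aggregate by id as one logical counter while still telling the underlying streams apart.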
  
@@@ -2754,8 -2968,6 +2968,6 @@@ struct perf_read_event 
  
        u32                             pid;
        u32                             tid;
-       u64                             value;
-       u64                             format[3];
  };
  
  static void
@@@ -2767,87 -2979,74 +2979,74 @@@ perf_counter_read_event(struct perf_cou
                .header = {
                        .type = PERF_EVENT_READ,
                        .misc = 0,
-                       .size = sizeof(event) - sizeof(event.format),
+                       .size = sizeof(event) + perf_counter_read_size(counter),
                },
                .pid = perf_counter_pid(counter, task),
                .tid = perf_counter_tid(counter, task),
-               .value = atomic64_read(&counter->count),
        };
-       int ret, i = 0;
-       if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
-               event.header.size += sizeof(u64);
-               event.format[i++] = counter->total_time_enabled;
-       }
-       if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
-               event.header.size += sizeof(u64);
-               event.format[i++] = counter->total_time_running;
-       }
-       if (counter->attr.read_format & PERF_FORMAT_ID) {
-               u64 id;
-               event.header.size += sizeof(u64);
-               if (counter->parent)
-                       id = counter->parent->id;
-               else
-                       id = counter->id;
-               event.format[i++] = id;
-       }
+       int ret;
  
        ret = perf_output_begin(&handle, counter, event.header.size, 0, 0);
        if (ret)
                return;
  
-       perf_output_copy(&handle, &event, event.header.size);
+       perf_output_put(&handle, event);
+       perf_output_read(&handle, counter);
        perf_output_end(&handle);
  }
  
  /*
-  * fork tracking
+  * task tracking -- fork/exit
+  *
+  * enabled by: attr.comm | attr.mmap | attr.task
   */
  
- struct perf_fork_event {
-       struct task_struct      *task;
+ struct perf_task_event {
+       struct task_struct              *task;
+       struct perf_counter_context     *task_ctx;
  
        struct {
                struct perf_event_header        header;
  
                u32                             pid;
                u32                             ppid;
+               u32                             tid;
+               u32                             ptid;
        } event;
  };
  
- static void perf_counter_fork_output(struct perf_counter *counter,
-                                    struct perf_fork_event *fork_event)
+ static void perf_counter_task_output(struct perf_counter *counter,
+                                    struct perf_task_event *task_event)
  {
        struct perf_output_handle handle;
-       int size = fork_event->event.header.size;
-       struct task_struct *task = fork_event->task;
+       int size = task_event->event.header.size;
+       struct task_struct *task = task_event->task;
        int ret = perf_output_begin(&handle, counter, size, 0, 0);
  
        if (ret)
                return;
  
-       fork_event->event.pid = perf_counter_pid(counter, task);
-       fork_event->event.ppid = perf_counter_pid(counter, task->real_parent);
+       task_event->event.pid = perf_counter_pid(counter, task);
+       task_event->event.ppid = perf_counter_pid(counter, current);
+       task_event->event.tid = perf_counter_tid(counter, task);
+       task_event->event.ptid = perf_counter_tid(counter, current);
  
-       perf_output_put(&handle, fork_event->event);
+       perf_output_put(&handle, task_event->event);
        perf_output_end(&handle);
  }
  
- static int perf_counter_fork_match(struct perf_counter *counter)
+ static int perf_counter_task_match(struct perf_counter *counter)
  {
-       if (counter->attr.comm || counter->attr.mmap)
+       if (counter->attr.comm || counter->attr.mmap || counter->attr.task)
                return 1;
  
        return 0;
  }
  
- static void perf_counter_fork_ctx(struct perf_counter_context *ctx,
-                                 struct perf_fork_event *fork_event)
+ static void perf_counter_task_ctx(struct perf_counter_context *ctx,
+                                 struct perf_task_event *task_event)
  {
        struct perf_counter *counter;
  
  
        rcu_read_lock();
        list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
-               if (perf_counter_fork_match(counter))
-                       perf_counter_fork_output(counter, fork_event);
+               if (perf_counter_task_match(counter))
+                       perf_counter_task_output(counter, task_event);
        }
        rcu_read_unlock();
  }
  
- static void perf_counter_fork_event(struct perf_fork_event *fork_event)
+ static void perf_counter_task_event(struct perf_task_event *task_event)
  {
        struct perf_cpu_context *cpuctx;
-       struct perf_counter_context *ctx;
+       struct perf_counter_context *ctx = task_event->task_ctx;
  
        cpuctx = &get_cpu_var(perf_cpu_context);
-       perf_counter_fork_ctx(&cpuctx->ctx, fork_event);
+       perf_counter_task_ctx(&cpuctx->ctx, task_event);
        put_cpu_var(perf_cpu_context);
  
        rcu_read_lock();
-       /*
-        * doesn't really matter which of the child contexts the
-        * events ends up in.
-        */
-       ctx = rcu_dereference(current->perf_counter_ctxp);
+       if (!ctx)
+               ctx = rcu_dereference(task_event->task->perf_counter_ctxp);
        if (ctx)
-               perf_counter_fork_ctx(ctx, fork_event);
+               perf_counter_task_ctx(ctx, task_event);
        rcu_read_unlock();
  }
  
- void perf_counter_fork(struct task_struct *task)
+ static void perf_counter_task(struct task_struct *task,
+                             struct perf_counter_context *task_ctx,
+                             int new)
  {
-       struct perf_fork_event fork_event;
+       struct perf_task_event task_event;
  
        if (!atomic_read(&nr_comm_counters) &&
-           !atomic_read(&nr_mmap_counters))
+           !atomic_read(&nr_mmap_counters) &&
+           !atomic_read(&nr_task_counters))
                return;
  
-       fork_event = (struct perf_fork_event){
-               .task   = task,
-               .event  = {
+       task_event = (struct perf_task_event){
+               .task     = task,
+               .task_ctx = task_ctx,
+               .event    = {
                        .header = {
-                               .type = PERF_EVENT_FORK,
-                               .size = sizeof(fork_event.event),
+                               .type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT,
+                               .misc = 0,
+                               .size = sizeof(task_event.event),
                        },
+                       /* .pid  */
+                       /* .ppid */
+                       /* .tid  */
+                       /* .ptid */
                },
        };
  
-       perf_counter_fork_event(&fork_event);
+       perf_counter_task_event(&task_event);
+ }
+ void perf_counter_fork(struct task_struct *task)
+ {
+       perf_counter_task(task, NULL, 1);
  }
  
  /*
@@@ -2968,8 -3178,10 +3178,10 @@@ static void perf_counter_comm_event(str
        struct perf_cpu_context *cpuctx;
        struct perf_counter_context *ctx;
        unsigned int size;
-       char *comm = comm_event->task->comm;
+       char comm[TASK_COMM_LEN];
  
+       memset(comm, 0, sizeof(comm));
+       strncpy(comm, comm_event->task->comm, sizeof(comm));
        size = ALIGN(strlen(comm)+1, sizeof(u64));
  
        comm_event->comm = comm;
@@@ -3004,8 -3216,16 +3216,16 @@@ void perf_counter_comm(struct task_stru
  
        comm_event = (struct perf_comm_event){
                .task   = task,
+               /* .comm      */
+               /* .comm_size */
                .event  = {
-                       .header = { .type = PERF_EVENT_COMM, },
+                       .header = {
+                               .type = PERF_EVENT_COMM,
+                               .misc = 0,
+                               /* .size */
+                       },
+                       /* .pid */
+                       /* .tid */
                },
        };
  
@@@ -3088,8 -3308,15 +3308,15 @@@ static void perf_counter_mmap_event(str
        char *buf = NULL;
        const char *name;
  
+       memset(tmp, 0, sizeof(tmp));
        if (file) {
-               buf = kzalloc(PATH_MAX, GFP_KERNEL);
+               /*
+                * d_path works from the end of the buffer backwards, so we
+                * need to add enough zero bytes after the string to handle
+                * the 64bit alignment we do later.
+                */
+               buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
                if (!buf) {
                        name = strncpy(tmp, "//enomem", sizeof(tmp));
                        goto got_name;
                        goto got_name;
                }
        } else {
-               name = arch_vma_name(mmap_event->vma);
-               if (name)
+               if (arch_vma_name(mmap_event->vma)) {
+                       name = strncpy(tmp, arch_vma_name(mmap_event->vma),
+                                      sizeof(tmp));
                        goto got_name;
+               }
  
                if (!vma->vm_mm) {
                        name = strncpy(tmp, "[vdso]", sizeof(tmp));
@@@ -3147,8 -3376,16 +3376,16 @@@ void __perf_counter_mmap(struct vm_area
  
        mmap_event = (struct perf_mmap_event){
                .vma    = vma,
+               /* .file_name */
+               /* .file_size */
                .event  = {
-                       .header = { .type = PERF_EVENT_MMAP, },
+                       .header = {
+                               .type = PERF_EVENT_MMAP,
+                               .misc = 0,
+                               /* .size */
+                       },
+                       /* .pid */
+                       /* .tid */
                        .start  = vma->vm_start,
                        .len    = vma->vm_end - vma->vm_start,
                        .pgoff  = vma->vm_pgoff,
  }
  
  /*
-  * Log sample_period changes so that analyzing tools can re-normalize the
-  * event flow.
-  */
- struct freq_event {
-       struct perf_event_header        header;
-       u64                             time;
-       u64                             id;
-       u64                             period;
- };
- static void perf_log_period(struct perf_counter *counter, u64 period)
- {
-       struct perf_output_handle handle;
-       struct freq_event event;
-       int ret;
-       if (counter->hw.sample_period == period)
-               return;
-       if (counter->attr.sample_type & PERF_SAMPLE_PERIOD)
-               return;
-       event = (struct freq_event) {
-               .header = {
-                       .type = PERF_EVENT_PERIOD,
-                       .misc = 0,
-                       .size = sizeof(event),
-               },
-               .time = sched_clock(),
-               .id = counter->id,
-               .period = period,
-       };
-       ret = perf_output_begin(&handle, counter, sizeof(event), 1, 0);
-       if (ret)
-               return;
-       perf_output_put(&handle, event);
-       perf_output_end(&handle);
- }
- /*
   * IRQ throttle logging
   */
  
@@@ -3214,16 -3408,21 +3408,21 @@@ static void perf_log_throttle(struct pe
                struct perf_event_header        header;
                u64                             time;
                u64                             id;
+               u64                             stream_id;
        } throttle_event = {
                .header = {
-                       .type = PERF_EVENT_THROTTLE + 1,
+                       .type = PERF_EVENT_THROTTLE,
                        .misc = 0,
                        .size = sizeof(throttle_event),
                },
-               .time   = sched_clock(),
-               .id     = counter->id,
+               .time           = sched_clock(),
+               .id             = primary_counter_id(counter),
+               .stream_id      = counter->id,
        };
  
+       if (enable)
+               throttle_event.header.type = PERF_EVENT_UNTHROTTLE;
        ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0);
        if (ret)
                return;
@@@ -3300,125 -3499,111 +3499,111 @@@ int perf_counter_overflow(struct perf_c
   * Generic software counter infrastructure
   */
  
- static void perf_swcounter_update(struct perf_counter *counter)
+ /*
+  * We directly increment counter->count and keep a second value in
+  * counter->hw.period_left to count intervals. This period counter
+  * is kept in the range [-sample_period, 0] so that we can use the
+  * sign as trigger.
+  */
+ static u64 perf_swcounter_set_period(struct perf_counter *counter)
  {
        struct hw_perf_counter *hwc = &counter->hw;
-       u64 prev, now;
-       s64 delta;
+       u64 period = hwc->last_period;
+       u64 nr, offset;
+       s64 old, val;
+       hwc->last_period = hwc->sample_period;
  
  again:
-       prev = atomic64_read(&hwc->prev_count);
-       now = atomic64_read(&hwc->count);
-       if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev)
-               goto again;
+       old = val = atomic64_read(&hwc->period_left);
+       if (val < 0)
+               return 0;
  
-       delta = now - prev;
+       nr = div64_u64(period + val, period);
+       offset = nr * period;
+       val -= offset;
+       if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
+               goto again;
  
-       atomic64_add(delta, &counter->count);
-       atomic64_sub(delta, &hwc->period_left);
+       return nr;
  }
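
To make the arithmetic above concrete: period_left is kept in [-sample_period, 0], perf_swcounter_add() below adds each event count to it, and the overflow path runs once it turns non-negative, after which perf_swcounter_set_period() reports one overflow per whole period consumed and pulls period_left back below zero. A worked example with invented numbers:

/*
 *	sample_period = 100, period_left = -70
 *	an event adds nr = 100  ->  period_left = +30 (non-negative: overflow path)
 *	perf_swcounter_set_period():
 *		nr     = (100 + 30) / 100 = 1	-> one overflow reported
 *		offset = 1 * 100
 *		period_left = 30 - 100 = -70	-> back in [-sample_period, 0]
 *	had period_left reached +230 instead, nr would be 3 and the
 *	remainder would still be -70.
 */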
  
- static void perf_swcounter_set_period(struct perf_counter *counter)
+ static void perf_swcounter_overflow(struct perf_counter *counter,
+                                   int nmi, struct perf_sample_data *data)
  {
        struct hw_perf_counter *hwc = &counter->hw;
-       s64 left = atomic64_read(&hwc->period_left);
-       s64 period = hwc->sample_period;
+       u64 overflow;
  
-       if (unlikely(left <= -period)) {
-               left = period;
-               atomic64_set(&hwc->period_left, left);
-               hwc->last_period = period;
-       }
+       data->period = counter->hw.last_period;
+       overflow = perf_swcounter_set_period(counter);
  
-       if (unlikely(left <= 0)) {
-               left += period;
-               atomic64_add(period, &hwc->period_left);
-               hwc->last_period = period;
-       }
+       if (hwc->interrupts == MAX_INTERRUPTS)
+               return;
  
-       atomic64_set(&hwc->prev_count, -left);
-       atomic64_set(&hwc->count, -left);
+       for (; overflow; overflow--) {
+               if (perf_counter_overflow(counter, nmi, data)) {
+                       /*
+                        * We inhibit the overflow from happening when
+                        * hwc->interrupts == MAX_INTERRUPTS.
+                        */
+                       break;
+               }
+       }
  }
  
- static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
+ static void perf_swcounter_unthrottle(struct perf_counter *counter)
  {
-       enum hrtimer_restart ret = HRTIMER_RESTART;
-       struct perf_sample_data data;
-       struct perf_counter *counter;
-       u64 period;
-       counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
-       counter->pmu->read(counter);
-       data.addr = 0;
-       data.regs = get_irq_regs();
        /*
-        * In case we exclude kernel IPs or are somehow not in interrupt
-        * context, provide the next best thing, the user IP.
+        * Nothing to do, we already reset hwc->interrupts.
         */
-       if ((counter->attr.exclude_kernel || !data.regs) &&
-                       !counter->attr.exclude_user)
-               data.regs = task_pt_regs(current);
+ }
  
-       if (data.regs) {
-               if (perf_counter_overflow(counter, 0, &data))
-                       ret = HRTIMER_NORESTART;
-       }
+ static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
+                              int nmi, struct perf_sample_data *data)
+ {
+       struct hw_perf_counter *hwc = &counter->hw;
  
-       period = max_t(u64, 10000, counter->hw.sample_period);
-       hrtimer_forward_now(hrtimer, ns_to_ktime(period));
+       atomic64_add(nr, &counter->count);
  
-       return ret;
- }
+       if (!hwc->sample_period)
+               return;
  
- static void perf_swcounter_overflow(struct perf_counter *counter,
-                                   int nmi, struct perf_sample_data *data)
- {
-       data->period = counter->hw.last_period;
+       if (!data->regs)
+               return;
  
-       perf_swcounter_update(counter);
-       perf_swcounter_set_period(counter);
-       if (perf_counter_overflow(counter, nmi, data))
-               /* soft-disable the counter */
-               ;
+       if (!atomic64_add_negative(nr, &hwc->period_left))
+               perf_swcounter_overflow(counter, nmi, data);
  }
  
  static int perf_swcounter_is_counting(struct perf_counter *counter)
  {
-       struct perf_counter_context *ctx;
-       unsigned long flags;
-       int count;
+       /*
+        * The counter is active, we're good!
+        */
        if (counter->state == PERF_COUNTER_STATE_ACTIVE)
                return 1;
  
+       /*
+        * The counter is off/error, not counting.
+        */
        if (counter->state != PERF_COUNTER_STATE_INACTIVE)
                return 0;
  
        /*
-        * If the counter is inactive, it could be just because
-        * its task is scheduled out, or because it's in a group
-        * which could not go on the PMU.  We want to count in
-        * the first case but not the second.  If the context is
-        * currently active then an inactive software counter must
-        * be the second case.  If it's not currently active then
-        * we need to know whether the counter was active when the
-        * context was last active, which we can determine by
-        * comparing counter->tstamp_stopped with ctx->time.
-        *
-        * We are within an RCU read-side critical section,
-        * which protects the existence of *ctx.
+        * The counter is inactive, if the context is active
+        * we're part of a group that didn't make it on the 'pmu',
+        * not counting.
         */
-       ctx = counter->ctx;
-       spin_lock_irqsave(&ctx->lock, flags);
-       count = 1;
-       /* Re-check state now we have the lock */
-       if (counter->state < PERF_COUNTER_STATE_INACTIVE ||
-           counter->ctx->is_active ||
-           counter->tstamp_stopped < ctx->time)
-               count = 0;
-       spin_unlock_irqrestore(&ctx->lock, flags);
-       return count;
+       if (counter->ctx->is_active)
+               return 0;
+       /*
+        * We're inactive and the context is too; this means the
+        * task is scheduled out and we're counting events that
+        * happen to us, like migration events.
+        */
+       return 1;
  }
  
  static int perf_swcounter_match(struct perf_counter *counter,
        return 1;
  }
  
- static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
-                              int nmi, struct perf_sample_data *data)
- {
-       int neg = atomic64_add_negative(nr, &counter->hw.count);
-       if (counter->hw.sample_period && !neg && data->regs)
-               perf_swcounter_overflow(counter, nmi, data);
- }
  static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
                                     enum perf_type_id type,
                                     u32 event, u64 nr, int nmi,
@@@ -3531,27 -3707,66 +3707,66 @@@ void __perf_swcounter_event(u32 event, 
  
  static void perf_swcounter_read(struct perf_counter *counter)
  {
-       perf_swcounter_update(counter);
  }
  
  static int perf_swcounter_enable(struct perf_counter *counter)
  {
-       perf_swcounter_set_period(counter);
+       struct hw_perf_counter *hwc = &counter->hw;
+       if (hwc->sample_period) {
+               hwc->last_period = hwc->sample_period;
+               perf_swcounter_set_period(counter);
+       }
        return 0;
  }
  
  static void perf_swcounter_disable(struct perf_counter *counter)
  {
-       perf_swcounter_update(counter);
  }
  
  static const struct pmu perf_ops_generic = {
        .enable         = perf_swcounter_enable,
        .disable        = perf_swcounter_disable,
        .read           = perf_swcounter_read,
+       .unthrottle     = perf_swcounter_unthrottle,
  };
  
  /*
+  * hrtimer based swcounter callback
+  */
+ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
+ {
+       enum hrtimer_restart ret = HRTIMER_RESTART;
+       struct perf_sample_data data;
+       struct perf_counter *counter;
+       u64 period;
+       counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
+       counter->pmu->read(counter);
+       data.addr = 0;
+       data.regs = get_irq_regs();
+       /*
+        * In case we exclude kernel IPs or are somehow not in interrupt
+        * context, provide the next best thing, the user IP.
+        */
+       if ((counter->attr.exclude_kernel || !data.regs) &&
+                       !counter->attr.exclude_user)
+               data.regs = task_pt_regs(current);
+       if (data.regs) {
+               if (perf_counter_overflow(counter, 0, &data))
+                       ret = HRTIMER_NORESTART;
+       }
+       period = max_t(u64, 10000, counter->hw.sample_period);
+       hrtimer_forward_now(hrtimer, ns_to_ktime(period));
+       return ret;
+ }
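
/*
 * Illustration only, not part of the patch: a software counter using the
 * callback above would arm the hrtimer when it is enabled, roughly as
 * sketched here.  perf_swcounter_start_hrtimer() is a hypothetical name;
 * hrtimer_init()/hrtimer_start() are the stock hrtimer APIs.
 */
static void perf_swcounter_start_hrtimer(struct perf_counter *counter)
{
	struct hw_perf_counter *hwc = &counter->hw;

	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hwc->hrtimer.function = perf_swcounter_hrtimer;
	if (hwc->sample_period) {
		/* same 10us floor that the callback applies when re-arming */
		u64 period = max_t(u64, 10000, hwc->sample_period);

		hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
			      HRTIMER_MODE_REL);
	}
}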
+ /*
   * Software counter: cpu wall time clock
   */
  
@@@ -3668,17 -3883,24 +3883,24 @@@ static const struct pmu perf_ops_task_c
  };
  
  #ifdef CONFIG_EVENT_PROFILE
- void perf_tpcounter_event(int event_id)
+ void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record,
+                         int entry_size)
  {
+       struct perf_raw_record raw = {
+               .size = entry_size,
+               .data = record,
+       };
        struct perf_sample_data data = {
-               .regs = get_irq_regs();
-               .addr = 0,
+               .regs = get_irq_regs(),
+               .addr = addr,
+               .raw = &raw,
        };
  
        if (!data.regs)
                data.regs = task_pt_regs(current);
  
-       do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, &data);
+       do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data);
  }
  EXPORT_SYMBOL_GPL(perf_tpcounter_event);
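
/*
 * Illustration only, not part of the patch: a tracepoint profiling hook
 * would feed its raw entry to perf_tpcounter_event() via the new
 * signature above.  example_profile_probe() is a hypothetical name.
 */
static void example_profile_probe(int event_id, void *entry, int entry_size)
{
	/* no address, a single occurrence, raw record passed through */
	perf_tpcounter_event(event_id, 0, 1, entry, entry_size);
}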
  
@@@ -3687,16 -3909,20 +3909,20 @@@ extern void ftrace_profile_disable(int)
  
  static void tp_perf_counter_destroy(struct perf_counter *counter)
  {
-       ftrace_profile_disable(perf_event_id(&counter->attr));
+       ftrace_profile_disable(counter->attr.config);
  }
  
  static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
  {
-       int event_id = perf_event_id(&counter->attr);
-       int ret;
+       /*
+        * Raw tracepoint data is a severe data leak; only allow root to
+        * have these.
+        */
+       if ((counter->attr.sample_type & PERF_SAMPLE_RAW) &&
+                       !capable(CAP_SYS_ADMIN))
+               return ERR_PTR(-EPERM);
  
-       ret = ftrace_profile_enable(event_id);
-       if (ret)
+       if (ftrace_profile_enable(counter->attr.config))
                return NULL;
  
        counter->destroy = tp_perf_counter_destroy;
@@@ -3829,9 -4055,9 +4055,9 @@@ perf_counter_alloc(struct perf_counter_
        atomic64_set(&hwc->period_left, hwc->sample_period);
  
        /*
-        * we currently do not support PERF_SAMPLE_GROUP on inherited counters
+        * we currently do not support PERF_FORMAT_GROUP on inherited counters
         */
-       if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP))
+       if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
                goto done;
  
        switch (attr->type) {
@@@ -3874,6 -4100,8 +4100,8 @@@ done
                        atomic_inc(&nr_mmap_counters);
                if (counter->attr.comm)
                        atomic_inc(&nr_comm_counters);
+               if (counter->attr.task)
+                       atomic_inc(&nr_task_counters);
        }
  
        return counter;
@@@ -4235,8 -4463,10 +4463,10 @@@ void perf_counter_exit_task(struct task
        struct perf_counter_context *child_ctx;
        unsigned long flags;
  
-       if (likely(!child->perf_counter_ctxp))
+       if (likely(!child->perf_counter_ctxp)) {
+               perf_counter_task(child, NULL, 0);
                return;
+       }
  
        local_irq_save(flags);
        /*
         */
        spin_lock(&child_ctx->lock);
        child->perf_counter_ctxp = NULL;
-       if (child_ctx->parent_ctx) {
-               /*
-                * This context is a clone; unclone it so it can't get
-                * swapped to another process while we're removing all
-                * the counters from it.
-                */
-               put_ctx(child_ctx->parent_ctx);
-               child_ctx->parent_ctx = NULL;
-       }
-       spin_unlock(&child_ctx->lock);
-       local_irq_restore(flags);
+       /*
+        * If this context is a clone, unclone it so it can't get
+        * swapped to another process while we're removing all
+        * the counters from it.
+        */
+       unclone_ctx(child_ctx);
+       spin_unlock_irqrestore(&child_ctx->lock, flags);
+       /*
+        * Report the task dead after unscheduling the counters so that we
+        * won't get any samples after PERF_EVENT_EXIT. We can, however, still
+        * get a few PERF_EVENT_READ events.
+        */
+       perf_counter_task(child, child_ctx, 0);
  
        /*
         * We can recurse on the same lock type through:
@@@ -4486,6 -4719,11 +4719,11 @@@ perf_cpu_notify(struct notifier_block *
                perf_counter_init_cpu(cpu);
                break;
  
+       case CPU_ONLINE:
+       case CPU_ONLINE_FROZEN:
+               hw_perf_counter_setup_online(cpu);
+               break;
        case CPU_DOWN_PREPARE:
        case CPU_DOWN_PREPARE_FROZEN:
                perf_counter_exit_cpu(cpu);
@@@ -4510,6 -4748,8 +4748,8 @@@ void __init perf_counter_init(void
  {
        perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
                        (void *)(long)smp_processor_id());
+       perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
+                       (void *)(long)smp_processor_id());
        register_cpu_notifier(&perf_cpu_nb);
  }
  
diff --combined kernel/sched.c
@@@ -318,12 -318,12 +318,12 @@@ struct task_group root_task_group
  /* Default task group's sched entity on each cpu */
  static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
  /* Default task group's cfs_rq on each cpu */
 -static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
 +static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_cfs_rq);
  #endif /* CONFIG_FAIR_GROUP_SCHED */
  
  #ifdef CONFIG_RT_GROUP_SCHED
  static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
 -static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
 +static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq);
  #endif /* CONFIG_RT_GROUP_SCHED */
  #else /* !CONFIG_USER_SCHED */
  #define root_task_group init_task_group
@@@ -493,6 -493,7 +493,7 @@@ struct rt_rq 
  #endif
  #ifdef CONFIG_SMP
        unsigned long rt_nr_migratory;
+       unsigned long rt_nr_total;
        int overloaded;
        struct plist_head pushable_tasks;
  #endif
@@@ -2571,15 -2572,37 +2572,37 @@@ static void __sched_fork(struct task_st
        p->se.avg_wakeup                = sysctl_sched_wakeup_granularity;
  
  #ifdef CONFIG_SCHEDSTATS
-       p->se.wait_start                = 0;
-       p->se.sum_sleep_runtime         = 0;
-       p->se.sleep_start               = 0;
-       p->se.block_start               = 0;
-       p->se.sleep_max                 = 0;
-       p->se.block_max                 = 0;
-       p->se.exec_max                  = 0;
-       p->se.slice_max                 = 0;
-       p->se.wait_max                  = 0;
+       p->se.wait_start                        = 0;
+       p->se.wait_max                          = 0;
+       p->se.wait_count                        = 0;
+       p->se.wait_sum                          = 0;
+       p->se.sleep_start                       = 0;
+       p->se.sleep_max                         = 0;
+       p->se.sum_sleep_runtime                 = 0;
+       p->se.block_start                       = 0;
+       p->se.block_max                         = 0;
+       p->se.exec_max                          = 0;
+       p->se.slice_max                         = 0;
+       p->se.nr_migrations_cold                = 0;
+       p->se.nr_failed_migrations_affine       = 0;
+       p->se.nr_failed_migrations_running      = 0;
+       p->se.nr_failed_migrations_hot          = 0;
+       p->se.nr_forced_migrations              = 0;
+       p->se.nr_forced2_migrations             = 0;
+       p->se.nr_wakeups                        = 0;
+       p->se.nr_wakeups_sync                   = 0;
+       p->se.nr_wakeups_migrate                = 0;
+       p->se.nr_wakeups_local                  = 0;
+       p->se.nr_wakeups_remote                 = 0;
+       p->se.nr_wakeups_affine                 = 0;
+       p->se.nr_wakeups_affine_attempts        = 0;
+       p->se.nr_wakeups_passive                = 0;
+       p->se.nr_wakeups_idle                   = 0;
  #endif
  
        INIT_LIST_HEAD(&p->rt.run_list);
@@@ -6541,6 -6564,11 +6564,11 @@@ SYSCALL_DEFINE0(sched_yield
        return 0;
  }
  
+ static inline int should_resched(void)
+ {
+       return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
+ }
  static void __cond_resched(void)
  {
  #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
  
  int __sched _cond_resched(void)
  {
-       if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
-                                       system_state == SYSTEM_RUNNING) {
+       if (should_resched()) {
                __cond_resched();
                return 1;
        }
@@@ -6579,12 -6606,12 +6606,12 @@@ EXPORT_SYMBOL(_cond_resched)
   */
  int cond_resched_lock(spinlock_t *lock)
  {
-       int resched = need_resched() && system_state == SYSTEM_RUNNING;
+       int resched = should_resched();
        int ret = 0;
  
        if (spin_needbreak(lock) || resched) {
                spin_unlock(lock);
-               if (resched && need_resched())
+               if (resched)
                        __cond_resched();
                else
                        cpu_relax();
@@@ -6599,7 -6626,7 +6626,7 @@@ int __sched cond_resched_softirq(void
  {
        BUG_ON(!in_softirq());
  
-       if (need_resched() && system_state == SYSTEM_RUNNING) {
+       if (should_resched()) {
                local_bh_enable();
                __cond_resched();
                local_bh_disable();
@@@ -7262,6 -7289,7 +7289,7 @@@ static void migrate_dead_tasks(unsigne
  static void calc_global_load_remove(struct rq *rq)
  {
        atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+       rq->calc_load_active = 0;
  }
  #endif /* CONFIG_HOTPLUG_CPU */
  
@@@ -7488,6 -7516,7 +7516,7 @@@ migration_call(struct notifier_block *n
                task_rq_unlock(rq, &flags);
                get_task_struct(p);
                cpu_rq(cpu)->migration_thread = p;
+               rq->calc_load_update = calc_load_update;
                break;
  
        case CPU_ONLINE:
                /* Update our root-domain */
                rq = cpu_rq(cpu);
                spin_lock_irqsave(&rq->lock, flags);
-               rq->calc_load_update = calc_load_update;
-               rq->calc_load_active = 0;
                if (rq->rd) {
                        BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
  
@@@ -9070,7 -9097,7 +9097,7 @@@ static void init_rt_rq(struct rt_rq *rt
  #ifdef CONFIG_SMP
        rt_rq->rt_nr_migratory = 0;
        rt_rq->overloaded = 0;
-       plist_head_init(&rq->rt.pushable_tasks, &rq->lock);
+       plist_head_init(&rt_rq->pushable_tasks, &rq->lock);
  #endif
  
        rt_rq->rt_time = 0;
@@@ -376,7 -376,7 +376,7 @@@ ftrace_event_seq_open(struct inode *ino
        const struct seq_operations *seq_ops;
  
        if ((file->f_mode & FMODE_WRITE) &&
-           !(file->f_flags & O_APPEND))
+           (file->f_flags & O_TRUNC))
                ftrace_clear_events();
  
        seq_ops = inode->i_private;
@@@ -940,7 -940,7 +940,7 @@@ event_create_dir(struct ftrace_event_ca
                entry = trace_create_file("enable", 0644, call->dir, call,
                                          enable);
  
-       if (call->id)
+       if (call->id && call->profile_enable)
                entry = trace_create_file("id", 0444, call->dir, call,
                                          id);
  
@@@ -1334,7 -1334,7 +1334,7 @@@ static __init void event_trace_self_tes
  
  #ifdef CONFIG_FUNCTION_TRACER
  
 -static DEFINE_PER_CPU(atomic_t, test_event_disable);
 +static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable);
  
  static void
  function_test_events_call(unsigned long ip, unsigned long parent_ip)
        pc = preempt_count();
        resched = ftrace_preempt_disable();
        cpu = raw_smp_processor_id();
 -      disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu));
 +      disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu));
  
        if (disabled != 1)
                goto out;
        trace_nowake_buffer_unlock_commit(event, flags, pc);
  
   out:
 -      atomic_dec(&per_cpu(test_event_disable, cpu));
 +      atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
        ftrace_preempt_enable(resched);
  }
  
diff --combined mm/page-writeback.c
@@@ -575,7 -575,7 +575,7 @@@ static void balance_dirty_pages(struct 
                if (pages_written >= write_chunk)
                        break;          /* We've done our duty */
  
-               congestion_wait(WRITE, HZ/10);
+               congestion_wait(BLK_RW_ASYNC, HZ/10);
        }
  
        if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
@@@ -610,8 -610,6 +610,8 @@@ void set_page_dirty_balance(struct pag
        }
  }
  
 +static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0;
 +
  /**
   * balance_dirty_pages_ratelimited_nr - balance dirty memory state
   * @mapping: address_space which was dirtied
  void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
                                        unsigned long nr_pages_dirtied)
  {
 -      static DEFINE_PER_CPU(unsigned long, ratelimits) = 0;
        unsigned long ratelimit;
        unsigned long *p;
  
         * tasks in balance_dirty_pages(). Period.
         */
        preempt_disable();
 -      p =  &__get_cpu_var(ratelimits);
 +      p =  &__get_cpu_var(bdp_ratelimits);
        *p += nr_pages_dirtied;
        if (unlikely(*p >= ratelimit)) {
                *p = 0;
@@@ -670,7 -669,7 +670,7 @@@ void throttle_vm_writeout(gfp_t gfp_mas
                  if (global_page_state(NR_UNSTABLE_NFS) +
                        global_page_state(NR_WRITEBACK) <= dirty_thresh)
                                break;
-                 congestion_wait(WRITE, HZ/10);
+                 congestion_wait(BLK_RW_ASYNC, HZ/10);
  
                /*
                 * The caller might hold locks which can prevent IO completion
@@@ -716,7 -715,7 +716,7 @@@ static void background_writeout(unsigne
                if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
                        /* Wrote less than expected */
                        if (wbc.encountered_congestion || wbc.more_io)
-                               congestion_wait(WRITE, HZ/10);
+                               congestion_wait(BLK_RW_ASYNC, HZ/10);
                        else
                                break;
                }
@@@ -788,7 -787,7 +788,7 @@@ static void wb_kupdate(unsigned long ar
                writeback_inodes(&wbc);
                if (wbc.nr_to_write > 0) {
                        if (wbc.encountered_congestion || wbc.more_io)
-                               congestion_wait(WRITE, HZ/10);
+                               congestion_wait(BLK_RW_ASYNC, HZ/10);
                        else
                                break;  /* All the old data is written */
                }
diff --combined mm/percpu.c
@@@ -8,13 -8,12 +8,13 @@@
   *
   * This is the percpu allocator which can handle both static and dynamic
   * areas.  Percpu areas are allocated in chunks in vmalloc area.  Each
 - * chunk is consisted of nr_cpu_ids units and the first chunk is used
 - * for static percpu variables in the kernel image (special boot time
 - * alloc/init handling necessary as these areas need to be brought up
 - * before allocation services are running).  Unit grows as necessary
 - * and all units grow or shrink in unison.  When a chunk is filled up,
 - * another chunk is allocated.  ie. in vmalloc area
 + * chunk consists of a boot-time determined number of units and the
 + * first chunk is used for static percpu variables in the kernel image
 + * (special boot time alloc/init handling necessary as these areas
 + * need to be brought up before allocation services are running).
 + * Unit grows as necessary and all units grow or shrink in unison.
 + * When a chunk is filled up, another chunk is allocated, again in
 + * the vmalloc area.
   *
   *  c0                           c1                         c2
   *  -------------------          -------------------        ------------
   *
   * Allocation is done in offset-size areas of single unit space.  Ie,
   * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
 - * c1:u1, c1:u2 and c1:u3.  Percpu access can be done by configuring
 - * percpu base registers pcpu_unit_size apart.
 + * c1:u1, c1:u2 and c1:u3.  On UMA, units correspond directly to
 + * cpus.  On NUMA, the mapping can be non-linear and even sparse.
 + * Percpu access can be done by configuring percpu base registers
 + * according to cpu to unit mapping and pcpu_unit_size.
   *
 - * There are usually many small percpu allocations many of them as
 - * small as 4 bytes.  The allocator organizes chunks into lists
 + * There are usually many small percpu allocations, many of them being
 + * as small as 4 bytes.  The allocator organizes chunks into lists
   * according to free size and tries to allocate from the fullest one.
   * Each chunk keeps the maximum contiguous area size hint which is
   * guaranteed to be equal to or larger than the maximum contiguous
@@@ -46,7 -43,7 +46,7 @@@
   *
   * To use this allocator, arch code should do the following.
   *
 - * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
 + * - drop CONFIG_HAVE_LEGACY_PER_CPU_AREA
   *
   * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
   *   regular address to percpu pointer and back if they need to be
@@@ -59,7 -56,6 +59,7 @@@
  #include <linux/bitmap.h>
  #include <linux/bootmem.h>
  #include <linux/list.h>
 +#include <linux/log2.h>
  #include <linux/mm.h>
  #include <linux/module.h>
  #include <linux/mutex.h>
@@@ -98,27 -94,20 +98,27 @@@ struct pcpu_chunk 
        int                     map_alloc;      /* # of map entries allocated */
        int                     *map;           /* allocation map */
        bool                    immutable;      /* no [de]population allowed */
 -      struct page             **page;         /* points to page array */
 -      struct page             *page_ar[];     /* #cpus * UNIT_PAGES */
 +      unsigned long           populated[];    /* populated bitmap */
  };
  
  static int pcpu_unit_pages __read_mostly;
  static int pcpu_unit_size __read_mostly;
 +static int pcpu_nr_units __read_mostly;
  static int pcpu_chunk_size __read_mostly;
  static int pcpu_nr_slots __read_mostly;
  static size_t pcpu_chunk_struct_size __read_mostly;
  
 +/* cpus with the lowest and highest unit numbers */
 +static unsigned int pcpu_first_unit_cpu __read_mostly;
 +static unsigned int pcpu_last_unit_cpu __read_mostly;
 +
  /* the address of the first chunk which starts with the kernel static area */
  void *pcpu_base_addr __read_mostly;
  EXPORT_SYMBOL_GPL(pcpu_base_addr);
  
 +/* cpu -> unit map */
 +const int *pcpu_unit_map __read_mostly;
 +
  /*
   * The first chunk which always exists.  Note that unlike other
   * chunks, this one can be allocated and mapped in several different
@@@ -140,9 -129,9 +140,9 @@@ static int pcpu_reserved_chunk_limit
   * Synchronization rules.
   *
   * There are two locks - pcpu_alloc_mutex and pcpu_lock.  The former
 - * protects allocation/reclaim paths, chunks and chunk->page arrays.
 - * The latter is a spinlock and protects the index data structures -
 - * chunk slots, chunks and area maps in chunks.
 + * protects allocation/reclaim paths, chunks, populated bitmap and
 + * vmalloc mapping.  The latter is a spinlock and protects the index
 + * data structures - chunk slots, chunks and area maps in chunks.
   *
   * During allocation, pcpu_alloc_mutex is kept locked all the time and
   * pcpu_lock is grabbed and released as necessary.  All actual memory
@@@ -189,7 -178,13 +189,7 @@@ static int pcpu_chunk_slot(const struc
  
  static int pcpu_page_idx(unsigned int cpu, int page_idx)
  {
 -      return cpu * pcpu_unit_pages + page_idx;
 -}
 -
 -static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk,
 -                                    unsigned int cpu, int page_idx)
 -{
 -      return &chunk->page[pcpu_page_idx(cpu, page_idx)];
 +      return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
  }
  
  static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
                (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT);
  }
  
 -static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
 -                                   int page_idx)
 +static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
 +                                  unsigned int cpu, int page_idx)
  {
 -      return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL;
 +      /* must not be used on pre-mapped chunk */
 +      WARN_ON(chunk->immutable);
 +
 +      return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx));
  }
  
  /* set the pointer to a chunk in a page struct */
@@@ -220,34 -212,6 +220,34 @@@ static struct pcpu_chunk *pcpu_get_page
        return (struct pcpu_chunk *)page->index;
  }
  
 +static void pcpu_next_unpop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
 +{
 +      *rs = find_next_zero_bit(chunk->populated, end, *rs);
 +      *re = find_next_bit(chunk->populated, end, *rs + 1);
 +}
 +
 +static void pcpu_next_pop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
 +{
 +      *rs = find_next_bit(chunk->populated, end, *rs);
 +      *re = find_next_zero_bit(chunk->populated, end, *rs + 1);
 +}
 +
 +/*
 + * (Un)populated page region iterators.  Iterate over (un)populated
 + * page regions between @start and @end in @chunk.  @rs and @re should
 + * be integer variables and will be set to the start and end page index
 + * of the current region.
 + */
 +#define pcpu_for_each_unpop_region(chunk, rs, re, start, end)             \
 +      for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \
 +           (rs) < (re);                                                   \
 +           (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end)))
 +
 +#define pcpu_for_each_pop_region(chunk, rs, re, start, end)               \
 +      for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end));   \
 +           (rs) < (re);                                                   \
 +           (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end)))
 +
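
/*
 * Usage sketch, illustration only (not part of the patch): counting the
 * populated pages of one unit with the iterator above.  The helper name
 * is hypothetical; the real users are pcpu_populate_chunk() and
 * pcpu_depopulate_chunk() further down.
 */
static int pcpu_example_nr_populated(struct pcpu_chunk *chunk)
{
	int rs, re, nr = 0;

	pcpu_for_each_pop_region(chunk, rs, re, 0, pcpu_unit_pages)
		nr += re - rs;

	return nr;
}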
  /**
   * pcpu_mem_alloc - allocate memory
   * @size: bytes to allocate
@@@ -326,21 -290,13 +326,21 @@@ static struct pcpu_chunk *pcpu_chunk_ad
        void *first_start = pcpu_first_chunk->vm->addr;
  
        /* is it in the first chunk? */
 -      if (addr >= first_start && addr < first_start + pcpu_chunk_size) {
 +      if (addr >= first_start && addr < first_start + pcpu_unit_size) {
                /* is it in the reserved area? */
                if (addr < first_start + pcpu_reserved_chunk_limit)
                        return pcpu_reserved_chunk;
                return pcpu_first_chunk;
        }
  
 +      /*
 +       * The address is relative to unit0 which might be unused and
 +       * thus unmapped.  Offset the address to the unit space of the
 +       * current processor before looking it up in the vmalloc
 +       * space.  Note that any possible cpu id can be used here, so
 +       * there's no need to worry about preemption or cpu hotplug.
 +       */
 +      addr += pcpu_unit_map[smp_processor_id()] * pcpu_unit_size;
        return pcpu_get_page_chunk(vmalloc_to_page(addr));
  }
  
@@@ -589,327 -545,125 +589,327 @@@ static void pcpu_free_area(struct pcpu_
  }
  
  /**
 - * pcpu_unmap - unmap pages out of a pcpu_chunk
 + * pcpu_get_pages_and_bitmap - get temp pages array and bitmap
   * @chunk: chunk of interest
 - * @page_start: page index of the first page to unmap
 - * @page_end: page index of the last page to unmap + 1
 - * @flush_tlb: whether to flush tlb or not
 + * @bitmapp: output parameter for bitmap
 + * @may_alloc: may allocate the array
   *
 - * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
 - * If @flush is true, vcache is flushed before unmapping and tlb
 - * after.
 + * Returns pointer to array of pointers to struct page and bitmap,
 + * both of which can be indexed with pcpu_page_idx().  The returned
 + * array is cleared to zero and *@bitmapp is copied from
 + * @chunk->populated.  Note that there is only one array and bitmap
 + * and access exclusion is the caller's responsibility.
 + *
 + * CONTEXT:
 + * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc.
 + * Otherwise, don't care.
 + *
 + * RETURNS:
 + * Pointer to temp pages array on success, NULL on failure.
   */
 -static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
 -                     bool flush_tlb)
 +static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk,
 +                                             unsigned long **bitmapp,
 +                                             bool may_alloc)
  {
 -      unsigned int last = nr_cpu_ids - 1;
 -      unsigned int cpu;
 +      static struct page **pages;
 +      static unsigned long *bitmap;
 +      size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]);
 +      size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) *
 +                           sizeof(unsigned long);
 +
 +      if (!pages || !bitmap) {
 +              if (may_alloc && !pages)
 +                      pages = pcpu_mem_alloc(pages_size);
 +              if (may_alloc && !bitmap)
 +                      bitmap = pcpu_mem_alloc(bitmap_size);
 +              if (!pages || !bitmap)
 +                      return NULL;
 +      }
  
 -      /* unmap must not be done on immutable chunk */
 -      WARN_ON(chunk->immutable);
 +      memset(pages, 0, pages_size);
 +      bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages);
  
 -      /*
 -       * Each flushing trial can be very expensive, issue flush on
 -       * the whole region at once rather than doing it for each cpu.
 -       * This could be an overkill but is more scalable.
 -       */
 -      flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
 -                         pcpu_chunk_addr(chunk, last, page_end));
 +      *bitmapp = bitmap;
 +      return pages;
 +}
  
 -      for_each_possible_cpu(cpu)
 -              unmap_kernel_range_noflush(
 -                              pcpu_chunk_addr(chunk, cpu, page_start),
 -                              (page_end - page_start) << PAGE_SHIFT);
 -
 -      /* ditto as flush_cache_vunmap() */
 -      if (flush_tlb)
 -              flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start),
 -                                     pcpu_chunk_addr(chunk, last, page_end));
 +/**
 + * pcpu_free_pages - free pages which were allocated for @chunk
 + * @chunk: chunk pages were allocated for
 + * @pages: array of pages to be freed, indexed by pcpu_page_idx()
 + * @populated: populated bitmap
 + * @page_start: page index of the first page to be freed
 + * @page_end: page index of the last page to be freed + 1
 + *
 + * Free pages [@page_start, @page_end) in @pages for all units.
 + * The pages were allocated for @chunk.
 + */
 +static void pcpu_free_pages(struct pcpu_chunk *chunk,
 +                          struct page **pages, unsigned long *populated,
 +                          int page_start, int page_end)
 +{
 +      unsigned int cpu;
 +      int i;
 +
 +      for_each_possible_cpu(cpu) {
 +              for (i = page_start; i < page_end; i++) {
 +                      struct page *page = pages[pcpu_page_idx(cpu, i)];
 +
 +                      if (page)
 +                              __free_page(page);
 +              }
 +      }
  }
  
  /**
 - * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
 - * @chunk: chunk to depopulate
 - * @off: offset to the area to depopulate
 - * @size: size of the area to depopulate in bytes
 - * @flush: whether to flush cache and tlb or not
 - *
 - * For each cpu, depopulate and unmap pages [@page_start,@page_end)
 - * from @chunk.  If @flush is true, vcache is flushed before unmapping
 - * and tlb after.
 - *
 - * CONTEXT:
 - * pcpu_alloc_mutex.
 + * pcpu_alloc_pages - allocates pages for @chunk
 + * @chunk: target chunk
 + * @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
 + * @populated: populated bitmap
 + * @page_start: page index of the first page to be allocated
 + * @page_end: page index of the last page to be allocated + 1
 + *
 + * Allocate pages [@page_start,@page_end) into @pages for all units.
 + * The allocation is for @chunk.  Percpu core doesn't care about the
 + * content of @pages and will pass it verbatim to pcpu_map_pages().
   */
 -static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
 -                                bool flush)
 +static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
 +                          struct page **pages, unsigned long *populated,
 +                          int page_start, int page_end)
  {
 -      int page_start = PFN_DOWN(off);
 -      int page_end = PFN_UP(off + size);
 -      int unmap_start = -1;
 -      int uninitialized_var(unmap_end);
 +      const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
        unsigned int cpu;
        int i;
  
 -      for (i = page_start; i < page_end; i++) {
 -              for_each_possible_cpu(cpu) {
 -                      struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
 +      for_each_possible_cpu(cpu) {
 +              for (i = page_start; i < page_end; i++) {
 +                      struct page **pagep = &pages[pcpu_page_idx(cpu, i)];
 +
 +                      *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
 +                      if (!*pagep) {
 +                              pcpu_free_pages(chunk, pages, populated,
 +                                              page_start, page_end);
 +                              return -ENOMEM;
 +                      }
 +              }
 +      }
 +      return 0;
 +}
  
 -                      if (!*pagep)
 -                              continue;
 +/**
 + * pcpu_pre_unmap_flush - flush cache prior to unmapping
 + * @chunk: chunk the regions to be flushed belongs to
 + * @page_start: page index of the first page to be flushed
 + * @page_end: page index of the last page to be flushed + 1
 + *
 + * Pages in [@page_start,@page_end) of @chunk are about to be
 + * unmapped.  Flush cache.  As each flushing trial can be very
 + * expensive, issue flush on the whole region at once rather than
 + * doing it for each cpu.  This could be an overkill but is more
 + * scalable.
 + */
 +static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
 +                               int page_start, int page_end)
 +{
 +      flush_cache_vunmap(
 +              pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
 +              pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
 +}
  
 -                      __free_page(*pagep);
 +static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
 +{
 +      unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT);
 +}
  
 -                      /*
 -                       * If it's partial depopulation, it might get
 -                       * populated or depopulated again.  Mark the
 -                       * page gone.
 -                       */
 -                      *pagep = NULL;
 +/**
 + * pcpu_unmap_pages - unmap pages out of a pcpu_chunk
 + * @chunk: chunk of interest
 + * @pages: pages array which can be used to pass information to free
 + * @populated: populated bitmap
 + * @page_start: page index of the first page to unmap
 + * @page_end: page index of the last page to unmap + 1
 + *
 + * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
 + * Corresponding elements in @pages were cleared by the caller and can
 + * be used to carry information to pcpu_free_pages() which will be
 + * called after all unmaps are finished.  The caller should call
 + * proper pre/post flush functions.
 + */
 +static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
 +                           struct page **pages, unsigned long *populated,
 +                           int page_start, int page_end)
 +{
 +      unsigned int cpu;
 +      int i;
  
 -                      unmap_start = unmap_start < 0 ? i : unmap_start;
 -                      unmap_end = i + 1;
 +      for_each_possible_cpu(cpu) {
 +              for (i = page_start; i < page_end; i++) {
 +                      struct page *page;
 +
 +                      page = pcpu_chunk_page(chunk, cpu, i);
 +                      WARN_ON(!page);
 +                      pages[pcpu_page_idx(cpu, i)] = page;
                }
 +              __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start),
 +                                 page_end - page_start);
        }
  
 -      if (unmap_start >= 0)
 -              pcpu_unmap(chunk, unmap_start, unmap_end, flush);
 +      for (i = page_start; i < page_end; i++)
 +              __clear_bit(i, populated);
  }
  
  /**
 - * pcpu_map - map pages into a pcpu_chunk
 + * pcpu_post_unmap_tlb_flush - flush TLB after unmapping
 + * @chunk: pcpu_chunk the regions to be flushed belong to
 + * @page_start: page index of the first page to be flushed
 + * @page_end: page index of the last page to be flushed + 1
 + *
 + * Pages [@page_start,@page_end) of @chunk have been unmapped.  Flush
 + * TLB for the regions.  This can be skipped if the area is to be
 + * returned to vmalloc as vmalloc will handle TLB flushing lazily.
 + *
 + * As with pcpu_pre_unmap_flush(), TLB flushing is also done at once
 + * for the whole region.
 + */
 +static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
 +                                    int page_start, int page_end)
 +{
 +      flush_tlb_kernel_range(
 +              pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
 +              pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
 +}
 +
 +static int __pcpu_map_pages(unsigned long addr, struct page **pages,
 +                          int nr_pages)
 +{
 +      return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT,
 +                                      PAGE_KERNEL, pages);
 +}
 +
 +/**
 + * pcpu_map_pages - map pages into a pcpu_chunk
   * @chunk: chunk of interest
 + * @pages: pages array containing pages to be mapped
 + * @populated: populated bitmap
   * @page_start: page index of the first page to map
   * @page_end: page index of the last page to map + 1
   *
 - * For each cpu, map pages [@page_start,@page_end) into @chunk.
 - * vcache is flushed afterwards.
 + * For each cpu, map pages [@page_start,@page_end) into @chunk.  The
 + * caller is responsible for calling pcpu_post_map_flush() after all
 + * mappings are complete.
 + *
 + * This function is responsible for setting corresponding bits in
 + * @chunk->populated bitmap and whatever is necessary for reverse
 + * lookup (addr -> chunk).
   */
 -static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
 +static int pcpu_map_pages(struct pcpu_chunk *chunk,
 +                        struct page **pages, unsigned long *populated,
 +                        int page_start, int page_end)
  {
 -      unsigned int last = nr_cpu_ids - 1;
 -      unsigned int cpu;
 -      int err;
 -
 -      /* map must not be done on immutable chunk */
 -      WARN_ON(chunk->immutable);
 +      unsigned int cpu, tcpu;
 +      int i, err;
  
        for_each_possible_cpu(cpu) {
 -              err = map_kernel_range_noflush(
 -                              pcpu_chunk_addr(chunk, cpu, page_start),
 -                              (page_end - page_start) << PAGE_SHIFT,
 -                              PAGE_KERNEL,
 -                              pcpu_chunk_pagep(chunk, cpu, page_start));
 +              err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start),
 +                                     &pages[pcpu_page_idx(cpu, page_start)],
 +                                     page_end - page_start);
                if (err < 0)
 -                      return err;
 +                      goto err;
 +      }
 +
 +      /* mapping successful, link chunk and mark populated */
 +      for (i = page_start; i < page_end; i++) {
 +              for_each_possible_cpu(cpu)
 +                      pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)],
 +                                          chunk);
 +              __set_bit(i, populated);
        }
  
 -      /* flush at once, please read comments in pcpu_unmap() */
 -      flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start),
 -                       pcpu_chunk_addr(chunk, last, page_end));
        return 0;
 +
 +err:
 +      for_each_possible_cpu(tcpu) {
 +              if (tcpu == cpu)
 +                      break;
 +              __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start),
 +                                 page_end - page_start);
 +      }
 +      return err;
 +}
 +
 +/**
 + * pcpu_post_map_flush - flush cache after mapping
 + * @chunk: pcpu_chunk the regions to be flushed belong to
 + * @page_start: page index of the first page to be flushed
 + * @page_end: page index of the last page to be flushed + 1
 + *
 + * Pages [@page_start,@page_end) of @chunk have been mapped.  Flush
 + * cache.
 + *
 + * As with pcpu_pre_unmap_flush(), the cache flush is done at once
 + * for the whole region.
 + */
 +static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
 +                              int page_start, int page_end)
 +{
 +      flush_cache_vmap(
 +              pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
 +              pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
 +}
 +
 +/**
 + * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
 + * @chunk: chunk to depopulate
 + * @off: offset to the area to depopulate
 + * @size: size of the area to depopulate in bytes
 + *
 + * For each cpu, depopulate and unmap pages [@page_start,@page_end)
 + * from @chunk.  The cache is flushed before unmapping; TLB flushing
 + * is left to vmalloc's lazy reclaim.
 + *
 + * CONTEXT:
 + * pcpu_alloc_mutex.
 + */
 +static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
 +{
 +      int page_start = PFN_DOWN(off);
 +      int page_end = PFN_UP(off + size);
 +      struct page **pages;
 +      unsigned long *populated;
 +      int rs, re;
 +
 +      /* quick path, check whether it's empty already */
 +      pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
 +              if (rs == page_start && re == page_end)
 +                      return;
 +              break;
 +      }
 +
 +      /* immutable chunks can't be depopulated */
 +      WARN_ON(chunk->immutable);
 +
 +      /*
 +       * If control reaches here, there must have been at least one
 +       * successful population attempt so the temp pages array must
 +       * be available now.
 +       */
 +      pages = pcpu_get_pages_and_bitmap(chunk, &populated, false);
 +      BUG_ON(!pages);
 +
 +      /* unmap and free */
 +      pcpu_pre_unmap_flush(chunk, page_start, page_end);
 +
 +      pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
 +              pcpu_unmap_pages(chunk, pages, populated, rs, re);
 +
 +      /* no need to flush tlb, vmalloc will handle it lazily */
 +
 +      pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
 +              pcpu_free_pages(chunk, pages, populated, rs, re);
 +
 +      /* commit new bitmap */
 +      bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
  }
  
  /**
   */
  static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
  {
 -      const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
        int page_start = PFN_DOWN(off);
        int page_end = PFN_UP(off + size);
 -      int map_start = -1;
 -      int uninitialized_var(map_end);
 +      int free_end = page_start, unmap_end = page_start;
 +      struct page **pages;
 +      unsigned long *populated;
        unsigned int cpu;
 -      int i;
 +      int rs, re, rc;
  
 -      for (i = page_start; i < page_end; i++) {
 -              if (pcpu_chunk_page_occupied(chunk, i)) {
 -                      if (map_start >= 0) {
 -                              if (pcpu_map(chunk, map_start, map_end))
 -                                      goto err;
 -                              map_start = -1;
 -                      }
 -                      continue;
 -              }
 +      /* quick path, check whether all pages are already there */
 +      pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) {
 +              if (rs == page_start && re == page_end)
 +                      goto clear;
 +              break;
 +      }
  
 -              map_start = map_start < 0 ? i : map_start;
 -              map_end = i + 1;
 +      /* need to allocate and map pages, this chunk can't be immutable */
 +      WARN_ON(chunk->immutable);
  
 -              for_each_possible_cpu(cpu) {
 -                      struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
 +      pages = pcpu_get_pages_and_bitmap(chunk, &populated, true);
 +      if (!pages)
 +              return -ENOMEM;
  
 -                      *pagep = alloc_pages_node(cpu_to_node(cpu),
 -                                                alloc_mask, 0);
 -                      if (!*pagep)
 -                              goto err;
 -                      pcpu_set_page_chunk(*pagep, chunk);
 -              }
 +      /* alloc and map */
 +      pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
 +              rc = pcpu_alloc_pages(chunk, pages, populated, rs, re);
 +              if (rc)
 +                      goto err_free;
 +              free_end = re;
        }
  
 -      if (map_start >= 0 && pcpu_map(chunk, map_start, map_end))
 -              goto err;
 +      pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
 +              rc = pcpu_map_pages(chunk, pages, populated, rs, re);
 +              if (rc)
 +                      goto err_unmap;
 +              unmap_end = re;
 +      }
 +      pcpu_post_map_flush(chunk, page_start, page_end);
  
 +      /* commit new bitmap */
 +      bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
 +clear:
        for_each_possible_cpu(cpu)
 -              memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0,
 -                     size);
 -
 +              memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
        return 0;
 -err:
 -      /* likely under heavy memory pressure, give memory back */
 -      pcpu_depopulate_chunk(chunk, off, size, true);
 -      return -ENOMEM;
 +
 +err_unmap:
 +      pcpu_pre_unmap_flush(chunk, page_start, unmap_end);
 +      pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end)
 +              pcpu_unmap_pages(chunk, pages, populated, rs, re);
 +      pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end);
 +err_free:
 +      pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end)
 +              pcpu_free_pages(chunk, pages, populated, rs, re);
 +      return rc;
  }
  
  static void free_pcpu_chunk(struct pcpu_chunk *chunk)
@@@ -1003,8 -747,9 +1003,8 @@@ static struct pcpu_chunk *alloc_pcpu_ch
        chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
        chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
        chunk->map[chunk->map_used++] = pcpu_unit_size;
 -      chunk->page = chunk->page_ar;
  
-       chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL);
+       chunk->vm = get_vm_area(pcpu_chunk_size, VM_ALLOC);
        if (!chunk->vm) {
                free_pcpu_chunk(chunk);
                return NULL;
@@@ -1102,7 -847,6 +1102,7 @@@ area_found
  
        mutex_unlock(&pcpu_alloc_mutex);
  
 +      /* return address relative to unit0 */
        return __addr_to_pcpu_ptr(chunk->vm->addr + off);
  
  fail_unlock:
@@@ -1184,7 -928,7 +1184,7 @@@ static void pcpu_reclaim(struct work_st
        mutex_unlock(&pcpu_alloc_mutex);
  
        list_for_each_entry_safe(chunk, next, &todo, list) {
 -              pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
 +              pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size);
                free_pcpu_chunk(chunk);
        }
  }
@@@ -1232,16 -976,26 +1232,16 @@@ EXPORT_SYMBOL_GPL(free_percpu)
  
  /**
   * pcpu_setup_first_chunk - initialize the first percpu chunk
 - * @get_page_fn: callback to fetch page pointer
   * @static_size: the size of static percpu area in bytes
 - * @reserved_size: the size of reserved percpu area in bytes
 + * @reserved_size: the size of reserved percpu area in bytes, 0 for none
   * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
 - * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto
 - * @base_addr: mapped address, NULL for auto
 - * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary
 + * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE
 + * @base_addr: mapped address
 + * @unit_map: cpu -> unit map, NULL for sequential mapping
   *
   * Initialize the first percpu chunk which contains the kernel static
   * percpu area.  This function is to be called from arch percpu area
 - * setup path.  The first two parameters are mandatory.  The rest are
 - * optional.
 - *
 - * @get_page_fn() should return pointer to percpu page given cpu
 - * number and page number.  It should at least return enough pages to
 - * cover the static area.  The returned pages for static area should
 - * have been initialized with valid data.  If @unit_size is specified,
 - * it can also return pages after the static area.  NULL return
 - * indicates end of pages for the cpu.  Note that @get_page_fn() must
 - * return the same number of pages for all cpus.
 + * setup path.
   *
   * @reserved_size, if non-zero, specifies the amount of bytes to
   * reserve after the static area in the first chunk.  This reserves
   * non-negative value makes percpu leave alone the area beyond
   * @static_size + @reserved_size + @dyn_size.
   *
 - * @unit_size, if non-negative, specifies unit size and must be
 - * aligned to PAGE_SIZE and equal to or larger than @static_size +
 - * @reserved_size + if non-negative, @dyn_size.
 - *
 - * Non-null @base_addr means that the caller already allocated virtual
 - * region for the first chunk and mapped it.  percpu must not mess
 - * with the chunk.  Note that @base_addr with 0 @unit_size or non-NULL
 - * @populate_pte_fn doesn't make any sense.
 + * @unit_size specifies unit size and must be aligned to PAGE_SIZE and
 + * equal to or larger than @static_size + @reserved_size + if
 + * non-negative, @dyn_size.
   *
 - * @populate_pte_fn is used to populate the pagetable.  NULL means the
 - * caller already populated the pagetable.
 + * The caller should have mapped the first chunk at @base_addr and
 + * copied static data to each unit.
   *
   * If the first chunk ends up with both reserved and dynamic areas, it
   * is served by two chunks - one to serve the core static and reserved
   * The determined pcpu_unit_size which can be used to initialize
   * percpu access.
   */
 -size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 -                                   size_t static_size, size_t reserved_size,
 -                                   ssize_t dyn_size, ssize_t unit_size,
 -                                   void *base_addr,
 -                                   pcpu_populate_pte_fn_t populate_pte_fn)
 +size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size,
 +                                   ssize_t dyn_size, size_t unit_size,
 +                                   void *base_addr, const int *unit_map)
  {
        static struct vm_struct first_vm;
        static int smap[2], dmap[2];
        size_t size_sum = static_size + reserved_size +
                          (dyn_size >= 0 ? dyn_size : 0);
        struct pcpu_chunk *schunk, *dchunk = NULL;
 -      unsigned int cpu;
 -      int nr_pages;
 -      int err, i;
 +      unsigned int cpu, tcpu;
 +      int i;
  
 -      /* santiy checks */
 +      /* sanity checks */
        BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
                     ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
        BUG_ON(!static_size);
 -      if (unit_size >= 0) {
 -              BUG_ON(unit_size < size_sum);
 -              BUG_ON(unit_size & ~PAGE_MASK);
 -              BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE);
 -      } else
 -              BUG_ON(base_addr);
 -      BUG_ON(base_addr && populate_pte_fn);
 -
 -      if (unit_size >= 0)
 -              pcpu_unit_pages = unit_size >> PAGE_SHIFT;
 -      else
 -              pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT,
 -                                      PFN_UP(size_sum));
 +      BUG_ON(!base_addr);
 +      BUG_ON(unit_size < size_sum);
 +      BUG_ON(unit_size & ~PAGE_MASK);
 +      BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE);
 +
 +      /* determine number of units and verify and initialize pcpu_unit_map */
 +      if (unit_map) {
 +              int first_unit = INT_MAX, last_unit = INT_MIN;
 +
 +              for_each_possible_cpu(cpu) {
 +                      int unit = unit_map[cpu];
 +
 +                      BUG_ON(unit < 0);
 +                      for_each_possible_cpu(tcpu) {
 +                              if (tcpu == cpu)
 +                                      break;
 +                              /* the mapping should be one-to-one */
 +                              BUG_ON(unit_map[tcpu] == unit);
 +                      }
 +
 +                      if (unit < first_unit) {
 +                              pcpu_first_unit_cpu = cpu;
 +                              first_unit = unit;
 +                      }
 +                      if (unit > last_unit) {
 +                              pcpu_last_unit_cpu = cpu;
 +                              last_unit = unit;
 +                      }
 +              }
 +              pcpu_nr_units = last_unit + 1;
 +              pcpu_unit_map = unit_map;
 +      } else {
 +              int *identity_map;
 +
 +              /* #units == #cpus, identity mapped */
-               identity_map = alloc_bootmem(num_possible_cpus() *
++              identity_map = alloc_bootmem(nr_cpu_ids *
 +                                           sizeof(identity_map[0]));
  
-               pcpu_nr_units = num_possible_cpus();
 +              for_each_possible_cpu(cpu)
 +                      identity_map[cpu] = cpu;
 +
 +              pcpu_first_unit_cpu = 0;
 +              pcpu_last_unit_cpu = pcpu_nr_units - 1;
++              pcpu_nr_units = nr_cpu_ids;
 +              pcpu_unit_map = identity_map;
 +      }
 +
 +      /* determine basic parameters */
 +      pcpu_unit_pages = unit_size >> PAGE_SHIFT;
        pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
 -      pcpu_chunk_size = nr_cpu_ids * pcpu_unit_size;
 -      pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
 -              + nr_cpu_ids * pcpu_unit_pages * sizeof(struct page *);
 +      pcpu_chunk_size = pcpu_nr_units * pcpu_unit_size;
 +      pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
 +              BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
  
        if (dyn_size < 0)
                dyn_size = pcpu_unit_size - static_size - reserved_size;
  
 +      first_vm.flags = VM_ALLOC;
 +      first_vm.size = pcpu_chunk_size;
 +      first_vm.addr = base_addr;
 +
        /*
         * Allocate chunk slots.  The additional last slot is for
         * empty chunks.
        schunk->vm = &first_vm;
        schunk->map = smap;
        schunk->map_alloc = ARRAY_SIZE(smap);
 -      schunk->page = schunk->page_ar;
 +      schunk->immutable = true;
 +      bitmap_fill(schunk->populated, pcpu_unit_pages);
  
        if (reserved_size) {
                schunk->free_size = reserved_size;
  
        /* init dynamic chunk if necessary */
        if (dyn_size) {
 -              dchunk = alloc_bootmem(sizeof(struct pcpu_chunk));
 +              dchunk = alloc_bootmem(pcpu_chunk_struct_size);
                INIT_LIST_HEAD(&dchunk->list);
                dchunk->vm = &first_vm;
                dchunk->map = dmap;
                dchunk->map_alloc = ARRAY_SIZE(dmap);
 -              dchunk->page = schunk->page_ar; /* share page map with schunk */
 +              dchunk->immutable = true;
 +              bitmap_fill(dchunk->populated, pcpu_unit_pages);
  
                dchunk->contig_hint = dchunk->free_size = dyn_size;
                dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit;
                dchunk->map[dchunk->map_used++] = dchunk->free_size;
        }
  
 -      /* allocate vm address */
 -      first_vm.flags = VM_ALLOC;
 -      first_vm.size = pcpu_chunk_size;
 -
 -      if (!base_addr)
 -              vm_area_register_early(&first_vm, PAGE_SIZE);
 -      else {
 -              /*
 -               * Pages already mapped.  No need to remap into
 -               * vmalloc area.  In this case the first chunks can't
 -               * be mapped or unmapped by percpu and are marked
 -               * immutable.
 -               */
 -              first_vm.addr = base_addr;
 -              schunk->immutable = true;
 -              if (dchunk)
 -                      dchunk->immutable = true;
 -      }
 -
 -      /* assign pages */
 -      nr_pages = -1;
 -      for_each_possible_cpu(cpu) {
 -              for (i = 0; i < pcpu_unit_pages; i++) {
 -                      struct page *page = get_page_fn(cpu, i);
 -
 -                      if (!page)
 -                              break;
 -                      *pcpu_chunk_pagep(schunk, cpu, i) = page;
 -              }
 -
 -              BUG_ON(i < PFN_UP(static_size));
 -
 -              if (nr_pages < 0)
 -                      nr_pages = i;
 -              else
 -                      BUG_ON(nr_pages != i);
 -      }
 -
 -      /* map them */
 -      if (populate_pte_fn) {
 -              for_each_possible_cpu(cpu)
 -                      for (i = 0; i < nr_pages; i++)
 -                              populate_pte_fn(pcpu_chunk_addr(schunk,
 -                                                              cpu, i));
 -
 -              err = pcpu_map(schunk, 0, nr_pages);
 -              if (err)
 -                      panic("failed to setup static percpu area, err=%d\n",
 -                            err);
 -      }
 -
        /* link the first chunk in */
        pcpu_first_chunk = dchunk ?: schunk;
        pcpu_chunk_relocate(pcpu_first_chunk, -1);
  
        /* we're done */
 -      pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0);
 +      pcpu_base_addr = schunk->vm->addr;
        return pcpu_unit_size;
  }
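To make the basic-parameter arithmetic above concrete, a minimal standalone sketch; the 4k page, 64k unit and 4-unit figures are assumed purely for illustration:

#include <stdio.h>

#define PAGE_SHIFT	12
#define BITS_PER_LONG	(8 * sizeof(long))
#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

int main(void)
{
	unsigned long unit_size = 64 * 1024, nr_units = 4;
	unsigned long unit_pages = unit_size >> PAGE_SHIFT;
	unsigned long chunk_size = nr_units * unit_size;

	/* mirrors pcpu_unit_pages/pcpu_chunk_size and the populated bitmap */
	printf("unit_pages=%lu chunk_size=%lu bitmap_longs=%lu\n",
	       unit_pages, chunk_size,
	       (unsigned long)BITS_TO_LONGS(unit_pages));
	return 0;
}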
  
 -/*
 - * Embedding first chunk setup helper.
 - */
 -static void *pcpue_ptr __initdata;
 -static size_t pcpue_size __initdata;
 -static size_t pcpue_unit_size __initdata;
 -
 -static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
 +static size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size,
 +                               ssize_t *dyn_sizep)
  {
 -      size_t off = (size_t)pageno << PAGE_SHIFT;
 +      size_t size_sum;
  
 -      if (off >= pcpue_size)
 -              return NULL;
 +      size_sum = PFN_ALIGN(static_size + reserved_size +
 +                           (*dyn_sizep >= 0 ? *dyn_sizep : 0));
 +      if (*dyn_sizep != 0)
 +              *dyn_sizep = size_sum - static_size - reserved_size;
  
 -      return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off);
 +      return size_sum;
  }
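A standalone sketch of what this helper computes, assuming a 4096-byte page and made-up sizes; when the dynamic size is -1 (auto) it simply absorbs the padding needed for page alignment:

#include <stdio.h>
#include <stddef.h>

#define PAGE_SIZE	4096UL
#define PFN_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int main(void)
{
	size_t static_size = 45000, reserved_size = 8192;
	long dyn_size = -1;				/* -1 means auto */

	size_t size_sum = PFN_ALIGN(static_size + reserved_size +
				    (dyn_size >= 0 ? dyn_size : 0));
	if (dyn_size != 0)
		dyn_size = size_sum - static_size - reserved_size;

	/* prints size_sum=53248 dyn_size=56 */
	printf("size_sum=%zu dyn_size=%ld\n", size_sum, dyn_size);
	return 0;
}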
  
  /**
   * @static_size: the size of static percpu area in bytes
   * @reserved_size: the size of reserved percpu area in bytes
   * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
 - * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto
   *
   * This is a helper to ease setting up embedded first percpu chunk and
   * can be called where pcpu_setup_first_chunk() is expected.
   * page size.
   *
   * When @dyn_size is positive, dynamic area might be larger than
 - * specified to fill page alignment.  Also, when @dyn_size is auto,
 - * @dyn_size does not fill the whole first chunk but only what's
 - * necessary for page alignment after static and reserved areas.
 + * specified to fill page alignment.  When @dyn_size is auto,
 + * @dyn_size is just big enough to fill page alignment after static
 + * and reserved areas.
   *
   * If the needed size is smaller than the minimum or specified unit
   * size, the leftover is returned to the bootmem allocator.
   * percpu access on success, -errno on failure.
   */
  ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
 -                                    ssize_t dyn_size, ssize_t unit_size)
 +                                    ssize_t dyn_size)
  {
 -      size_t chunk_size;
 +      size_t size_sum, unit_size, chunk_size;
 +      void *base;
        unsigned int cpu;
  
        /* determine parameters and allocate */
 -      pcpue_size = PFN_ALIGN(static_size + reserved_size +
 -                             (dyn_size >= 0 ? dyn_size : 0));
 -      if (dyn_size != 0)
 -              dyn_size = pcpue_size - static_size - reserved_size;
 -
 -      if (unit_size >= 0) {
 -              BUG_ON(unit_size < pcpue_size);
 -              pcpue_unit_size = unit_size;
 -      } else
 -              pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
 -
 -      chunk_size = pcpue_unit_size * nr_cpu_ids;
 -
 -      pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
 -                                          __pa(MAX_DMA_ADDRESS));
 -      if (!pcpue_ptr) {
 +      size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
 +
 +      unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
-       chunk_size = unit_size * num_possible_cpus();
++      chunk_size = unit_size * nr_cpu_ids;
 +
 +      base = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
 +                                     __pa(MAX_DMA_ADDRESS));
 +      if (!base) {
                pr_warning("PERCPU: failed to allocate %zu bytes for "
                           "embedding\n", chunk_size);
                return -ENOMEM;
        }
  
        /* return the leftover and copy */
-       for_each_possible_cpu(cpu) {
+       for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
 -              void *ptr = pcpue_ptr + cpu * pcpue_unit_size;
 +              void *ptr = base + cpu * unit_size;
  
-               free_bootmem(__pa(ptr + size_sum), unit_size - size_sum);
-               memcpy(ptr, __per_cpu_load, static_size);
+               if (cpu_possible(cpu)) {
 -                      free_bootmem(__pa(ptr + pcpue_size),
 -                                   pcpue_unit_size - pcpue_size);
++                      free_bootmem(__pa(ptr + size_sum),
++                                   unit_size - size_sum);
+                       memcpy(ptr, __per_cpu_load, static_size);
+               } else
 -                      free_bootmem(__pa(ptr), pcpue_unit_size);
++                      free_bootmem(__pa(ptr), unit_size);
        }
  
        /* we're ready, commit */
        pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
 -              pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size);
 +              size_sum >> PAGE_SHIFT, base, static_size);
 +
 +      return pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
 +                                    unit_size, base, NULL);
 +}
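To illustrate how the embedded chunk is carved up and the leftover handed back, a standalone sketch mirroring the copy/trim loop above; nr_cpu_ids, the possible-CPU hole and all sizes are invented for the example:

#include <stdio.h>
#include <stdbool.h>
#include <stddef.h>

#define NR_CPU_IDS	4

/* pretend cpu 2 is a hole in the possible map */
static bool cpu_possible(unsigned int cpu)
{
	return cpu != 2;
}

int main(void)
{
	size_t unit_size = 64 * 1024, size_sum = 53248, freed = 0;
	unsigned int cpu;

	for (cpu = 0; cpu < NR_CPU_IDS; cpu++)
		freed += cpu_possible(cpu) ? unit_size - size_sum : unit_size;

	printf("chunk=%zu bytes, returned to bootmem=%zu bytes\n",
	       (size_t)NR_CPU_IDS * unit_size, freed);
	return 0;
}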
 +
 +/**
 + * pcpu_4k_first_chunk - map the first chunk using PAGE_SIZE pages
 + * @static_size: the size of static percpu area in bytes
 + * @reserved_size: the size of reserved percpu area in bytes
 + * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
 + * @free_fn: function to free percpu page, always called with PAGE_SIZE
 + * @populate_pte_fn: function to populate pte
 + *
 + * This is a helper to ease setting up a page-mapped first percpu
 + * chunk and can be called where pcpu_setup_first_chunk() is expected.
 + *
 + * This is the basic allocator.  The static percpu area is allocated
 + * page-by-page into the vmalloc area.
 + *
 + * RETURNS:
 + * The determined pcpu_unit_size which can be used to initialize
 + * percpu access on success, -errno on failure.
 + */
 +ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size,
 +                                 pcpu_fc_alloc_fn_t alloc_fn,
 +                                 pcpu_fc_free_fn_t free_fn,
 +                                 pcpu_fc_populate_pte_fn_t populate_pte_fn)
 +{
 +      static struct vm_struct vm;
 +      int unit_pages;
 +      size_t pages_size;
 +      struct page **pages;
 +      unsigned int cpu;
 +      int i, j;
 +      ssize_t ret;
 +
 +      unit_pages = PFN_UP(max_t(size_t, static_size + reserved_size,
 +                                PCPU_MIN_UNIT_SIZE));
 +
 +      /* unaligned allocations can't be freed, round up to page size */
-       pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
-                              sizeof(pages[0]));
++      pages_size = PFN_ALIGN(unit_pages * nr_cpu_ids * sizeof(pages[0]));
 +      pages = alloc_bootmem(pages_size);
 +
 +      /* allocate pages */
 +      j = 0;
 +      for_each_possible_cpu(cpu)
 +              for (i = 0; i < unit_pages; i++) {
 +                      void *ptr;
 +
 +                      ptr = alloc_fn(cpu, PAGE_SIZE);
 +                      if (!ptr) {
 +                              pr_warning("PERCPU: failed to allocate "
 +                                         "4k page for cpu%u\n", cpu);
 +                              goto enomem;
 +                      }
 +                      pages[j++] = virt_to_page(ptr);
 +              }
 +
 +      /* allocate vm area, map the pages and copy static data */
 +      vm.flags = VM_ALLOC;
-       vm.size = num_possible_cpus() * unit_pages << PAGE_SHIFT;
++      vm.size = nr_cpu_ids * unit_pages << PAGE_SHIFT;
 +      vm_area_register_early(&vm, PAGE_SIZE);
 +
 +      for_each_possible_cpu(cpu) {
 +              unsigned long unit_addr = (unsigned long)vm.addr +
 +                      (cpu * unit_pages << PAGE_SHIFT);
 +
 +              for (i = 0; i < unit_pages; i++)
 +                      populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
 +
 +              /* pte already populated, the following shouldn't fail */
 +              ret = __pcpu_map_pages(unit_addr, &pages[cpu * unit_pages],
 +                                     unit_pages);
 +              if (ret < 0)
 +                      panic("failed to map percpu area, err=%zd\n", ret);
 +
 +              /*
 +               * FIXME: Archs with virtual cache should flush local
 +               * cache for the linear mapping here - something
 +               * equivalent to flush_cache_vmap() on the local cpu.
 +               * flush_cache_vmap() can't be used as most supporting
 +               * data structures are not set up yet.
 +               */
 +
 +              /* copy static data */
 +              memcpy((void *)unit_addr, __per_cpu_load, static_size);
 +      }
 +
 +      /* we're ready, commit */
 +      pr_info("PERCPU: %d 4k pages per cpu, static data %zu bytes\n",
 +              unit_pages, static_size);
 +
 +      ret = pcpu_setup_first_chunk(static_size, reserved_size, -1,
 +                                   unit_pages << PAGE_SHIFT, vm.addr, NULL);
 +      goto out_free_ar;
 +
 +enomem:
 +      while (--j >= 0)
 +              free_fn(page_address(pages[j]), PAGE_SIZE);
 +      ret = -ENOMEM;
 +out_free_ar:
 +      free_bootmem(__pa(pages), pages_size);
 +      return ret;
 +}
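A hedged sketch of the arch glue this helper expects; the callback signatures follow the pcpu_fc_*_fn_t types used above, but the pcpu4k_* names and bodies (node-blind bootmem, the x86 populate_extra_pte() helper) are simplified placeholders, not any real architecture's implementation:

/* hypothetical arch glue; pcpu4k_* names and bodies are illustrative */
static void * __init pcpu4k_alloc(unsigned int cpu, size_t size)
{
	/* node-blind bootmem allocation, one page at a time */
	return __alloc_bootmem(size, size, __pa(MAX_DMA_ADDRESS));
}

static void __init pcpu4k_free(void *ptr, size_t size)
{
	free_bootmem(__pa(ptr), size);
}

static void __init pcpu4k_populate_pte(unsigned long addr)
{
	populate_extra_pte(addr);	/* x86 helper, shown as an example */
}

static ssize_t __init pcpu4k_setup(void)
{
	return pcpu_4k_first_chunk(__per_cpu_end - __per_cpu_start,
				   PERCPU_MODULE_RESERVE,
				   pcpu4k_alloc, pcpu4k_free,
				   pcpu4k_populate_pte);
}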
 +
 +/*
 + * Large page remapping first chunk setup helper
 + */
 +#ifdef CONFIG_NEED_MULTIPLE_NODES
 +
 +/**
 + * pcpu_lpage_build_unit_map - build unit_map for large page remapping
 + * @static_size: the size of static percpu area in bytes
 + * @reserved_size: the size of reserved percpu area in bytes
 + * @dyn_sizep: in/out parameter for dynamic size, -1 for auto
 + * @unit_sizep: out parameter for unit size
 + * @unit_map: unit_map to be filled
 + * @cpu_distance_fn: callback to determine distance between cpus
 + *
 + * This function builds the cpu -> unit map and determines other
 + * parameters considering the needed percpu size, the large page size
 + * and the NUMA distances between CPUs.
 + *
 + * CPUs whose distance is LOCAL_DISTANCE in both directions are grouped
 + * together and may share units in the same large page.  The returned
 + * configuration is guaranteed to put CPUs on different nodes on
 + * different large pages and to use >=75% of the allocated virtual
 + * address space.
 + *
 + * RETURNS:
 + * On success, fills in @unit_map, sets *@dyn_sizep, *@unit_sizep and
 + * returns the number of units to be allocated.  -errno on failure.
 + */
 +int __init pcpu_lpage_build_unit_map(size_t static_size, size_t reserved_size,
 +                                   ssize_t *dyn_sizep, size_t *unit_sizep,
 +                                   size_t lpage_size, int *unit_map,
 +                                   pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
 +{
 +      static int group_map[NR_CPUS] __initdata;
 +      static int group_cnt[NR_CPUS] __initdata;
 +      int group_cnt_max = 0;
 +      size_t size_sum, min_unit_size, alloc_size;
 +      int upa, max_upa, uninitialized_var(best_upa);  /* units_per_alloc */
 +      int last_allocs;
 +      unsigned int cpu, tcpu;
 +      int group, unit;
 +
 +      /*
 +       * Determine min_unit_size, alloc_size and max_upa such that
 +       * alloc_size is a multiple of lpage_size and is the smallest
 +       * size which can accommodate 4k aligned segments equal to or
 +       * larger than min_unit_size.
 +       */
 +      size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, dyn_sizep);
 +      min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
 +
 +      alloc_size = roundup(min_unit_size, lpage_size);
 +      upa = alloc_size / min_unit_size;
 +      while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
 +              upa--;
 +      max_upa = upa;
 +
 +      /* group cpus according to their proximity */
 +      for_each_possible_cpu(cpu) {
 +              group = 0;
 +      next_group:
 +              for_each_possible_cpu(tcpu) {
 +                      if (cpu == tcpu)
 +                              break;
 +                      if (group_map[tcpu] == group &&
 +                          (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
 +                           cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
 +                              group++;
 +                              goto next_group;
 +                      }
 +              }
 +              group_map[cpu] = group;
 +              group_cnt[group]++;
 +              group_cnt_max = max(group_cnt_max, group_cnt[group]);
 +      }
 +
 +      /*
 +       * Expand unit size until address space usage goes over 75%
 +       * and then as much as possible without using more address
 +       * space.
 +       */
 +      last_allocs = INT_MAX;
 +      for (upa = max_upa; upa; upa--) {
 +              int allocs = 0, wasted = 0;
 +
 +              if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
 +                      continue;
 +
 +              for (group = 0; group_cnt[group]; group++) {
 +                      int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
 +                      allocs += this_allocs;
 +                      wasted += this_allocs * upa - group_cnt[group];
 +              }
 +
 +              /*
 +               * Don't accept if wastage is over 25%.  The
 +               * greater-than comparison ensures upa==1 always
 +               * passes the following check.
 +               */
 +              if (wasted > num_possible_cpus() / 3)
 +                      continue;
 +
 +              /* and then don't consume more memory */
 +              if (allocs > last_allocs)
 +                      break;
 +              last_allocs = allocs;
 +              best_upa = upa;
 +      }
 +      *unit_sizep = alloc_size / best_upa;
  
 -      return pcpu_setup_first_chunk(pcpue_get_page, static_size,
 -                                    reserved_size, dyn_size,
 -                                    pcpue_unit_size, pcpue_ptr, NULL);
 +      /* assign units to cpus accordingly */
 +      unit = 0;
 +      for (group = 0; group_cnt[group]; group++) {
 +              for_each_possible_cpu(cpu)
 +                      if (group_map[cpu] == group)
 +                              unit_map[cpu] = unit++;
 +              unit = roundup(unit, best_upa);
 +      }
 +
 +      return unit;    /* unit contains aligned number of units */
 +}
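A hypothetical cpu_distance_fn for the builder above; it simply reuses the NUMA node distance of the two CPUs' nodes, with early_cpu_to_node() standing in for whatever early CPU-to-node mapping the arch provides:

/* hypothetical distance callback; early_cpu_to_node() is the x86 early
 * mapping, other archs would substitute their own */
static int __init pcpul_cpu_distance(unsigned int from, unsigned int to)
{
	return node_distance(early_cpu_to_node(from),
			     early_cpu_to_node(to));
}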
 +
 +struct pcpul_ent {
 +      void            *ptr;
 +      void            *map_addr;
 +};
 +
 +static size_t pcpul_size;
 +static size_t pcpul_lpage_size;
 +static int pcpul_nr_lpages;
 +static struct pcpul_ent *pcpul_map;
 +
 +static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map,
 +                                   unsigned int *cpup)
 +{
 +      unsigned int cpu;
 +
 +      for_each_possible_cpu(cpu)
 +              if (unit_map[cpu] == unit) {
 +                      if (cpup)
 +                              *cpup = cpu;
 +                      return true;
 +              }
 +
 +      return false;
 +}
 +
 +static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size,
 +                                      size_t reserved_size, size_t dyn_size,
 +                                      size_t unit_size, size_t lpage_size,
 +                                      const int *unit_map, int nr_units)
 +{
 +      int width = 1, v = nr_units;
 +      char empty_str[] = "--------";
 +      int upl, lpl;   /* units per lpage, lpage per line */
 +      unsigned int cpu;
 +      int lpage, unit;
 +
 +      while (v /= 10)
 +              width++;
 +      empty_str[min_t(int, width, sizeof(empty_str) - 1)] = '\0';
 +
 +      upl = max_t(int, lpage_size / unit_size, 1);
 +      lpl = rounddown_pow_of_two(max_t(int, 60 / (upl * (width + 1) + 2), 1));
 +
 +      printk("%spcpu-lpage: sta/res/dyn=%zu/%zu/%zu unit=%zu lpage=%zu", lvl,
 +             static_size, reserved_size, dyn_size, unit_size, lpage_size);
 +
 +      for (lpage = 0, unit = 0; unit < nr_units; unit++) {
 +              if (!(unit % upl)) {
 +                      if (!(lpage++ % lpl)) {
 +                              printk("\n");
 +                              printk("%spcpu-lpage: ", lvl);
 +                      } else
 +                              printk("| ");
 +              }
 +              if (pcpul_unit_to_cpu(unit, unit_map, &cpu))
 +                      printk("%0*d ", width, cpu);
 +              else
 +                      printk("%s ", empty_str);
 +      }
 +      printk("\n");
 +}
 +
 +/**
 + * pcpu_lpage_first_chunk - remap the first percpu chunk using large page
 + * @static_size: the size of static percpu area in bytes
 + * @reserved_size: the size of reserved percpu area in bytes
 + * @dyn_size: free size for dynamic allocation in bytes
 + * @unit_size: unit size in bytes
 + * @lpage_size: the size of a large page
 + * @unit_map: cpu -> unit mapping
 + * @nr_units: the number of units
 + * @alloc_fn: function to allocate percpu lpage, always called with lpage_size
 + * @free_fn: function to free percpu memory, @size <= lpage_size
 + * @map_fn: function to map percpu lpage, always called with lpage_size
 + *
 + * This allocator uses large page to build and map the first chunk.
 + * Unlike other helpers, the caller should always specify @dyn_size
 + * and @unit_size.  These parameters along with @unit_map and
 + * @nr_units can be determined using pcpu_lpage_build_unit_map().
 + * This two-stage initialization allows arch code to evaluate the
 + * parameters before committing to them.
 + *
 + * Large pages are allocated as directed by @unit_map and other
 + * parameters and mapped into vmalloc space.  Unused holes are returned
 + * to the page allocator.  Note that these holes end up mapped twice -
 + * once in the linear physical mapping and once in the vmalloc area of
 + * the first percpu chunk.  Depending on the architecture, this might
 + * cause problems when changing page attributes of the returned area.
 + * These double mapped areas can be detected using
 + * pcpu_lpage_remapped().
 + *
 + * RETURNS:
 + * The determined pcpu_unit_size which can be used to initialize
 + * percpu access on success, -errno on failure.
 + */
 +ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size,
 +                                    size_t dyn_size, size_t unit_size,
 +                                    size_t lpage_size, const int *unit_map,
 +                                    int nr_units,
 +                                    pcpu_fc_alloc_fn_t alloc_fn,
 +                                    pcpu_fc_free_fn_t free_fn,
 +                                    pcpu_fc_map_fn_t map_fn)
 +{
 +      static struct vm_struct vm;
 +      size_t chunk_size = unit_size * nr_units;
 +      size_t map_size;
 +      unsigned int cpu;
 +      ssize_t ret;
 +      int i, j, unit;
 +
 +      pcpul_lpage_dump_cfg(KERN_DEBUG, static_size, reserved_size, dyn_size,
 +                           unit_size, lpage_size, unit_map, nr_units);
 +
 +      BUG_ON(chunk_size % lpage_size);
 +
 +      pcpul_size = static_size + reserved_size + dyn_size;
 +      pcpul_lpage_size = lpage_size;
 +      pcpul_nr_lpages = chunk_size / lpage_size;
 +
 +      /* allocate pointer array and alloc large pages */
 +      map_size = pcpul_nr_lpages * sizeof(pcpul_map[0]);
 +      pcpul_map = alloc_bootmem(map_size);
 +
 +      /* allocate all pages */
 +      for (i = 0; i < pcpul_nr_lpages; i++) {
 +              size_t offset = i * lpage_size;
 +              int first_unit = offset / unit_size;
 +              int last_unit = (offset + lpage_size - 1) / unit_size;
 +              void *ptr;
 +
 +              /* find out which cpu is mapped to this unit */
 +              for (unit = first_unit; unit <= last_unit; unit++)
 +                      if (pcpul_unit_to_cpu(unit, unit_map, &cpu))
 +                              goto found;
 +              continue;
 +      found:
 +              ptr = alloc_fn(cpu, lpage_size);
 +              if (!ptr) {
 +                      pr_warning("PERCPU: failed to allocate large page "
 +                                 "for cpu%u\n", cpu);
 +                      goto enomem;
 +              }
 +
 +              pcpul_map[i].ptr = ptr;
 +      }
 +
 +      /* return unused holes */
 +      for (unit = 0; unit < nr_units; unit++) {
 +              size_t start = unit * unit_size;
 +              size_t end = start + unit_size;
 +              size_t off, next;
 +
 +              /* don't free used part of occupied unit */
 +              if (pcpul_unit_to_cpu(unit, unit_map, NULL))
 +                      start += pcpul_size;
 +
 +              /* unit can span more than one page, punch the holes */
 +              for (off = start; off < end; off = next) {
 +                      void *ptr = pcpul_map[off / lpage_size].ptr;
 +                      next = min(roundup(off + 1, lpage_size), end);
 +                      if (ptr)
 +                              free_fn(ptr + off % lpage_size, next - off);
 +              }
 +      }
 +
 +      /* allocate address, map and copy */
 +      vm.flags = VM_ALLOC;
 +      vm.size = chunk_size;
 +      vm_area_register_early(&vm, unit_size);
 +
 +      for (i = 0; i < pcpul_nr_lpages; i++) {
 +              if (!pcpul_map[i].ptr)
 +                      continue;
 +              pcpul_map[i].map_addr = vm.addr + i * lpage_size;
 +              map_fn(pcpul_map[i].ptr, lpage_size, pcpul_map[i].map_addr);
 +      }
 +
 +      for_each_possible_cpu(cpu)
 +              memcpy(vm.addr + unit_map[cpu] * unit_size, __per_cpu_load,
 +                     static_size);
 +
 +      /* we're ready, commit */
 +      pr_info("PERCPU: Remapped at %p with large pages, static data "
 +              "%zu bytes\n", vm.addr, static_size);
 +
 +      ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
 +                                   unit_size, vm.addr, unit_map);
 +
 +      /*
 +       * Sort pcpul_map array for pcpu_lpage_remapped().  Unmapped
 +       * lpages are pushed to the end and trimmed.
 +       */
 +      for (i = 0; i < pcpul_nr_lpages - 1; i++)
 +              for (j = i + 1; j < pcpul_nr_lpages; j++) {
 +                      struct pcpul_ent tmp;
 +
 +                      if (!pcpul_map[j].ptr)
 +                              continue;
 +                      if (pcpul_map[i].ptr &&
 +                          pcpul_map[i].ptr < pcpul_map[j].ptr)
 +                              continue;
 +
 +                      tmp = pcpul_map[i];
 +                      pcpul_map[i] = pcpul_map[j];
 +                      pcpul_map[j] = tmp;
 +              }
 +
 +      while (pcpul_nr_lpages && !pcpul_map[pcpul_nr_lpages - 1].ptr)
 +              pcpul_nr_lpages--;
 +
 +      return ret;
 +
 +enomem:
 +      for (i = 0; i < pcpul_nr_lpages; i++)
 +              if (pcpul_map[i].ptr)
 +                      free_fn(pcpul_map[i].ptr, lpage_size);
 +      free_bootmem(__pa(pcpul_map), map_size);
 +      return -ENOMEM;
 +}
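A hedged sketch of the two-stage setup an arch might perform; PMD_SIZE stands in for the large page size, pcpul_cpu_distance is the distance callback sketched earlier, and lpage_alloc/lpage_free/lpage_map are placeholder callbacks rather than real kernel functions:

/* two-stage setup sketch; lpage_alloc/lpage_free/lpage_map are
 * placeholders and PMD_SIZE is assumed to be the large page size */
static int pcpul_unit_map[NR_CPUS] __initdata;

static ssize_t __init pcpul_setup(size_t static_size)
{
	ssize_t dyn_size = PERCPU_DYNAMIC_RESERVE;
	size_t unit_size;
	int nr_units;

	nr_units = pcpu_lpage_build_unit_map(static_size,
					     PERCPU_MODULE_RESERVE,
					     &dyn_size, &unit_size,
					     PMD_SIZE, pcpul_unit_map,
					     pcpul_cpu_distance);
	if (nr_units < 0)
		return nr_units;

	return pcpu_lpage_first_chunk(static_size, PERCPU_MODULE_RESERVE,
				      dyn_size, unit_size, PMD_SIZE,
				      pcpul_unit_map, nr_units,
				      lpage_alloc, lpage_free, lpage_map);
}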
 +
 +/**
 + * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area
 + * @kaddr: the kernel address in question
 + *
 + * Determine whether @kaddr falls in the pcpul recycled area.  This is
 + * used by pageattr to detect VM aliases and break up the pcpu large
 + * page mapping such that the same physical page is not mapped under
 + * different attributes.
 + *
 + * The recycled area is always at the tail of a partially used large
 + * page.
 + *
 + * RETURNS:
 + * Address of corresponding remapped pcpu address if match is found;
 + * otherwise, NULL.
 + */
 +void *pcpu_lpage_remapped(void *kaddr)
 +{
 +      unsigned long lpage_mask = pcpul_lpage_size - 1;
 +      void *lpage_addr = (void *)((unsigned long)kaddr & ~lpage_mask);
 +      unsigned long offset = (unsigned long)kaddr & lpage_mask;
 +      int left = 0, right = pcpul_nr_lpages - 1;
 +      int pos;
 +
 +      /* pcpul in use at all? */
 +      if (!pcpul_map)
 +              return NULL;
 +
 +      /* okay, perform binary search */
 +      while (left <= right) {
 +              pos = (left + right) / 2;
 +
 +              if (pcpul_map[pos].ptr < lpage_addr)
 +                      left = pos + 1;
 +              else if (pcpul_map[pos].ptr > lpage_addr)
 +                      right = pos - 1;
 +              else
 +                      return pcpul_map[pos].map_addr + offset;
 +      }
 +
 +      return NULL;
 +}
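The lookup itself is a plain binary search over the sorted map; a standalone sketch with an invented 2MB large page size and table contents:

#include <stdio.h>
#include <stdint.h>

#define LPAGE_SIZE	(2UL << 20)

struct ent { uintptr_t lpage; uintptr_t map_addr; };	/* sorted by lpage */

static uintptr_t remapped(const struct ent *map, int n, uintptr_t kaddr)
{
	uintptr_t base = kaddr & ~(LPAGE_SIZE - 1);
	uintptr_t off = kaddr & (LPAGE_SIZE - 1);
	int l = 0, r = n - 1;

	while (l <= r) {
		int m = (l + r) / 2;

		if (map[m].lpage < base)
			l = m + 1;
		else if (map[m].lpage > base)
			r = m - 1;
		else
			return map[m].map_addr + off;	/* alias found */
	}
	return 0;
}

int main(void)
{
	const struct ent map[] = {
		{ 0x40000000, 0xf0000000 },
		{ 0x40200000, 0xf0200000 },
	};

	/* prints 0xf0234567 */
	printf("%#lx\n", (unsigned long)remapped(map, 2, 0x40234567));
	return 0;
}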
 +#endif
 +
 +/*
 + * Generic percpu area setup.
 + *
 + * The embedding helper is used because its behavior closely resembles
 + * the original non-dynamic generic percpu area setup.  This is
 + * important because many archs have addressing restrictions and might
 + * fail if the percpu area is located far away from its previous
 + * location.  As an added bonus, in non-NUMA cases, embedding is
 + * generally a good idea TLB-wise because the percpu area can piggyback
 + * on the physical linear memory mapping, which uses large page
 + * mappings on applicable archs.
 + */
 +#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
 +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
 +EXPORT_SYMBOL(__per_cpu_offset);
 +
 +void __init setup_per_cpu_areas(void)
 +{
 +      size_t static_size = __per_cpu_end - __per_cpu_start;
 +      ssize_t unit_size;
 +      unsigned long delta;
 +      unsigned int cpu;
 +
 +      /*
 +       * Always reserve area for module percpu variables.  That's
 +       * what the legacy allocator did.
 +       */
 +      unit_size = pcpu_embed_first_chunk(static_size, PERCPU_MODULE_RESERVE,
 +                                         PERCPU_DYNAMIC_RESERVE);
 +      if (unit_size < 0)
 +              panic("Failed to initialize percpu areas.");
 +
 +      delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
 +      for_each_possible_cpu(cpu)
 +              __per_cpu_offset[cpu] = delta + cpu * unit_size;
  }
 +#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
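To make the offset arithmetic in the loop above concrete, a standalone sketch with made-up addresses; delta rebases the linker-provided static addresses onto the first chunk and each CPU's copy sits unit_size bytes after the previous one:

#include <stdio.h>

int main(void)
{
	unsigned long per_cpu_start = 0xc1600000UL;	/* __per_cpu_start */
	unsigned long base_addr     = 0xf8400000UL;	/* pcpu_base_addr  */
	unsigned long unit_size     = 0x10000UL;	/* 64k per cpu     */
	unsigned long delta = base_addr - per_cpu_start;
	unsigned int cpu;

	for (cpu = 0; cpu < 4; cpu++)
		printf("cpu%u offset = %#lx\n", cpu, delta + cpu * unit_size);
	return 0;
}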
diff --combined mm/slub.c
+++ b/mm/slub.c
@@@ -21,7 -21,6 +21,6 @@@
  #include <linux/kmemcheck.h>
  #include <linux/cpu.h>
  #include <linux/cpuset.h>
- #include <linux/kmemleak.h>
  #include <linux/mempolicy.h>
  #include <linux/ctype.h>
  #include <linux/debugobjects.h>
@@@ -2092,8 -2091,8 +2091,8 @@@ init_kmem_cache_node(struct kmem_cache_
   */
  #define NR_KMEM_CACHE_CPU 100
  
 -static DEFINE_PER_CPU(struct kmem_cache_cpu,
 -                              kmem_cache_cpu)[NR_KMEM_CACHE_CPU];
 +static DEFINE_PER_CPU(struct kmem_cache_cpu [NR_KMEM_CACHE_CPU],
 +                    kmem_cache_cpu);
  
  static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free);
  static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS);
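The per-cpu declaration change above moves the array bound into the type argument; a minimal sketch of the resulting idiom with a hypothetical variable:

/* hypothetical per-cpu array using the same idiom */
DEFINE_PER_CPU(int [4], demo_slots);

static int demo_read_slot0(unsigned int cpu)
{
	return per_cpu(demo_slots, cpu)[0];
}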
@@@ -2595,6 -2594,8 +2594,8 @@@ static inline int kmem_cache_close(stru
   */
  void kmem_cache_destroy(struct kmem_cache *s)
  {
+       if (s->flags & SLAB_DESTROY_BY_RCU)
+               rcu_barrier();
        down_write(&slub_lock);
        s->refcount--;
        if (!s->refcount) {
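The rcu_barrier() added above matters for caches created with SLAB_DESTROY_BY_RCU, whose objects may still be reached by lockless readers until a grace period passes; a hedged sketch of the pattern it protects, with hypothetical cache and module names:

#include <linux/module.h>
#include <linux/slab.h>

struct demo_obj { int id; };

static struct kmem_cache *demo_cachep;

static int __init demo_init(void)
{
	demo_cachep = kmem_cache_create("demo_obj", sizeof(struct demo_obj),
					0, SLAB_DESTROY_BY_RCU, NULL);
	return demo_cachep ? 0 : -ENOMEM;
}

static void __exit demo_exit(void)
{
	/* rcu_barrier() in kmem_cache_destroy() waits for pending RCU
	 * frees before the cache itself is torn down */
	kmem_cache_destroy(demo_cachep);
}

module_init(demo_init);
module_exit(demo_exit);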
@@@ -2833,13 -2834,15 +2834,15 @@@ EXPORT_SYMBOL(__kmalloc)
  static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
  {
        struct page *page;
+       void *ptr = NULL;
  
        flags |= __GFP_COMP | __GFP_NOTRACK;
        page = alloc_pages_node(node, flags, get_order(size));
        if (page)
-               return page_address(page);
-       else
-               return NULL;
+               ptr = page_address(page);
+       kmemleak_alloc(ptr, size, 1, flags);
+       return ptr;
  }
  
  #ifdef CONFIG_NUMA
@@@ -2924,6 -2927,7 +2927,7 @@@ void kfree(const void *x
        page = virt_to_head_page(x);
        if (unlikely(!PageSlab(page))) {
                BUG_ON(!PageCompound(page));
+               kmemleak_free(x);
                put_page(page);
                return;
        }