Merge branch 'percpu-for-linus' into percpu-for-next
author     Tejun Heo <tj@kernel.org>
Fri, 14 Aug 2009 05:41:02 +0000 (14:41 +0900)
committer  Tejun Heo <tj@kernel.org>
Fri, 14 Aug 2009 05:45:31 +0000 (14:45 +0900)
Conflicts:
arch/sparc/kernel/smp_64.c
arch/x86/kernel/cpu/perf_counter.c
arch/x86/kernel/setup_percpu.c
drivers/cpufreq/cpufreq_ondemand.c
mm/percpu.c

Conflicts in the core and arch percpu code mostly come from commit
ed78e1e078dd44249f88b1dd8c76dafb39567161, which substituted nr_cpu_ids
for many uses of num_possible_cpus().  As the for-next branch has moved
all the first chunk allocators into mm/percpu.c, those changes are
applied in mm/percpu.c instead of the arch code.
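
As an illustration only (not a hunk taken from this merge), the typical
resolution pattern looks like the sketch below.  The helper function is
hypothetical; nr_cpu_ids, num_possible_cpus(), PFN_ALIGN() and
alloc_bootmem() are the existing kernel symbols involved.  Sizing that
used to scale with num_possible_cpus() now scales with nr_cpu_ids so
that a sparse possible-CPU map is covered:

  #include <linux/bootmem.h>    /* alloc_bootmem() */
  #include <linux/cpumask.h>    /* nr_cpu_ids, num_possible_cpus() */
  #include <linux/init.h>       /* __init */
  #include <linux/pfn.h>        /* PFN_ALIGN() */

  /*
   * Hypothetical helper showing the conflict resolution pattern: the
   * per-cpu pointer array is sized by nr_cpu_ids (highest possible CPU
   * number plus one) rather than num_possible_cpus() (count of possible
   * CPUs); the two differ when the possible CPU map is sparse.
   */
  static void ** __init example_alloc_cpu_ptrs(size_t *ptrs_sizep)
  {
          size_t ptrs_size;

          /* before: PFN_ALIGN(num_possible_cpus() * sizeof(void *)) */
          ptrs_size = PFN_ALIGN(nr_cpu_ids * sizeof(void *));
          *ptrs_sizep = ptrs_size;
          return alloc_bootmem(ptrs_size);
  }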

Signed-off-by: Tejun Heo <tj@kernel.org>
22 files changed:
Makefile
arch/mn10300/kernel/vmlinux.lds.S
arch/sparc/kernel/smp_64.c
arch/x86/Kconfig
arch/x86/kernel/cpu/mcheck/mce.c
arch/x86/kernel/cpu/perf_counter.c
arch/x86/kernel/setup_percpu.c
arch/x86/kernel/vmlinux.lds.S
arch/x86/mm/pageattr.c
block/cfq-iosched.c
drivers/cpufreq/cpufreq_conservative.c
drivers/cpufreq/cpufreq_ondemand.c
drivers/xen/events.c
include/asm-generic/vmlinux.lds.h
init/main.c
kernel/module.c
kernel/perf_counter.c
kernel/sched.c
kernel/trace/trace_events.c
mm/page-writeback.c
mm/percpu.c
mm/slub.c

diff --combined Makefile
+++ b/Makefile
@@@ -1,7 -1,7 +1,7 @@@
  VERSION = 2
  PATCHLEVEL = 6
  SUBLEVEL = 31
- EXTRAVERSION = -rc1
+ EXTRAVERSION = -rc6
  NAME = Man-Eating Seals of Antiquity
  
  # *DOCUMENTATION*
@@@ -140,15 -140,13 +140,13 @@@ _all: module
  endif
  
  srctree               := $(if $(KBUILD_SRC),$(KBUILD_SRC),$(CURDIR))
- TOPDIR                := $(srctree)
- # FIXME - TOPDIR is obsolete, use srctree/objtree
  objtree               := $(CURDIR)
  src           := $(srctree)
  obj           := $(objtree)
  
  VPATH         := $(srctree)$(if $(KBUILD_EXTMOD),:$(KBUILD_EXTMOD))
  
- export srctree objtree VPATH TOPDIR
+ export srctree objtree VPATH
  
  
  # SUBARCH tells the usermode build what the underlying arch is.  That is set
@@@ -327,7 -325,7 +325,7 @@@ CHECKFLAGS     := -D__linux__ -Dlinux -
  MODFLAGS      = -DMODULE
  CFLAGS_MODULE   = $(MODFLAGS)
  AFLAGS_MODULE   = $(MODFLAGS)
 -LDFLAGS_MODULE  =
 +LDFLAGS_MODULE  = -T $(srctree)/scripts/module-common.lds
  CFLAGS_KERNEL =
  AFLAGS_KERNEL =
  CFLAGS_GCOV   = -fprofile-arcs -ftest-coverage
@@@ -344,7 -342,9 +342,9 @@@ KBUILD_CPPFLAGS := -D__KERNEL_
  
  KBUILD_CFLAGS   := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \
                   -fno-strict-aliasing -fno-common \
-                  -Werror-implicit-function-declaration
+                  -Werror-implicit-function-declaration \
+                  -Wno-format-security \
+                  -fno-delete-null-pointer-checks
  KBUILD_AFLAGS   := -D__ASSEMBLY__
  
  # Read KERNELRELEASE from include/config/kernel.release (if it exists)
@@@ -566,7 -566,7 +566,7 @@@ KBUILD_CFLAGS += $(call cc-option,-Wdec
  KBUILD_CFLAGS += $(call cc-option,-Wno-pointer-sign,)
  
  # disable invalid "can't wrap" optimizations for signed / pointers
- KBUILD_CFLAGS += $(call cc-option,-fwrapv)
+ KBUILD_CFLAGS += $(call cc-option,-fno-strict-overflow)
  
  # revert to pre-gcc-4.4 behaviour of .eh_frame
  KBUILD_CFLAGS += $(call cc-option,-fno-dwarf2-cfi-asm)
@@@ -61,7 -61,7 +61,7 @@@ SECTION
        _edata = .;             /* End of data section */
    }
  
-   .data.init_task : { INIT_TASK(THREAD_SIZE); }
+   .data.init_task : { INIT_TASK_DATA(THREAD_SIZE); }
  
    /* might get freed after init */
    . = ALIGN(PAGE_SIZE);
    __init_end = .;
    /* freed after init ends here */
  
-   BSS(4)
+   BSS_SECTION(0, PAGE_SIZE, 4)
  
    _end = . ;
  
    . = ALIGN(PAGE_SIZE);
    pg0 = .;
  
 -  /* Sections to be discarded */
 -  /DISCARD/ : {
 -      EXIT_CALL
 -      }
 -
    STABS_DEBUG
  
    DWARF_DEBUG
 +
 +  /* Sections to be discarded */
 +  DISCARDS
  }
@@@ -1415,6 -1415,19 +1415,6 @@@ static void * __init pcpu_alloc_bootmem
  #endif
  }
  
 -static size_t pcpur_size __initdata;
 -static void **pcpur_ptrs __initdata;
 -
 -static struct page * __init pcpur_get_page(unsigned int cpu, int pageno)
 -{
 -      size_t off = (size_t)pageno << PAGE_SHIFT;
 -
 -      if (off >= pcpur_size)
 -              return NULL;
 -
 -      return virt_to_page(pcpur_ptrs[cpu] + off);
 -}
 -
  #define PCPU_CHUNK_SIZE (4UL * 1024UL * 1024UL)
  
  static void __init pcpu_map_range(unsigned long start, unsigned long end,
@@@ -1478,31 -1491,30 +1478,31 @@@ void __init setup_per_cpu_areas(void
        size_t dyn_size, static_size = __per_cpu_end - __per_cpu_start;
        static struct vm_struct vm;
        unsigned long delta, cpu;
 -      size_t pcpu_unit_size;
 +      size_t size_sum, pcpu_unit_size;
        size_t ptrs_size;
 +      void **ptrs;
  
 -      pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
 -                             PERCPU_DYNAMIC_RESERVE);
 -      dyn_size = pcpur_size - static_size - PERCPU_MODULE_RESERVE;
 +      size_sum = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
 +                           PERCPU_DYNAMIC_RESERVE);
 +      dyn_size = size_sum - static_size - PERCPU_MODULE_RESERVE;
  
  
-       ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(ptrs[0]));
 -      ptrs_size = PFN_ALIGN(nr_cpu_ids * sizeof(pcpur_ptrs[0]));
 -      pcpur_ptrs = alloc_bootmem(ptrs_size);
++      ptrs_size = PFN_ALIGN(nr_cpu_ids * sizeof(ptrs[0]));
 +      ptrs = alloc_bootmem(ptrs_size);
  
        for_each_possible_cpu(cpu) {
 -              pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PCPU_CHUNK_SIZE,
 -                                                   PCPU_CHUNK_SIZE);
 +              ptrs[cpu] = pcpu_alloc_bootmem(cpu, PCPU_CHUNK_SIZE,
 +                                             PCPU_CHUNK_SIZE);
  
 -              free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size),
 -                           PCPU_CHUNK_SIZE - pcpur_size);
 +              free_bootmem(__pa(ptrs[cpu] + size_sum),
 +                           PCPU_CHUNK_SIZE - size_sum);
  
 -              memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size);
 +              memcpy(ptrs[cpu], __per_cpu_load, static_size);
        }
  
        /* allocate address and map */
        vm.flags = VM_ALLOC;
-       vm.size = num_possible_cpus() * PCPU_CHUNK_SIZE;
+       vm.size = nr_cpu_ids * PCPU_CHUNK_SIZE;
        vm_area_register_early(&vm, PCPU_CHUNK_SIZE);
  
        for_each_possible_cpu(cpu) {
  
                start += cpu * PCPU_CHUNK_SIZE;
                end = start + PCPU_CHUNK_SIZE;
 -              pcpu_map_range(start, end, virt_to_page(pcpur_ptrs[cpu]));
 +              pcpu_map_range(start, end, virt_to_page(ptrs[cpu]));
        }
  
 -      pcpu_unit_size = pcpu_setup_first_chunk(pcpur_get_page, static_size,
 +      pcpu_unit_size = pcpu_setup_first_chunk(static_size,
                                                PERCPU_MODULE_RESERVE, dyn_size,
                                                PCPU_CHUNK_SIZE, vm.addr, NULL);
  
 -      free_bootmem(__pa(pcpur_ptrs), ptrs_size);
 +      free_bootmem(__pa(ptrs), ptrs_size);
  
        delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
        for_each_possible_cpu(cpu) {
diff --combined arch/x86/Kconfig
@@@ -24,6 -24,7 +24,7 @@@ config X8
        select HAVE_UNSTABLE_SCHED_CLOCK
        select HAVE_IDE
        select HAVE_OPROFILE
+       select HAVE_PERF_COUNTERS if (!M386 && !M486)
        select HAVE_IOREMAP_PROT
        select HAVE_KPROBES
        select ARCH_WANT_OPTIONAL_GPIOLIB
@@@ -149,6 -150,9 +150,6 @@@ config ARCH_HAS_CACHE_LINE_SIZ
  config HAVE_SETUP_PER_CPU_AREA
        def_bool y
  
 -config HAVE_DYNAMIC_PER_CPU_AREA
 -      def_bool y
 -
  config HAVE_CPUMASK_OF_CPU_MAP
        def_bool X86_64_SMP
  
@@@ -739,7 -743,6 +740,6 @@@ config X86_UP_IOAPI
  config X86_LOCAL_APIC
        def_bool y
        depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC
-       select HAVE_PERF_COUNTERS if (!M386 && !M486)
  
  config X86_IO_APIC
        def_bool y
@@@ -1910,6 -1913,18 +1910,18 @@@ config DMAR_DEFAULT_O
          recommended you say N here while the DMAR code remains
          experimental.
  
+ config DMAR_BROKEN_GFX_WA
+       def_bool n
+       prompt "Workaround broken graphics drivers (going away soon)"
+       depends on DMAR
+       ---help---
+         Current Graphics drivers tend to use physical address
+         for DMA and avoid using DMA APIs. Setting this config
+         option permits the IOMMU driver to set a unity map for
+         all the OS-visible memory. Hence the driver can continue
+         to use physical addresses for DMA, at least until this
+         option is removed in the 2.6.32 kernel.
  config DMAR_FLOPPY_WA
        def_bool y
        depends on DMAR
@@@ -194,14 -194,14 +194,14 @@@ static void print_mce(struct mce *m
                       m->cs, m->ip);
                if (m->cs == __KERNEL_CS)
                        print_symbol("{%s}", m->ip);
-               printk("\n");
+               printk(KERN_CONT "\n");
        }
        printk(KERN_EMERG "TSC %llx ", m->tsc);
        if (m->addr)
-               printk("ADDR %llx ", m->addr);
+               printk(KERN_CONT "ADDR %llx ", m->addr);
        if (m->misc)
-               printk("MISC %llx ", m->misc);
-       printk("\n");
+               printk(KERN_CONT "MISC %llx ", m->misc);
+       printk(KERN_CONT "\n");
        printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
                        m->cpuvendor, m->cpuid, m->time, m->socketid,
                        m->apicid);
  
  static void print_mce_head(void)
  {
-       printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n");
+       printk(KERN_EMERG "\nHARDWARE ERROR\n");
  }
  
  static void print_mce_tail(void)
  {
        printk(KERN_EMERG "This is not a software problem!\n"
-              KERN_EMERG "Run through mcelog --ascii to decode and contact your hardware vendor\n");
+              "Run through mcelog --ascii to decode and contact your hardware vendor\n");
  }
  
  #define PANIC_TIMEOUT 5 /* 5 seconds */
@@@ -1091,7 -1091,7 +1091,7 @@@ void mce_log_therm_throt_event(__u64 st
   */
  static int check_interval = 5 * 60; /* 5 minutes */
  
 -static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
 +static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
  static DEFINE_PER_CPU(struct timer_list, mce_timer);
  
  static void mcheck_timer(unsigned long data)
         * Alert userspace if needed.  If we logged an MCE, reduce the
         * polling interval, otherwise increase the polling interval.
         */
 -      n = &__get_cpu_var(next_interval);
 +      n = &__get_cpu_var(mce_next_interval);
        if (mce_notify_irq())
                *n = max(*n/2, HZ/100);
        else
@@@ -1311,7 -1311,7 +1311,7 @@@ static void mce_cpu_features(struct cpu
  static void mce_init_timer(void)
  {
        struct timer_list *t = &__get_cpu_var(mce_timer);
 -      int *n = &__get_cpu_var(next_interval);
 +      int *n = &__get_cpu_var(mce_next_interval);
  
        if (mce_ignore_ce)
                return;
@@@ -1692,17 -1692,15 +1692,15 @@@ static ssize_t set_trigger(struct sys_d
                                const char *buf, size_t siz)
  {
        char *p;
-       int len;
  
        strncpy(mce_helper, buf, sizeof(mce_helper));
        mce_helper[sizeof(mce_helper)-1] = 0;
-       len = strlen(mce_helper);
        p = strchr(mce_helper, '\n');
  
-       if (*p)
+       if (p)
                *p = 0;
  
-       return len;
+       return strlen(mce_helper) + !!p;
  }
  
  static ssize_t set_ignore_ce(struct sys_device *s,
@@@ -1914,7 -1912,7 +1912,7 @@@ mce_cpu_callback(struct notifier_block 
        case CPU_DOWN_FAILED:
        case CPU_DOWN_FAILED_FROZEN:
                t->expires = round_jiffies(jiffies +
 -                                              __get_cpu_var(next_interval));
 +                                         __get_cpu_var(mce_next_interval));
                add_timer_on(t, cpu);
                smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
                break;
@@@ -55,6 -55,7 +55,7 @@@ struct x86_pmu 
        int             num_counters_fixed;
        int             counter_bits;
        u64             counter_mask;
+       int             apic;
        u64             max_period;
        u64             intel_ctrl;
  };
@@@ -66,6 -67,52 +67,52 @@@ static DEFINE_PER_CPU(struct cpu_hw_cou
  };
  
  /*
+  * Not sure about some of these
+  */
+ static const u64 p6_perfmon_event_map[] =
+ {
+   [PERF_COUNT_HW_CPU_CYCLES]          = 0x0079,
+   [PERF_COUNT_HW_INSTRUCTIONS]                = 0x00c0,
+   [PERF_COUNT_HW_CACHE_REFERENCES]    = 0x0f2e,
+   [PERF_COUNT_HW_CACHE_MISSES]                = 0x012e,
+   [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
+   [PERF_COUNT_HW_BRANCH_MISSES]               = 0x00c5,
+   [PERF_COUNT_HW_BUS_CYCLES]          = 0x0062,
+ };
+ static u64 p6_pmu_event_map(int event)
+ {
+       return p6_perfmon_event_map[event];
+ }
+ /*
+  * Counter setting that is specified not to count anything.
+  * We use this to effectively disable a counter.
+  *
+  * L2_RQSTS with 0 MESI unit mask.
+  */
+ #define P6_NOP_COUNTER                        0x0000002EULL
+ static u64 p6_pmu_raw_event(u64 event)
+ {
+ #define P6_EVNTSEL_EVENT_MASK         0x000000FFULL
+ #define P6_EVNTSEL_UNIT_MASK          0x0000FF00ULL
+ #define P6_EVNTSEL_EDGE_MASK          0x00040000ULL
+ #define P6_EVNTSEL_INV_MASK           0x00800000ULL
+ #define P6_EVNTSEL_COUNTER_MASK               0xFF000000ULL
+ #define P6_EVNTSEL_MASK                       \
+       (P6_EVNTSEL_EVENT_MASK |        \
+        P6_EVNTSEL_UNIT_MASK  |        \
+        P6_EVNTSEL_EDGE_MASK  |        \
+        P6_EVNTSEL_INV_MASK   |        \
+        P6_EVNTSEL_COUNTER_MASK)
+       return event & P6_EVNTSEL_MASK;
+ }
+ /*
   * Intel PerfMon v3. Used on Core2 and later.
   */
  static const u64 intel_perfmon_event_map[] =
@@@ -567,6 -614,7 +614,7 @@@ static DEFINE_MUTEX(pmc_reserve_mutex)
  
  static bool reserve_pmc_hardware(void)
  {
+ #ifdef CONFIG_X86_LOCAL_APIC
        int i;
  
        if (nmi_watchdog == NMI_LOCAL_APIC)
                if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
                        goto eventsel_fail;
        }
+ #endif
  
        return true;
  
+ #ifdef CONFIG_X86_LOCAL_APIC
  eventsel_fail:
        for (i--; i >= 0; i--)
                release_evntsel_nmi(x86_pmu.eventsel + i);
@@@ -598,10 -648,12 +648,12 @@@ perfctr_fail
                enable_lapic_nmi_watchdog();
  
        return false;
+ #endif
  }
  
  static void release_pmc_hardware(void)
  {
+ #ifdef CONFIG_X86_LOCAL_APIC
        int i;
  
        for (i = 0; i < x86_pmu.num_counters; i++) {
  
        if (nmi_watchdog == NMI_LOCAL_APIC)
                enable_lapic_nmi_watchdog();
+ #endif
  }
  
  static void hw_perf_counter_destroy(struct perf_counter *counter)
@@@ -666,6 -719,7 +719,7 @@@ static int __hw_perf_counter_init(struc
  {
        struct perf_counter_attr *attr = &counter->attr;
        struct hw_perf_counter *hwc = &counter->hw;
+       u64 config;
        int err;
  
        if (!x86_pmu_initialized())
                hwc->sample_period = x86_pmu.max_period;
                hwc->last_period = hwc->sample_period;
                atomic64_set(&hwc->period_left, hwc->sample_period);
+       } else {
+               /*
+                * If we have a PMU initialized but no APIC
+                * interrupts, we cannot sample hardware
+                * counters (user-space has to fall back and
+                * sample via a hrtimer based software counter):
+                */
+               if (!x86_pmu.apic)
+                       return -EOPNOTSUPP;
        }
  
        counter->destroy = hw_perf_counter_destroy;
  
        if (attr->config >= x86_pmu.max_events)
                return -EINVAL;
        /*
         * The generic map:
         */
-       hwc->config |= x86_pmu.event_map(attr->config);
+       config = x86_pmu.event_map(attr->config);
+       if (config == 0)
+               return -ENOENT;
+       if (config == -1LL)
+               return -EINVAL;
+       hwc->config |= config;
  
        return 0;
  }
  
+ static void p6_pmu_disable_all(void)
+ {
+       struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+       u64 val;
+       if (!cpuc->enabled)
+               return;
+       cpuc->enabled = 0;
+       barrier();
+       /* p6 only has one enable register */
+       rdmsrl(MSR_P6_EVNTSEL0, val);
+       val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
+       wrmsrl(MSR_P6_EVNTSEL0, val);
+ }
  static void intel_pmu_disable_all(void)
  {
        wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
@@@ -767,6 -856,23 +856,23 @@@ void hw_perf_disable(void
        return x86_pmu.disable_all();
  }
  
+ static void p6_pmu_enable_all(void)
+ {
+       struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+       unsigned long val;
+       if (cpuc->enabled)
+               return;
+       cpuc->enabled = 1;
+       barrier();
+       /* p6 only has one enable register */
+       rdmsrl(MSR_P6_EVNTSEL0, val);
+       val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+       wrmsrl(MSR_P6_EVNTSEL0, val);
+ }
  static void intel_pmu_enable_all(void)
  {
        wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
@@@ -784,13 -890,13 +890,13 @@@ static void amd_pmu_enable_all(void
        barrier();
  
        for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+               struct perf_counter *counter = cpuc->counters[idx];
                u64 val;
  
                if (!test_bit(idx, cpuc->active_mask))
                        continue;
-               rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
-               if (val & ARCH_PERFMON_EVENTSEL0_ENABLE)
-                       continue;
+               val = counter->hw.config;
                val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
                wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
        }
@@@ -819,16 -925,13 +925,13 @@@ static inline void intel_pmu_ack_status
  
  static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
  {
-       int err;
-       err = checking_wrmsrl(hwc->config_base + idx,
+       (void)checking_wrmsrl(hwc->config_base + idx,
                              hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
  }
  
  static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
  {
-       int err;
-       err = checking_wrmsrl(hwc->config_base + idx,
-                             hwc->config);
+       (void)checking_wrmsrl(hwc->config_base + idx, hwc->config);
  }
  
  static inline void
@@@ -836,13 -939,24 +939,24 @@@ intel_pmu_disable_fixed(struct hw_perf_
  {
        int idx = __idx - X86_PMC_IDX_FIXED;
        u64 ctrl_val, mask;
-       int err;
  
        mask = 0xfULL << (idx * 4);
  
        rdmsrl(hwc->config_base, ctrl_val);
        ctrl_val &= ~mask;
-       err = checking_wrmsrl(hwc->config_base, ctrl_val);
+       (void)checking_wrmsrl(hwc->config_base, ctrl_val);
+ }
+ static inline void
+ p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
+ {
+       struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+       u64 val = P6_NOP_COUNTER;
+       if (cpuc->enabled)
+               val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+       (void)checking_wrmsrl(hwc->config_base + idx, val);
  }
  
  static inline void
@@@ -862,7 -976,7 +976,7 @@@ amd_pmu_disable_counter(struct hw_perf_
        x86_pmu_disable_counter(hwc, idx);
  }
  
 -static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
 +static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
  
  /*
   * Set the next IRQ period, based on the hwc->period_left value.
@@@ -901,7 -1015,7 +1015,7 @@@ x86_perf_counter_set_period(struct perf
        if (left > x86_pmu.max_period)
                left = x86_pmu.max_period;
  
 -      per_cpu(prev_left[idx], smp_processor_id()) = left;
 +      per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
  
        /*
         * The hw counter starts counting from this counter offset,
@@@ -943,6 -1057,19 +1057,19 @@@ intel_pmu_enable_fixed(struct hw_perf_c
        err = checking_wrmsrl(hwc->config_base, ctrl_val);
  }
  
+ static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
+ {
+       struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+       u64 val;
+       val = hwc->config;
+       if (cpuc->enabled)
+               val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+       (void)checking_wrmsrl(hwc->config_base + idx, val);
+ }
  static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
  {
        if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
@@@ -959,8 -1086,6 +1086,6 @@@ static void amd_pmu_enable_counter(stru
  
        if (cpuc->enabled)
                x86_pmu_enable_counter(hwc, idx);
-       else
-               x86_pmu_disable_counter(hwc, idx);
  }
  
  static int
@@@ -1086,7 -1211,7 +1211,7 @@@ void perf_counter_print_debug(void
                rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
                rdmsrl(x86_pmu.perfctr  + idx, pmc_count);
  
 -              prev_left = per_cpu(prev_left[idx], cpu);
 +              prev_left = per_cpu(pmc_prev_left[idx], cpu);
  
                pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
                        cpu, idx, pmc_ctrl);
@@@ -1176,6 -1301,49 +1301,49 @@@ static void intel_pmu_reset(void
        local_irq_restore(flags);
  }
  
+ static int p6_pmu_handle_irq(struct pt_regs *regs)
+ {
+       struct perf_sample_data data;
+       struct cpu_hw_counters *cpuc;
+       struct perf_counter *counter;
+       struct hw_perf_counter *hwc;
+       int idx, handled = 0;
+       u64 val;
+       data.regs = regs;
+       data.addr = 0;
+       cpuc = &__get_cpu_var(cpu_hw_counters);
+       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+               if (!test_bit(idx, cpuc->active_mask))
+                       continue;
+               counter = cpuc->counters[idx];
+               hwc = &counter->hw;
+               val = x86_perf_counter_update(counter, hwc, idx);
+               if (val & (1ULL << (x86_pmu.counter_bits - 1)))
+                       continue;
+               /*
+                * counter overflow
+                */
+               handled         = 1;
+               data.period     = counter->hw.last_period;
+               if (!x86_perf_counter_set_period(counter, hwc, idx))
+                       continue;
+               if (perf_counter_overflow(counter, 1, &data))
+                       p6_pmu_disable_counter(hwc, idx);
+       }
+       if (handled)
+               inc_irq_stat(apic_perf_irqs);
+       return handled;
+ }
  
  /*
   * This handler is triggered by the local APIC, so the APIC IRQ handling
@@@ -1185,14 -1353,13 +1353,13 @@@ static int intel_pmu_handle_irq(struct 
  {
        struct perf_sample_data data;
        struct cpu_hw_counters *cpuc;
-       int bit, cpu, loops;
+       int bit, loops;
        u64 ack, status;
  
        data.regs = regs;
        data.addr = 0;
  
-       cpu = smp_processor_id();
-       cpuc = &per_cpu(cpu_hw_counters, cpu);
+       cpuc = &__get_cpu_var(cpu_hw_counters);
  
        perf_disable();
        status = intel_pmu_get_status();
@@@ -1249,14 -1416,13 +1416,13 @@@ static int amd_pmu_handle_irq(struct pt
        struct cpu_hw_counters *cpuc;
        struct perf_counter *counter;
        struct hw_perf_counter *hwc;
-       int cpu, idx, handled = 0;
+       int idx, handled = 0;
        u64 val;
  
        data.regs = regs;
        data.addr = 0;
  
-       cpu = smp_processor_id();
-       cpuc = &per_cpu(cpu_hw_counters, cpu);
+       cpuc = &__get_cpu_var(cpu_hw_counters);
  
        for (idx = 0; idx < x86_pmu.num_counters; idx++) {
                if (!test_bit(idx, cpuc->active_mask))
@@@ -1299,18 -1465,22 +1465,22 @@@ void smp_perf_pending_interrupt(struct 
  
  void set_perf_counter_pending(void)
  {
+ #ifdef CONFIG_X86_LOCAL_APIC
        apic->send_IPI_self(LOCAL_PENDING_VECTOR);
+ #endif
  }
  
  void perf_counters_lapic_init(void)
  {
-       if (!x86_pmu_initialized())
+ #ifdef CONFIG_X86_LOCAL_APIC
+       if (!x86_pmu.apic || !x86_pmu_initialized())
                return;
  
        /*
         * Always use NMI for PMU
         */
        apic_write(APIC_LVTPC, APIC_DM_NMI);
+ #endif
  }
  
  static int __kprobes
@@@ -1334,7 -1504,9 +1504,9 @@@ perf_counter_nmi_handler(struct notifie
  
        regs = args->regs;
  
+ #ifdef CONFIG_X86_LOCAL_APIC
        apic_write(APIC_LVTPC, APIC_DM_NMI);
+ #endif
        /*
         * Can't rely on the handled return value to say it was our NMI, two
         * counters could trigger 'simultaneously' raising two back-to-back NMIs.
@@@ -1353,6 -1525,33 +1525,33 @@@ static __read_mostly struct notifier_bl
        .priority               = 1
  };
  
+ static struct x86_pmu p6_pmu = {
+       .name                   = "p6",
+       .handle_irq             = p6_pmu_handle_irq,
+       .disable_all            = p6_pmu_disable_all,
+       .enable_all             = p6_pmu_enable_all,
+       .enable                 = p6_pmu_enable_counter,
+       .disable                = p6_pmu_disable_counter,
+       .eventsel               = MSR_P6_EVNTSEL0,
+       .perfctr                = MSR_P6_PERFCTR0,
+       .event_map              = p6_pmu_event_map,
+       .raw_event              = p6_pmu_raw_event,
+       .max_events             = ARRAY_SIZE(p6_perfmon_event_map),
+       .apic                   = 1,
+       .max_period             = (1ULL << 31) - 1,
+       .version                = 0,
+       .num_counters           = 2,
+       /*
+        * Counters have 40 bits implemented. However they are designed such
+        * that bits [32-39] are sign extensions of bit 31. As such the
+        * effective width of a counter for P6-like PMU is 32 bits only.
+        *
+        * See IA-32 Intel Architecture Software developer manual Vol 3B
+        */
+       .counter_bits           = 32,
+       .counter_mask           = (1ULL << 32) - 1,
+ };
  static struct x86_pmu intel_pmu = {
        .name                   = "Intel",
        .handle_irq             = intel_pmu_handle_irq,
        .event_map              = intel_pmu_event_map,
        .raw_event              = intel_pmu_raw_event,
        .max_events             = ARRAY_SIZE(intel_perfmon_event_map),
+       .apic                   = 1,
        /*
         * Intel PMCs cannot be accessed sanely above 32 bit width,
         * so we install an artificial 1<<31 period regardless of
@@@ -1388,10 -1588,43 +1588,43 @@@ static struct x86_pmu amd_pmu = 
        .num_counters           = 4,
        .counter_bits           = 48,
        .counter_mask           = (1ULL << 48) - 1,
+       .apic                   = 1,
        /* use highest bit to detect overflow */
        .max_period             = (1ULL << 47) - 1,
  };
  
+ static int p6_pmu_init(void)
+ {
+       switch (boot_cpu_data.x86_model) {
+       case 1:
+       case 3:  /* Pentium Pro */
+       case 5:
+       case 6:  /* Pentium II */
+       case 7:
+       case 8:
+       case 11: /* Pentium III */
+               break;
+       case 9:
+       case 13:
+               /* Pentium M */
+               break;
+       default:
+               pr_cont("unsupported p6 CPU model %d ",
+                       boot_cpu_data.x86_model);
+               return -ENODEV;
+       }
+       x86_pmu = p6_pmu;
+       if (!cpu_has_apic) {
+               pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
+               pr_info("no hardware sampling interrupt available.\n");
+               x86_pmu.apic = 0;
+       }
+       return 0;
+ }
  static int intel_pmu_init(void)
  {
        union cpuid10_edx edx;
        unsigned int ebx;
        int version;
  
-       if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+       if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
+               /* check for P6 processor family */
+          if (boot_cpu_data.x86 == 6) {
+               return p6_pmu_init();
+          } else {
                return -ENODEV;
+          }
+       }
  
        /*
         * Check whether the Architectural PerfMon supports
@@@ -1559,8 -1798,9 +1798,9 @@@ void callchain_store(struct perf_callch
                entry->ip[entry->nr++] = ip;
  }
  
 -static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
 -static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);
 +static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
 +static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
+ static DEFINE_PER_CPU(int, in_nmi_frame);
  
  
  static void
@@@ -1576,7 -1816,9 +1816,9 @@@ static void backtrace_warning(void *dat
  
  static int backtrace_stack(void *data, char *name)
  {
-       /* Process all stacks: */
+       per_cpu(in_nmi_frame, smp_processor_id()) =
+                       x86_is_stack_id(NMI_STACK, name);
        return 0;
  }
  
@@@ -1584,6 -1826,9 +1826,9 @@@ static void backtrace_address(void *dat
  {
        struct perf_callchain_entry *entry = data;
  
+       if (per_cpu(in_nmi_frame, smp_processor_id()))
+               return;
        if (reliable)
                callchain_store(entry, addr);
  }
@@@ -1707,9 -1952,9 +1952,9 @@@ struct perf_callchain_entry *perf_callc
        struct perf_callchain_entry *entry;
  
        if (in_nmi())
 -              entry = &__get_cpu_var(nmi_entry);
 +              entry = &__get_cpu_var(pmc_nmi_entry);
        else
 -              entry = &__get_cpu_var(irq_entry);
 +              entry = &__get_cpu_var(pmc_irq_entry);
  
        entry->nr = 0;
  
@@@ -124,51 -124,60 +124,51 @@@ static void * __init pcpu_alloc_bootmem
  }
  
  /*
 - * Large page remap allocator
 - *
 - * This allocator uses PMD page as unit.  A PMD page is allocated for
 - * each cpu and each is remapped into vmalloc area using PMD mapping.
 - * As PMD page is quite large, only part of it is used for the first
 - * chunk.  Unused part is returned to the bootmem allocator.
 - *
 - * So, the PMD pages are mapped twice - once to the physical mapping
 - * and to the vmalloc area for the first percpu chunk.  The double
 - * mapping does add one more PMD TLB entry pressure but still is much
 - * better than only using 4k mappings while still being NUMA friendly.
 + * Helpers for first chunk memory allocation
   */
 -#ifdef CONFIG_NEED_MULTIPLE_NODES
 -struct pcpul_ent {
 -      unsigned int    cpu;
 -      void            *ptr;
 -};
 +static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size)
 +{
 +      return pcpu_alloc_bootmem(cpu, size, size);
 +}
  
 -static size_t pcpul_size;
 -static struct pcpul_ent *pcpul_map;
 -static struct vm_struct pcpul_vm;
 +static void __init pcpu_fc_free(void *ptr, size_t size)
 +{
 +      free_bootmem(__pa(ptr), size);
 +}
  
 -static struct page * __init pcpul_get_page(unsigned int cpu, int pageno)
 +/*
 + * Large page remapping allocator
 + */
 +#ifdef CONFIG_NEED_MULTIPLE_NODES
 +static void __init pcpul_map(void *ptr, size_t size, void *addr)
  {
 -      size_t off = (size_t)pageno << PAGE_SHIFT;
 +      pmd_t *pmd, pmd_v;
  
 -      if (off >= pcpul_size)
 -              return NULL;
 +      pmd = populate_extra_pmd((unsigned long)addr);
 +      pmd_v = pfn_pmd(page_to_pfn(virt_to_page(ptr)), PAGE_KERNEL_LARGE);
 +      set_pmd(pmd, pmd_v);
 +}
  
 -      return virt_to_page(pcpul_map[cpu].ptr + off);
 +static int pcpu_lpage_cpu_distance(unsigned int from, unsigned int to)
 +{
 +      if (early_cpu_to_node(from) == early_cpu_to_node(to))
 +              return LOCAL_DISTANCE;
 +      else
 +              return REMOTE_DISTANCE;
  }
  
  static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
  {
 -      size_t map_size, dyn_size;
 -      unsigned int cpu;
 -      int i, j;
 +      size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
 +      size_t dyn_size = reserve - PERCPU_FIRST_CHUNK_RESERVE;
 +      size_t unit_map_size, unit_size;
 +      int *unit_map;
 +      int nr_units;
        ssize_t ret;
  
 -      if (!chosen) {
 -              size_t vm_size = VMALLOC_END - VMALLOC_START;
 -              size_t tot_size = nr_cpu_ids * PMD_SIZE;
 -
 -              /* on non-NUMA, embedding is better */
 -              if (!pcpu_need_numa())
 -                      return -EINVAL;
 -
 -              /* don't consume more than 20% of vmalloc area */
 -              if (tot_size > vm_size / 5) {
 -                      pr_info("PERCPU: too large chunk size %zuMB for "
 -                              "large page remap\n", tot_size >> 20);
 -                      return -EINVAL;
 -              }
 -      }
 +      /* on non-NUMA, embedding is better */
 +      if (!chosen && !pcpu_need_numa())
 +              return -EINVAL;
  
        /* need PSE */
        if (!cpu_has_pse) {
                return -EINVAL;
        }
  
 -      /*
 -       * Currently supports only single page.  Supporting multiple
 -       * pages won't be too difficult if it ever becomes necessary.
 -       */
 -      pcpul_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
 -                             PERCPU_DYNAMIC_RESERVE);
 -      if (pcpul_size > PMD_SIZE) {
 -              pr_warning("PERCPU: static data is larger than large page, "
 -                         "can't use large page\n");
 -              return -EINVAL;
 -      }
 -      dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
 -
 -      /* allocate pointer array and alloc large pages */
 -      map_size = PFN_ALIGN(nr_cpu_ids * sizeof(pcpul_map[0]));
 -      pcpul_map = alloc_bootmem(map_size);
 -
 -      for_each_possible_cpu(cpu) {
 -              pcpul_map[cpu].cpu = cpu;
 -              pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE,
 -                                                      PMD_SIZE);
 -              if (!pcpul_map[cpu].ptr) {
 -                      pr_warning("PERCPU: failed to allocate large page "
 -                                 "for cpu%u\n", cpu);
 -                      goto enomem;
 -              }
 -
 -              /*
 -               * Only use pcpul_size bytes and give back the rest.
 -               *
 -               * Ingo: The 2MB up-rounding bootmem is needed to make
 -               * sure the partial 2MB page is still fully RAM - it's
 -               * not well-specified to have a PAT-incompatible area
 -               * (unmapped RAM, device memory, etc.) in that hole.
 -               */
 -              free_bootmem(__pa(pcpul_map[cpu].ptr + pcpul_size),
 -                           PMD_SIZE - pcpul_size);
 -
 -              memcpy(pcpul_map[cpu].ptr, __per_cpu_load, static_size);
 +      /* allocate and build unit_map */
-       unit_map_size = num_possible_cpus() * sizeof(int);
++      unit_map_size = nr_cpu_ids * sizeof(int);
 +      unit_map = alloc_bootmem_nopanic(unit_map_size);
 +      if (!unit_map) {
 +              pr_warning("PERCPU: failed to allocate unit_map\n");
 +              return -ENOMEM;
        }
  
 -      /* allocate address and map */
 -      pcpul_vm.flags = VM_ALLOC;
 -      pcpul_vm.size = nr_cpu_ids * PMD_SIZE;
 -      vm_area_register_early(&pcpul_vm, PMD_SIZE);
 -
 -      for_each_possible_cpu(cpu) {
 -              pmd_t *pmd, pmd_v;
 -
 -              pmd = populate_extra_pmd((unsigned long)pcpul_vm.addr +
 -                                       cpu * PMD_SIZE);
 -              pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpul_map[cpu].ptr)),
 -                              PAGE_KERNEL_LARGE);
 -              set_pmd(pmd, pmd_v);
 +      ret = pcpu_lpage_build_unit_map(static_size,
 +                                      PERCPU_FIRST_CHUNK_RESERVE,
 +                                      &dyn_size, &unit_size, PMD_SIZE,
 +                                      unit_map, pcpu_lpage_cpu_distance);
 +      if (ret < 0) {
 +              pr_warning("PERCPU: failed to build unit_map\n");
 +              goto out_free;
        }
 +      nr_units = ret;
  
 -      /* we're ready, commit */
 -      pr_info("PERCPU: Remapped at %p with large pages, static data "
 -              "%zu bytes\n", pcpul_vm.addr, static_size);
 -
 -      ret = pcpu_setup_first_chunk(pcpul_get_page, static_size,
 -                                   PERCPU_FIRST_CHUNK_RESERVE, dyn_size,
 -                                   PMD_SIZE, pcpul_vm.addr, NULL);
 -
 -      /* sort pcpul_map array for pcpu_lpage_remapped() */
 -      for (i = 0; i < nr_cpu_ids - 1; i++)
 -              for (j = i + 1; j < nr_cpu_ids; j++)
 -                      if (pcpul_map[i].ptr > pcpul_map[j].ptr) {
 -                              struct pcpul_ent tmp = pcpul_map[i];
 -                              pcpul_map[i] = pcpul_map[j];
 -                              pcpul_map[j] = tmp;
 -                      }
 -
 -      return ret;
 -
 -enomem:
 -      for_each_possible_cpu(cpu)
 -              if (pcpul_map[cpu].ptr)
 -                      free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size);
 -      free_bootmem(__pa(pcpul_map), map_size);
 -      return -ENOMEM;
 -}
 +      /* do the parameters look okay? */
 +      if (!chosen) {
 +              size_t vm_size = VMALLOC_END - VMALLOC_START;
 +              size_t tot_size = nr_units * unit_size;
  
 -/**
 - * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area
 - * @kaddr: the kernel address in question
 - *
 - * Determine whether @kaddr falls in the pcpul recycled area.  This is
 - * used by pageattr to detect VM aliases and break up the pcpu PMD
 - * mapping such that the same physical page is not mapped under
 - * different attributes.
 - *
 - * The recycled area is always at the tail of a partially used PMD
 - * page.
 - *
 - * RETURNS:
 - * Address of corresponding remapped pcpu address if match is found;
 - * otherwise, NULL.
 - */
 -void *pcpu_lpage_remapped(void *kaddr)
 -{
 -      void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK);
 -      unsigned long offset = (unsigned long)kaddr & ~PMD_MASK;
 -      int left = 0, right = nr_cpu_ids - 1;
 -      int pos;
 -
 -      /* pcpul in use at all? */
 -      if (!pcpul_map)
 -              return NULL;
 -
 -      /* okay, perform binary search */
 -      while (left <= right) {
 -              pos = (left + right) / 2;
 -
 -              if (pcpul_map[pos].ptr < pmd_addr)
 -                      left = pos + 1;
 -              else if (pcpul_map[pos].ptr > pmd_addr)
 -                      right = pos - 1;
 -              else {
 -                      /* it shouldn't be in the area for the first chunk */
 -                      WARN_ON(offset < pcpul_size);
 -
 -                      return pcpul_vm.addr +
 -                              pcpul_map[pos].cpu * PMD_SIZE + offset;
 +              /* don't consume more than 20% of vmalloc area */
 +              if (tot_size > vm_size / 5) {
 +                      pr_info("PERCPU: too large chunk size %zuMB for "
 +                              "large page remap\n", tot_size >> 20);
 +                      ret = -EINVAL;
 +                      goto out_free;
                }
        }
  
 -      return NULL;
 +      ret = pcpu_lpage_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
 +                                   dyn_size, unit_size, PMD_SIZE,
 +                                   unit_map, nr_units,
 +                                   pcpu_fc_alloc, pcpu_fc_free, pcpul_map);
 +out_free:
 +      if (ret < 0)
 +              free_bootmem(__pa(unit_map), unit_map_size);
 +      return ret;
  }
  #else
  static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
@@@ -245,15 -342,26 +245,15 @@@ static ssize_t __init setup_pcpu_embed(
                return -EINVAL;
  
        return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
 -                                    reserve - PERCPU_FIRST_CHUNK_RESERVE, -1);
 +                                    reserve - PERCPU_FIRST_CHUNK_RESERVE);
  }
  
  /*
 - * 4k page allocator
 + * 4k allocator
   *
 - * This is the basic allocator.  Static percpu area is allocated
 - * page-by-page and most of initialization is done by the generic
 - * setup function.
 + * Boring fallback 4k allocator.  This allocator puts more pressure on
 + * PTE TLBs but other than that behaves nicely on both UMA and NUMA.
   */
 -static struct page **pcpu4k_pages __initdata;
 -static int pcpu4k_nr_static_pages __initdata;
 -
 -static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
 -{
 -      if (pageno < pcpu4k_nr_static_pages)
 -              return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno];
 -      return NULL;
 -}
 -
  static void __init pcpu4k_populate_pte(unsigned long addr)
  {
        populate_extra_pte(addr);
  
  static ssize_t __init setup_pcpu_4k(size_t static_size)
  {
 -      size_t pages_size;
 -      unsigned int cpu;
 -      int i, j;
 -      ssize_t ret;
 -
 -      pcpu4k_nr_static_pages = PFN_UP(static_size);
 -
 -      /* unaligned allocations can't be freed, round up to page size */
 -      pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * nr_cpu_ids
 -                             * sizeof(pcpu4k_pages[0]));
 -      pcpu4k_pages = alloc_bootmem(pages_size);
 -
 -      /* allocate and copy */
 -      j = 0;
 -      for_each_possible_cpu(cpu)
 -              for (i = 0; i < pcpu4k_nr_static_pages; i++) {
 -                      void *ptr;
 -
 -                      ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE);
 -                      if (!ptr) {
 -                              pr_warning("PERCPU: failed to allocate "
 -                                         "4k page for cpu%u\n", cpu);
 -                              goto enomem;
 -                      }
 -
 -                      memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE);
 -                      pcpu4k_pages[j++] = virt_to_page(ptr);
 -              }
 -
 -      /* we're ready, commit */
 -      pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
 -              pcpu4k_nr_static_pages, static_size);
 -
 -      ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size,
 -                                   PERCPU_FIRST_CHUNK_RESERVE, -1,
 -                                   -1, NULL, pcpu4k_populate_pte);
 -      goto out_free_ar;
 -
 -enomem:
 -      while (--j >= 0)
 -              free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE);
 -      ret = -ENOMEM;
 -out_free_ar:
 -      free_bootmem(__pa(pcpu4k_pages), pages_size);
 -      return ret;
 +      return pcpu_4k_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
 +                                 pcpu_fc_alloc, pcpu_fc_free,
 +                                 pcpu4k_populate_pte);
  }
  
  /* for explicit first chunk allocator selection */
@@@ -336,8 -486,7 +336,8 @@@ void __init setup_per_cpu_areas(void
        /* alrighty, percpu areas up and running */
        delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
        for_each_possible_cpu(cpu) {
 -              per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size;
 +              per_cpu_offset(cpu) =
 +                      delta + pcpu_unit_map[cpu] * pcpu_unit_size;
                per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
                per_cpu(cpu_number, cpu) = cpu;
                setup_percpu_segment(cpu);
@@@ -112,11 -112,6 +112,6 @@@ SECTION
                _sdata = .;
                DATA_DATA
                CONSTRUCTORS
- #ifdef CONFIG_X86_64
-               /* End of data section */
-               _edata = .;
- #endif
        } :data
  
  #ifdef CONFIG_X86_32
        .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
                *(.data.read_mostly)
  
- #ifdef CONFIG_X86_32
                /* End of data section */
                _edata = .;
- #endif
        }
  
  #ifdef CONFIG_X86_64
                _end = .;
        }
  
 -      /* Sections to be discarded */
 -      /DISCARD/ : {
 -              *(.exitcall.exit)
 -              *(.eh_frame)
 -              *(.discard)
 -      }
 -
          STABS_DEBUG
          DWARF_DEBUG
 +
 +      /* Sections to be discarded */
 +      DISCARDS
 +      /DISCARD/ : { *(.eh_frame) }
  }
  
  
  #ifdef CONFIG_X86_32
- ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
-         "kernel image bigger than KERNEL_IMAGE_SIZE")
+ . = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
+          "kernel image bigger than KERNEL_IMAGE_SIZE");
  #else
  /*
   * Per-cpu symbols which need to be offset from __per_cpu_load
@@@ -411,12 -407,12 +404,12 @@@ INIT_PER_CPU(irq_stack_union)
  /*
   * Build-time check on the image size:
   */
- ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
-       "kernel image bigger than KERNEL_IMAGE_SIZE")
+ . = ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
+          "kernel image bigger than KERNEL_IMAGE_SIZE");
  
  #ifdef CONFIG_SMP
- ASSERT((per_cpu__irq_stack_union == 0),
-         "irq_stack_union is not at start of per-cpu area");
+ . = ASSERT((per_cpu__irq_stack_union == 0),
+            "irq_stack_union is not at start of per-cpu area");
  #endif
  
  #endif /* CONFIG_X86_32 */
  #ifdef CONFIG_KEXEC
  #include <asm/kexec.h>
  
- ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
-        "kexec control code size is too big")
+ . = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
+            "kexec control code size is too big");
  #endif
  
diff --combined arch/x86/mm/pageattr.c
@@@ -12,7 -12,6 +12,7 @@@
  #include <linux/seq_file.h>
  #include <linux/debugfs.h>
  #include <linux/pfn.h>
 +#include <linux/percpu.h>
  
  #include <asm/e820.h>
  #include <asm/processor.h>
@@@ -592,9 -591,12 +592,12 @@@ static int __change_page_attr(struct cp
        unsigned int level;
        pte_t *kpte, old_pte;
  
-       if (cpa->flags & CPA_PAGES_ARRAY)
-               address = (unsigned long)page_address(cpa->pages[cpa->curpage]);
-       else if (cpa->flags & CPA_ARRAY)
+       if (cpa->flags & CPA_PAGES_ARRAY) {
+               struct page *page = cpa->pages[cpa->curpage];
+               if (unlikely(PageHighMem(page)))
+                       return 0;
+               address = (unsigned long)page_address(page);
+       } else if (cpa->flags & CPA_ARRAY)
                address = cpa->vaddr[cpa->curpage];
        else
                address = *cpa->vaddr;
@@@ -698,9 -700,12 +701,12 @@@ static int cpa_process_alias(struct cpa
         * No need to redo, when the primary call touched the direct
         * mapping already:
         */
-       if (cpa->flags & CPA_PAGES_ARRAY)
-               vaddr = (unsigned long)page_address(cpa->pages[cpa->curpage]);
-       else if (cpa->flags & CPA_ARRAY)
+       if (cpa->flags & CPA_PAGES_ARRAY) {
+               struct page *page = cpa->pages[cpa->curpage];
+               if (unlikely(PageHighMem(page)))
+                       return 0;
+               vaddr = (unsigned long)page_address(page);
+       } else if (cpa->flags & CPA_ARRAY)
                vaddr = cpa->vaddr[cpa->curpage];
        else
                vaddr = *cpa->vaddr;
@@@ -998,12 -1003,15 +1004,15 @@@ EXPORT_SYMBOL(set_memory_array_uc)
  int _set_memory_wc(unsigned long addr, int numpages)
  {
        int ret;
+       unsigned long addr_copy = addr;
        ret = change_page_attr_set(&addr, numpages,
                                    __pgprot(_PAGE_CACHE_UC_MINUS), 0);
        if (!ret) {
-               ret = change_page_attr_set(&addr, numpages,
-                                   __pgprot(_PAGE_CACHE_WC), 0);
+               ret = change_page_attr_set_clr(&addr_copy, numpages,
+                                              __pgprot(_PAGE_CACHE_WC),
+                                              __pgprot(_PAGE_CACHE_MASK),
+                                              0, 0, NULL);
        }
        return ret;
  }
@@@ -1120,7 -1128,9 +1129,9 @@@ int set_pages_array_uc(struct page **pa
        int free_idx;
  
        for (i = 0; i < addrinarray; i++) {
-               start = (unsigned long)page_address(pages[i]);
+               if (PageHighMem(pages[i]))
+                       continue;
+               start = page_to_pfn(pages[i]) << PAGE_SHIFT;
                end = start + PAGE_SIZE;
                if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL))
                        goto err_out;
  err_out:
        free_idx = i;
        for (i = 0; i < free_idx; i++) {
-               start = (unsigned long)page_address(pages[i]);
+               if (PageHighMem(pages[i]))
+                       continue;
+               start = page_to_pfn(pages[i]) << PAGE_SHIFT;
                end = start + PAGE_SIZE;
                free_memtype(start, end);
        }
@@@ -1162,7 -1174,9 +1175,9 @@@ int set_pages_array_wb(struct page **pa
                return retval;
  
        for (i = 0; i < addrinarray; i++) {
-               start = (unsigned long)page_address(pages[i]);
+               if (PageHighMem(pages[i]))
+                       continue;
+               start = page_to_pfn(pages[i]) << PAGE_SHIFT;
                end = start + PAGE_SIZE;
                free_memtype(start, end);
        }
diff --combined block/cfq-iosched.c
@@@ -48,7 -48,7 +48,7 @@@ static int cfq_slice_idle = HZ / 125
  static struct kmem_cache *cfq_pool;
  static struct kmem_cache *cfq_ioc_pool;
  
 -static DEFINE_PER_CPU(unsigned long, ioc_count);
 +static DEFINE_PER_CPU(unsigned long, cfq_ioc_count);
  static struct completion *ioc_gone;
  static DEFINE_SPINLOCK(ioc_gone_lock);
  
@@@ -1427,7 -1427,7 +1427,7 @@@ static void cfq_cic_free_rcu(struct rcu
        cic = container_of(head, struct cfq_io_context, rcu_head);
  
        kmem_cache_free(cfq_ioc_pool, cic);
 -      elv_ioc_count_dec(ioc_count);
 +      elv_ioc_count_dec(cfq_ioc_count);
  
        if (ioc_gone) {
                /*
                 * complete ioc_gone and set it back to NULL
                 */
                spin_lock(&ioc_gone_lock);
 -              if (ioc_gone && !elv_ioc_count_read(ioc_count)) {
 +              if (ioc_gone && !elv_ioc_count_read(cfq_ioc_count)) {
                        complete(ioc_gone);
                        ioc_gone = NULL;
                }
@@@ -1562,7 -1562,7 +1562,7 @@@ cfq_alloc_io_context(struct cfq_data *c
                INIT_HLIST_NODE(&cic->cic_list);
                cic->dtor = cfq_free_io_context;
                cic->exit = cfq_exit_io_context;
 -              elv_ioc_count_inc(ioc_count);
 +              elv_ioc_count_inc(cfq_ioc_count);
        }
  
        return cic;
@@@ -2311,7 -2311,7 +2311,7 @@@ cfq_set_request(struct request_queue *q
                goto queue_fail;
  
        cfqq = cic_to_cfqq(cic, is_sync);
-       if (!cfqq) {
+       if (!cfqq || cfqq == &cfqd->oom_cfqq) {
                cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask);
                cic_set_cfqq(cic, cfqq, is_sync);
        }
@@@ -2668,7 -2668,7 +2668,7 @@@ static void __exit cfq_exit(void
         * this also protects us from entering cfq_slab_kill() with
         * pending RCU callbacks
         */
 -      if (elv_ioc_count_read(ioc_count))
 +      if (elv_ioc_count_read(cfq_ioc_count))
                wait_for_completion(&all_gone);
        cfq_slab_kill();
  }
@@@ -64,21 -64,20 +64,20 @@@ struct cpu_dbs_info_s 
        unsigned int requested_freq;
        int cpu;
        unsigned int enable:1;
+       /*
+        * percpu mutex that serializes governor limit change with
+        * do_dbs_timer invocation. We do not want do_dbs_timer to run
+        * when user is changing the governor or limits.
+        */
+       struct mutex timer_mutex;
  };
 -static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info);
 +static DEFINE_PER_CPU(struct cpu_dbs_info_s, cs_cpu_dbs_info);
  
  static unsigned int dbs_enable;       /* number of CPUs using this policy */
  
  /*
-  * DEADLOCK ALERT! There is a ordering requirement between cpu_hotplug
-  * lock and dbs_mutex. cpu_hotplug lock should always be held before
-  * dbs_mutex. If any function that can potentially take cpu_hotplug lock
-  * (like __cpufreq_driver_target()) is being called with dbs_mutex taken, then
-  * cpu_hotplug lock should be taken before that. Note that cpu_hotplug lock
-  * is recursive for the same process. -Venki
-  * DEADLOCK ALERT! (2) : do_dbs_timer() must not take the dbs_mutex, because it
-  * would deadlock with cancel_delayed_work_sync(), which is needed for proper
-  * raceless workqueue teardown.
+  * dbs_mutex protects data in dbs_tuners_ins from concurrent changes on
+  * different CPUs. It protects dbs_enable in governor start/stop.
   */
  static DEFINE_MUTEX(dbs_mutex);
  
@@@ -138,7 -137,7 +137,7 @@@ dbs_cpufreq_notifier(struct notifier_bl
                     void *data)
  {
        struct cpufreq_freqs *freq = data;
 -      struct cpu_dbs_info_s *this_dbs_info = &per_cpu(cpu_dbs_info,
 +      struct cpu_dbs_info_s *this_dbs_info = &per_cpu(cs_cpu_dbs_info,
                                                        freq->cpu);
  
        struct cpufreq_policy *policy;
@@@ -298,7 -297,7 +297,7 @@@ static ssize_t store_ignore_nice_load(s
        /* we need to re-evaluate prev_cpu_idle */
        for_each_online_cpu(j) {
                struct cpu_dbs_info_s *dbs_info;
 -              dbs_info = &per_cpu(cpu_dbs_info, j);
 +              dbs_info = &per_cpu(cs_cpu_dbs_info, j);
                dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
                                                &dbs_info->prev_cpu_wall);
                if (dbs_tuners_ins.ignore_nice)
@@@ -388,7 -387,7 +387,7 @@@ static void dbs_check_cpu(struct cpu_db
                cputime64_t cur_wall_time, cur_idle_time;
                unsigned int idle_time, wall_time;
  
 -              j_dbs_info = &per_cpu(cpu_dbs_info, j);
 +              j_dbs_info = &per_cpu(cs_cpu_dbs_info, j);
  
                cur_idle_time = get_cpu_idle_time(j, &cur_wall_time);
  
@@@ -488,18 -487,12 +487,12 @@@ static void do_dbs_timer(struct work_st
  
        delay -= jiffies % delay;
  
-       if (lock_policy_rwsem_write(cpu) < 0)
-               return;
-       if (!dbs_info->enable) {
-               unlock_policy_rwsem_write(cpu);
-               return;
-       }
+       mutex_lock(&dbs_info->timer_mutex);
  
        dbs_check_cpu(dbs_info);
  
        queue_delayed_work_on(cpu, kconservative_wq, &dbs_info->work, delay);
-       unlock_policy_rwsem_write(cpu);
+       mutex_unlock(&dbs_info->timer_mutex);
  }
  
  static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info)
@@@ -528,16 -521,13 +521,13 @@@ static int cpufreq_governor_dbs(struct 
        unsigned int j;
        int rc;
  
 -      this_dbs_info = &per_cpu(cpu_dbs_info, cpu);
 +      this_dbs_info = &per_cpu(cs_cpu_dbs_info, cpu);
  
        switch (event) {
        case CPUFREQ_GOV_START:
                if ((!cpu_online(cpu)) || (!policy->cur))
                        return -EINVAL;
  
-               if (this_dbs_info->enable) /* Already enabled */
-                       break;
                mutex_lock(&dbs_mutex);
  
                rc = sysfs_create_group(&policy->kobj, &dbs_attr_group);
  
                for_each_cpu(j, policy->cpus) {
                        struct cpu_dbs_info_s *j_dbs_info;
 -                      j_dbs_info = &per_cpu(cpu_dbs_info, j);
 +                      j_dbs_info = &per_cpu(cs_cpu_dbs_info, j);
                        j_dbs_info->cur_policy = policy;
  
                        j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
                this_dbs_info->down_skip = 0;
                this_dbs_info->requested_freq = policy->cur;
  
+               mutex_init(&this_dbs_info->timer_mutex);
                dbs_enable++;
                /*
                 * Start the timerschedule work, when this governor
                                        &dbs_cpufreq_notifier_block,
                                        CPUFREQ_TRANSITION_NOTIFIER);
                }
-               dbs_timer_init(this_dbs_info);
                mutex_unlock(&dbs_mutex);
  
+               dbs_timer_init(this_dbs_info);
                break;
  
        case CPUFREQ_GOV_STOP:
-               mutex_lock(&dbs_mutex);
                dbs_timer_exit(this_dbs_info);
+               mutex_lock(&dbs_mutex);
                sysfs_remove_group(&policy->kobj, &dbs_attr_group);
                dbs_enable--;
+               mutex_destroy(&this_dbs_info->timer_mutex);
  
                /*
                 * Stop the timerschedule work, when this governor
                break;
  
        case CPUFREQ_GOV_LIMITS:
-               mutex_lock(&dbs_mutex);
+               mutex_lock(&this_dbs_info->timer_mutex);
                if (policy->max < this_dbs_info->cur_policy->cur)
                        __cpufreq_driver_target(
                                        this_dbs_info->cur_policy,
                        __cpufreq_driver_target(
                                        this_dbs_info->cur_policy,
                                        policy->min, CPUFREQ_RELATION_L);
-               mutex_unlock(&dbs_mutex);
+               mutex_unlock(&this_dbs_info->timer_mutex);
  
                break;
        }
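
The conservative governor hunks above replace the per-policy rwsem and the dbs_info->enable flag with a per-CPU timer_mutex, and cancel the delayed work before taking dbs_mutex on the stop path. A minimal sketch of that pattern follows; the names sample_state, cpu_sample_state, do_sample, governor_stop and global_mutex are invented for illustration and do not exist in the kernel.

#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/workqueue.h>

struct sample_state {
	struct mutex timer_mutex;	/* timer vs. governor/limit changes */
	struct delayed_work work;
};
static DEFINE_PER_CPU(struct sample_state, cpu_sample_state);
static DEFINE_MUTEX(global_mutex);	/* tunables and enable count only */

static void do_sample(struct work_struct *work)
{
	struct sample_state *s =
		container_of(work, struct sample_state, work.work);

	mutex_lock(&s->timer_mutex);
	/* sample load, adjust the frequency, requeue the delayed work */
	mutex_unlock(&s->timer_mutex);
}

static void governor_stop(int cpu)
{
	struct sample_state *s = &per_cpu(cpu_sample_state, cpu);

	/* cancel first, outside global_mutex, as in the GOV_STOP hunk */
	cancel_delayed_work_sync(&s->work);

	mutex_lock(&global_mutex);
	/* remove the sysfs group, drop the enable count */
	mutex_unlock(&global_mutex);
	mutex_destroy(&s->timer_mutex);
}

Because the work function only ever takes the per-CPU timer_mutex, cancel_delayed_work_sync() can run without the deadlock against dbs_mutex that the removed DEADLOCK ALERT comment further down warns about; dbs_timer_init() is likewise moved outside dbs_mutex on the start path.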
diff --combined drivers/cpufreq/cpufreq_ondemand.c
@@@ -70,23 -70,21 +70,21 @@@ struct cpu_dbs_info_s 
        unsigned int freq_lo_jiffies;
        unsigned int freq_hi_jiffies;
        int cpu;
-       unsigned int enable:1,
-               sample_type:1;
+       unsigned int sample_type:1;
+       /*
+        * percpu mutex that serializes governor limit change with
+        * do_dbs_timer invocation. We do not want do_dbs_timer to run
+        * when user is changing the governor or limits.
+        */
+       struct mutex timer_mutex;
  };
 -static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info);
 +static DEFINE_PER_CPU(struct cpu_dbs_info_s, od_cpu_dbs_info);
  
  static unsigned int dbs_enable;       /* number of CPUs using this policy */
  
  /*
-  * DEADLOCK ALERT! There is a ordering requirement between cpu_hotplug
-  * lock and dbs_mutex. cpu_hotplug lock should always be held before
-  * dbs_mutex. If any function that can potentially take cpu_hotplug lock
-  * (like __cpufreq_driver_target()) is being called with dbs_mutex taken, then
-  * cpu_hotplug lock should be taken before that. Note that cpu_hotplug lock
-  * is recursive for the same process. -Venki
-  * DEADLOCK ALERT! (2) : do_dbs_timer() must not take the dbs_mutex, because it
-  * would deadlock with cancel_delayed_work_sync(), which is needed for proper
-  * raceless workqueue teardown.
+  * dbs_mutex protects data in dbs_tuners_ins from concurrent changes on
+  * different CPUs. It protects dbs_enable in governor start/stop.
   */
  static DEFINE_MUTEX(dbs_mutex);
  
@@@ -151,8 -149,7 +149,8 @@@ static unsigned int powersave_bias_targ
        unsigned int freq_hi, freq_lo;
        unsigned int index = 0;
        unsigned int jiffies_total, jiffies_hi, jiffies_lo;
 -      struct cpu_dbs_info_s *dbs_info = &per_cpu(cpu_dbs_info, policy->cpu);
 +      struct cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info,
 +                                                 policy->cpu);
  
        if (!dbs_info->freq_table) {
                dbs_info->freq_lo = 0;
        return freq_hi;
  }
  
 -      struct cpu_dbs_info_s *dbs_info = &per_cpu(cpu_dbs_info, cpu);
+ static void ondemand_powersave_bias_init_cpu(int cpu)
+ {
++      struct cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu);
+       dbs_info->freq_table = cpufreq_frequency_get_table(cpu);
+       dbs_info->freq_lo = 0;
+ }
  static void ondemand_powersave_bias_init(void)
  {
        int i;
        for_each_online_cpu(i) {
-               struct cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, i);
-               dbs_info->freq_table = cpufreq_frequency_get_table(i);
-               dbs_info->freq_lo = 0;
+               ondemand_powersave_bias_init_cpu(i);
        }
  }
  
@@@ -241,12 -243,10 +244,10 @@@ static ssize_t store_sampling_rate(stru
        unsigned int input;
        int ret;
        ret = sscanf(buf, "%u", &input);
+       if (ret != 1)
+               return -EINVAL;
  
        mutex_lock(&dbs_mutex);
-       if (ret != 1) {
-               mutex_unlock(&dbs_mutex);
-               return -EINVAL;
-       }
        dbs_tuners_ins.sampling_rate = max(input, min_sampling_rate);
        mutex_unlock(&dbs_mutex);
  
@@@ -260,13 -260,12 +261,12 @@@ static ssize_t store_up_threshold(struc
        int ret;
        ret = sscanf(buf, "%u", &input);
  
-       mutex_lock(&dbs_mutex);
        if (ret != 1 || input > MAX_FREQUENCY_UP_THRESHOLD ||
                        input < MIN_FREQUENCY_UP_THRESHOLD) {
-               mutex_unlock(&dbs_mutex);
                return -EINVAL;
        }
  
+       mutex_lock(&dbs_mutex);
        dbs_tuners_ins.up_threshold = input;
        mutex_unlock(&dbs_mutex);
  
@@@ -298,7 -297,7 +298,7 @@@ static ssize_t store_ignore_nice_load(s
        /* we need to re-evaluate prev_cpu_idle */
        for_each_online_cpu(j) {
                struct cpu_dbs_info_s *dbs_info;
 -              dbs_info = &per_cpu(cpu_dbs_info, j);
 +              dbs_info = &per_cpu(od_cpu_dbs_info, j);
                dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
                                                &dbs_info->prev_cpu_wall);
                if (dbs_tuners_ins.ignore_nice)
@@@ -364,9 -363,6 +364,6 @@@ static void dbs_check_cpu(struct cpu_db
        struct cpufreq_policy *policy;
        unsigned int j;
  
-       if (!this_dbs_info->enable)
-               return;
        this_dbs_info->freq_lo = 0;
        policy = this_dbs_info->cur_policy;
  
                unsigned int load, load_freq;
                int freq_avg;
  
 -              j_dbs_info = &per_cpu(cpu_dbs_info, j);
 +              j_dbs_info = &per_cpu(od_cpu_dbs_info, j);
  
                cur_idle_time = get_cpu_idle_time(j, &cur_wall_time);
  
@@@ -494,14 -490,7 +491,7 @@@ static void do_dbs_timer(struct work_st
        int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate);
  
        delay -= jiffies % delay;
-       if (lock_policy_rwsem_write(cpu) < 0)
-               return;
-       if (!dbs_info->enable) {
-               unlock_policy_rwsem_write(cpu);
-               return;
-       }
+       mutex_lock(&dbs_info->timer_mutex);
  
        /* Common NORMAL_SAMPLE setup */
        dbs_info->sample_type = DBS_NORMAL_SAMPLE;
                        dbs_info->freq_lo, CPUFREQ_RELATION_H);
        }
        queue_delayed_work_on(cpu, kondemand_wq, &dbs_info->work, delay);
-       unlock_policy_rwsem_write(cpu);
+       mutex_unlock(&dbs_info->timer_mutex);
  }
  
  static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info)
        int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate);
        delay -= jiffies % delay;
  
-       dbs_info->enable = 1;
-       ondemand_powersave_bias_init();
        dbs_info->sample_type = DBS_NORMAL_SAMPLE;
        INIT_DELAYED_WORK_DEFERRABLE(&dbs_info->work, do_dbs_timer);
        queue_delayed_work_on(dbs_info->cpu, kondemand_wq, &dbs_info->work,
  
  static inline void dbs_timer_exit(struct cpu_dbs_info_s *dbs_info)
  {
-       dbs_info->enable = 0;
        cancel_delayed_work_sync(&dbs_info->work);
  }
  
@@@ -549,29 -535,25 +536,25 @@@ static int cpufreq_governor_dbs(struct 
        unsigned int j;
        int rc;
  
 -      this_dbs_info = &per_cpu(cpu_dbs_info, cpu);
 +      this_dbs_info = &per_cpu(od_cpu_dbs_info, cpu);
  
        switch (event) {
        case CPUFREQ_GOV_START:
                if ((!cpu_online(cpu)) || (!policy->cur))
                        return -EINVAL;
  
-               if (this_dbs_info->enable) /* Already enabled */
-                       break;
                mutex_lock(&dbs_mutex);
-               dbs_enable++;
  
                rc = sysfs_create_group(&policy->kobj, &dbs_attr_group);
                if (rc) {
-                       dbs_enable--;
                        mutex_unlock(&dbs_mutex);
                        return rc;
                }
  
+               dbs_enable++;
                for_each_cpu(j, policy->cpus) {
                        struct cpu_dbs_info_s *j_dbs_info;
 -                      j_dbs_info = &per_cpu(cpu_dbs_info, j);
 +                      j_dbs_info = &per_cpu(od_cpu_dbs_info, j);
                        j_dbs_info->cur_policy = policy;
  
                        j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
                        }
                }
                this_dbs_info->cpu = cpu;
+               ondemand_powersave_bias_init_cpu(cpu);
+               mutex_init(&this_dbs_info->timer_mutex);
                /*
                 * Start the timerschedule work, when this governor
                 * is used for first time
                                max(min_sampling_rate,
                                    latency * LATENCY_MULTIPLIER);
                }
-               dbs_timer_init(this_dbs_info);
                mutex_unlock(&dbs_mutex);
+               dbs_timer_init(this_dbs_info);
                break;
  
        case CPUFREQ_GOV_STOP:
-               mutex_lock(&dbs_mutex);
                dbs_timer_exit(this_dbs_info);
+               mutex_lock(&dbs_mutex);
                sysfs_remove_group(&policy->kobj, &dbs_attr_group);
+               mutex_destroy(&this_dbs_info->timer_mutex);
                dbs_enable--;
                mutex_unlock(&dbs_mutex);
  
                break;
  
        case CPUFREQ_GOV_LIMITS:
-               mutex_lock(&dbs_mutex);
+               mutex_lock(&this_dbs_info->timer_mutex);
                if (policy->max < this_dbs_info->cur_policy->cur)
                        __cpufreq_driver_target(this_dbs_info->cur_policy,
                                policy->max, CPUFREQ_RELATION_H);
                else if (policy->min > this_dbs_info->cur_policy->cur)
                        __cpufreq_driver_target(this_dbs_info->cur_policy,
                                policy->min, CPUFREQ_RELATION_L);
-               mutex_unlock(&dbs_mutex);
+               mutex_unlock(&this_dbs_info->timer_mutex);
                break;
        }
        return 0;
diff --combined drivers/xen/events.c
  static DEFINE_SPINLOCK(irq_mapping_update_lock);
  
  /* IRQ <-> VIRQ mapping. */
 -static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1};
 +static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1};
  
  /* IRQ <-> IPI mapping */
 -static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1};
 +static DEFINE_PER_CPU(int [XEN_NR_IPIS], ipi_to_irq) = {[0 ... XEN_NR_IPIS-1] = -1};
  
  /* Interrupt types. */
  enum xen_irq_type {
@@@ -602,8 -602,6 +602,8 @@@ irqreturn_t xen_debug_interrupt(int irq
        return IRQ_HANDLED;
  }
  
 +static DEFINE_PER_CPU(unsigned, xed_nesting_count);
 +
  /*
   * Search the CPUs pending events bitmasks.  For each one found, map
   * the event number to an irq, and feed it into do_IRQ() for
@@@ -619,6 -617,7 +619,6 @@@ void xen_evtchn_do_upcall(struct pt_reg
        struct pt_regs *old_regs = set_irq_regs(regs);
        struct shared_info *s = HYPERVISOR_shared_info;
        struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
 -      static DEFINE_PER_CPU(unsigned, nesting_count);
        unsigned count;
  
        exit_idle();
  
                vcpu_info->evtchn_upcall_pending = 0;
  
 -              if (__get_cpu_var(nesting_count)++)
 +              if (__get_cpu_var(xed_nesting_count)++)
                        goto out;
  
  #ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */
  
                BUG_ON(!irqs_disabled());
  
 -              count = __get_cpu_var(nesting_count);
 -              __get_cpu_var(nesting_count) = 0;
 +              count = __get_cpu_var(xed_nesting_count);
 +              __get_cpu_var(xed_nesting_count) = 0;
        } while(count != 1);
  
  out:
@@@ -928,9 -927,9 +928,9 @@@ static struct irq_chip xen_dynamic_chi
  void __init xen_init_IRQ(void)
  {
        int i;
-       size_t size = nr_cpu_ids * sizeof(struct cpu_evtchn_s);
  
-       cpu_evtchn_mask_p = alloc_bootmem(size);
+       cpu_evtchn_mask_p = kcalloc(nr_cpu_ids, sizeof(struct cpu_evtchn_s),
+                                   GFP_KERNEL);
        BUG_ON(cpu_evtchn_mask_p == NULL);
  
        init_evtchn_cpu_bindings();
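
Two per-CPU definition conventions recur in this merge: formerly generic static names gain a subsystem prefix (cs_cpu_dbs_info, od_cpu_dbs_info, xed_nesting_count, perf_disable_count), apparently because static per-CPU symbols now need tree-wide unique names, and per-CPU arrays are declared with the array type first and a plain identifier as the name, as in the virq_to_irq and ipi_to_irq lines above. A small illustration of both forms, using invented names (foo_state, foo_slots, FOO_NR_SLOTS):

#include <linux/percpu.h>

#define FOO_NR_SLOTS	8

/* file-local per-CPU scalar with a subsystem-specific name */
static DEFINE_PER_CPU(unsigned int, foo_state);

/*
 * per-CPU array: the element type and bound come first,
 * the variable name stays a plain identifier
 */
static DEFINE_PER_CPU(int [FOO_NR_SLOTS], foo_slots) = {
	[0 ... FOO_NR_SLOTS - 1] = -1
};

static void foo_touch(void)
{
	__get_cpu_var(foo_state)++;
	__get_cpu_var(foo_slots)[0] = 0;
}

Presumably for the same reason, the function-local static DEFINE_PER_CPU in xen_evtchn_do_upcall() moves to file scope under the xed_ prefix.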
diff --combined include/asm-generic/vmlinux.lds.h
   *    EXCEPTION_TABLE(...)
   *    NOTES
   *
-  *    __bss_start = .;
-  *    BSS_SECTION(0, 0)
-  *    __bss_stop = .;
+  *    BSS_SECTION(0, 0, 0)
   *    _end = .;
   *
 - *    /DISCARD/ : {
 - *            EXIT_TEXT
 - *            EXIT_DATA
 - *            EXIT_CALL
 - *    }
   *    STABS_DEBUG
   *    DWARF_DEBUG
 + *
 + *    DISCARDS                // must be the last
   * }
   *
   * [__init_begin, __init_end] is the init section that may be freed after init
        . = ALIGN(align);                                               \
        *(.data.cacheline_aligned)
  
- #define INIT_TASK(align)                                              \
+ #define INIT_TASK_DATA(align)                                         \
        . = ALIGN(align);                                               \
        *(.data.init_task)
  
  /*
   * Init task
   */
- #define INIT_TASK_DATA(align)                                         \
+ #define INIT_TASK_DATA_SECTION(align)                                 \
        . = ALIGN(align);                                               \
        .data.init_task : {                                             \
-               INIT_TASK                                               \
+               INIT_TASK_DATA(align)                                   \
        }
  
  #ifdef CONFIG_CONSTRUCTORS
   * bss (Block Started by Symbol) - uninitialized data
   * zeroed during startup
   */
- #define SBSS                                                          \
+ #define SBSS(sbss_align)                                              \
+       . = ALIGN(sbss_align);                                          \
        .sbss : AT(ADDR(.sbss) - LOAD_OFFSET) {                         \
                *(.sbss)                                                \
                *(.scommon)                                             \
  #define BSS(bss_align)                                                        \
        . = ALIGN(bss_align);                                           \
        .bss : AT(ADDR(.bss) - LOAD_OFFSET) {                           \
-               VMLINUX_SYMBOL(__bss_start) = .;                        \
                *(.bss.page_aligned)                                    \
                *(.dynbss)                                              \
                *(.bss)                                                 \
                *(COMMON)                                               \
-               VMLINUX_SYMBOL(__bss_stop) = .;                         \
        }
  
  /*
  #define INIT_RAM_FS
  #endif
  
 +/*
 + * Default discarded sections.
 + *
 + * Some archs want to discard exit text/data at runtime rather than
 + * link time due to cross-section references such as alt instructions,
 + * bug table, eh_frame, etc.  DISCARDS must be the last of output
 + * section definitions so that such archs put those in earlier section
 + * definitions.
 + */
 +#define DISCARDS                                                      \
 +      /DISCARD/ : {                                                   \
 +      EXIT_TEXT                                                       \
 +      EXIT_DATA                                                       \
 +      EXIT_CALL                                                       \
 +      *(.discard)                                                     \
 +      }
 +
  /**
   * PERCPU_VADDR - define output section for percpu area
   * @vaddr: explicit base address (optional)
   * matches the requirment of PAGE_ALIGNED_DATA.
   *
   * use 0 as page_align if page_aligned data is not used */
- #define RW_DATA_SECTION(cacheline, nosave, pagealigned, inittask)     \
+ #define RW_DATA_SECTION(cacheline, pagealigned, inittask)             \
        . = ALIGN(PAGE_SIZE);                                           \
        .data : AT(ADDR(.data) - LOAD_OFFSET) {                         \
-               INIT_TASK(inittask)                                     \
+               INIT_TASK_DATA(inittask)                                \
                CACHELINE_ALIGNED_DATA(cacheline)                       \
                READ_MOSTLY_DATA(cacheline)                             \
                DATA_DATA                                               \
                CONSTRUCTORS                                            \
-               NOSAVE_DATA(nosave)                                     \
+               NOSAVE_DATA                                             \
                PAGE_ALIGNED_DATA(pagealigned)                          \
        }
  
                INIT_RAM_FS                                             \
        }
  
- #define BSS_SECTION(sbss_align, bss_align)                            \
-       SBSS                                                            \
+ #define BSS_SECTION(sbss_align, bss_align, stop_align)                        \
+       . = ALIGN(sbss_align);                                          \
+       VMLINUX_SYMBOL(__bss_start) = .;                                \
+       SBSS(sbss_align)                                                \
        BSS(bss_align)                                                  \
-       . = ALIGN(4);
+       . = ALIGN(stop_align);                                          \
+       VMLINUX_SYMBOL(__bss_stop) = .;
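
The linker-script helpers above change shape: INIT_TASK becomes INIT_TASK_DATA (with INIT_TASK_DATA_SECTION as the standalone section form), RW_DATA_SECTION loses its nosave argument, SBSS gains an alignment argument, BSS_SECTION gains a stop-alignment argument and now emits __bss_start/__bss_stop itself, and the new DISCARDS macro must be the last output section. Consistent with the file's own example layout, an arch vmlinux.lds.S would use them roughly as sketched below; the alignment values are placeholders rather than any particular architecture's choices.

/*
 *	SECTIONS
 *	{
 *		...
 *		RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE)
 *		...
 *		BSS_SECTION(0, 0, 0)
 *		_end = .;
 *
 *		STABS_DEBUG
 *		DWARF_DEBUG
 *
 *		DISCARDS	// must remain the last output section
 *	}
 */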
diff --combined init/main.c
@@@ -353,6 -353,7 +353,6 @@@ static void __init smp_init(void
  #define smp_init()    do { } while (0)
  #endif
  
 -static inline void setup_per_cpu_areas(void) { }
  static inline void setup_nr_cpu_ids(void) { }
  static inline void smp_prepare_cpus(unsigned int maxcpus) { }
  
@@@ -373,6 -374,29 +373,6 @@@ static void __init setup_nr_cpu_ids(voi
        nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1;
  }
  
 -#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
 -unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
 -
 -EXPORT_SYMBOL(__per_cpu_offset);
 -
 -static void __init setup_per_cpu_areas(void)
 -{
 -      unsigned long size, i;
 -      char *ptr;
 -      unsigned long nr_possible_cpus = num_possible_cpus();
 -
 -      /* Copy section for each CPU (we discard the original) */
 -      size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
 -      ptr = alloc_bootmem_pages(size * nr_possible_cpus);
 -
 -      for_each_possible_cpu(i) {
 -              __per_cpu_offset[i] = ptr - __per_cpu_start;
 -              memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
 -              ptr += size;
 -      }
 -}
 -#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
 -
  /* Called by boot processor to activate the rest. */
  static void __init smp_init(void)
  {
@@@ -560,8 -584,8 +560,8 @@@ asmlinkage void __init start_kernel(voi
        setup_arch(&command_line);
        mm_init_owner(&init_mm, &init_task);
        setup_command_line(command_line);
-       setup_per_cpu_areas();
        setup_nr_cpu_ids();
+       setup_per_cpu_areas();
        smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
  
        build_all_zonelists();
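
Two things are visible in the init/main.c hunks above: the legacy generic setup_per_cpu_areas() that copied the static per-CPU section once per possible CPU from bootmem and stored each copy's delta from __per_cpu_start in __per_cpu_offset[], and the reordering of start_kernel() so that setup_nr_cpu_ids() runs before the per-CPU areas are set up. Conceptually, the stored offset relates a per-CPU variable's link-time address to CPU i's private copy; the helper below is only a restatement of that relation, not the real per_cpu() implementation.

/* illustrative only: what the recorded __per_cpu_offset[] value means */
static inline void *per_cpu_addr_of(void *link_addr, unsigned int cpu)
{
	return (char *)link_addr + __per_cpu_offset[cpu];
}

This is exactly why the removed loop saved ptr - __per_cpu_start after memcpy()ing the original section for each possible CPU.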
diff --combined kernel/module.c
@@@ -364,7 -364,7 +364,7 @@@ EXPORT_SYMBOL_GPL(find_module)
  
  #ifdef CONFIG_SMP
  
 -#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
 +#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
  
  static void *percpu_modalloc(unsigned long size, unsigned long align,
                             const char *name)
@@@ -389,7 -389,7 +389,7 @@@ static void percpu_modfree(void *freeme
        free_percpu(freeme);
  }
  
 -#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
 +#else /* ... CONFIG_HAVE_LEGACY_PER_CPU_AREA */
  
  /* Number of blocks used and allocated. */
  static unsigned int pcpu_num_used, pcpu_num_allocated;
@@@ -535,7 -535,7 +535,7 @@@ static int percpu_modinit(void
  }
  __initcall(percpu_modinit);
  
 -#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
 +#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */
  
  static unsigned int find_pcpusec(Elf_Ehdr *hdr,
                                 Elf_Shdr *sechdrs,
@@@ -1068,7 -1068,8 +1068,8 @@@ static inline int check_modstruct_versi
  {
        const unsigned long *crc;
  
-       if (!find_symbol("module_layout", NULL, &crc, true, false))
+       if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL,
+                        &crc, true, false))
                BUG();
        return check_version(sechdrs, versindex, "module_layout", mod, crc);
  }
@@@ -2451,9 -2452,9 +2452,9 @@@ SYSCALL_DEFINE3(init_module, void __use
                return ret;
        }
        if (ret > 0) {
-               printk(KERN_WARNING "%s: '%s'->init suspiciously returned %d, "
-                                 "it should follow 0/-E convention\n"
-                      KERN_WARNING "%s: loading module anyway...\n",
+               printk(KERN_WARNING
+ "%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n"
+ "%s: loading module anyway...\n",
                       __func__, mod->name, ret,
                       __func__);
                dump_stack();
diff --combined kernel/perf_counter.c
@@@ -42,6 -42,7 +42,7 @@@ static int perf_overcommit __read_mostl
  static atomic_t nr_counters __read_mostly;
  static atomic_t nr_mmap_counters __read_mostly;
  static atomic_t nr_comm_counters __read_mostly;
+ static atomic_t nr_task_counters __read_mostly;
  
  /*
   * perf counter paranoia level:
@@@ -87,6 -88,7 +88,7 @@@ void __weak hw_perf_disable(void)             { ba
  void __weak hw_perf_enable(void)              { barrier(); }
  
  void __weak hw_perf_counter_setup(int cpu)    { barrier(); }
+ void __weak hw_perf_counter_setup_online(int cpu)     { barrier(); }
  
  int __weak
  hw_perf_group_sched_in(struct perf_counter *group_leader,
  
  void __weak perf_counter_print_debug(void)    { }
  
 -static DEFINE_PER_CPU(int, disable_count);
 +static DEFINE_PER_CPU(int, perf_disable_count);
  
  void __perf_disable(void)
  {
 -      __get_cpu_var(disable_count)++;
 +      __get_cpu_var(perf_disable_count)++;
  }
  
  bool __perf_enable(void)
  {
 -      return !--__get_cpu_var(disable_count);
 +      return !--__get_cpu_var(perf_disable_count);
  }
  
  void perf_disable(void)
@@@ -146,6 -148,28 +148,28 @@@ static void put_ctx(struct perf_counter
        }
  }
  
+ static void unclone_ctx(struct perf_counter_context *ctx)
+ {
+       if (ctx->parent_ctx) {
+               put_ctx(ctx->parent_ctx);
+               ctx->parent_ctx = NULL;
+       }
+ }
+ /*
+  * If we inherit counters we want to return the parent counter id
+  * to userspace.
+  */
+ static u64 primary_counter_id(struct perf_counter *counter)
+ {
+       u64 id = counter->id;
+       if (counter->parent)
+               id = counter->parent->id;
+       return id;
+ }
  /*
   * Get the perf_counter_context for a task and lock it.
   * This has to cope with with the fact that until it is locked,
@@@ -283,6 -307,10 +307,10 @@@ counter_sched_out(struct perf_counter *
                return;
  
        counter->state = PERF_COUNTER_STATE_INACTIVE;
+       if (counter->pending_disable) {
+               counter->pending_disable = 0;
+               counter->state = PERF_COUNTER_STATE_OFF;
+       }
        counter->tstamp_stopped = ctx->time;
        counter->pmu->disable(counter);
        counter->oncpu = -1;
@@@ -1081,7 -1109,7 +1109,7 @@@ static void perf_counter_sync_stat(stru
                __perf_counter_sync_stat(counter, next_counter);
  
                counter = list_next_entry(counter, event_entry);
-               next_counter = list_next_entry(counter, event_entry);
+               next_counter = list_next_entry(next_counter, event_entry);
        }
  }
  
@@@ -1288,7 -1316,6 +1316,6 @@@ static void perf_counter_cpu_sched_in(s
  #define MAX_INTERRUPTS (~0ULL)
  
  static void perf_log_throttle(struct perf_counter *counter, int enable);
- static void perf_log_period(struct perf_counter *counter, u64 period);
  
  static void perf_adjust_period(struct perf_counter *counter, u64 events)
  {
        if (!sample_period)
                sample_period = 1;
  
-       perf_log_period(counter, sample_period);
        hwc->sample_period = sample_period;
  }
  
@@@ -1463,10 -1488,8 +1488,8 @@@ static void perf_counter_enable_on_exec
        /*
         * Unclone this context if we enabled any counter.
         */
-       if (enabled && ctx->parent_ctx) {
-               put_ctx(ctx->parent_ctx);
-               ctx->parent_ctx = NULL;
-       }
+       if (enabled)
+               unclone_ctx(ctx);
  
        spin_unlock(&ctx->lock);
  
@@@ -1526,7 -1549,6 +1549,6 @@@ __perf_counter_init_context(struct perf
  
  static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
  {
-       struct perf_counter_context *parent_ctx;
        struct perf_counter_context *ctx;
        struct perf_cpu_context *cpuctx;
        struct task_struct *task;
   retry:
        ctx = perf_lock_task_context(task, &flags);
        if (ctx) {
-               parent_ctx = ctx->parent_ctx;
-               if (parent_ctx) {
-                       put_ctx(parent_ctx);
-                       ctx->parent_ctx = NULL;         /* no longer a clone */
-               }
+               unclone_ctx(ctx);
                spin_unlock_irqrestore(&ctx->lock, flags);
        }
  
@@@ -1642,6 -1660,8 +1660,8 @@@ static void free_counter(struct perf_co
                        atomic_dec(&nr_mmap_counters);
                if (counter->attr.comm)
                        atomic_dec(&nr_comm_counters);
+               if (counter->attr.task)
+                       atomic_dec(&nr_task_counters);
        }
  
        if (counter->destroy)
@@@ -1676,14 -1696,133 +1696,133 @@@ static int perf_release(struct inode *i
        return 0;
  }
  
+ static int perf_counter_read_size(struct perf_counter *counter)
+ {
+       int entry = sizeof(u64); /* value */
+       int size = 0;
+       int nr = 1;
+       if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+               size += sizeof(u64);
+       if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+               size += sizeof(u64);
+       if (counter->attr.read_format & PERF_FORMAT_ID)
+               entry += sizeof(u64);
+       if (counter->attr.read_format & PERF_FORMAT_GROUP) {
+               nr += counter->group_leader->nr_siblings;
+               size += sizeof(u64);
+       }
+       size += entry * nr;
+       return size;
+ }
+ static u64 perf_counter_read_value(struct perf_counter *counter)
+ {
+       struct perf_counter *child;
+       u64 total = 0;
+       total += perf_counter_read(counter);
+       list_for_each_entry(child, &counter->child_list, child_list)
+               total += perf_counter_read(child);
+       return total;
+ }
+ static int perf_counter_read_entry(struct perf_counter *counter,
+                                  u64 read_format, char __user *buf)
+ {
+       int n = 0, count = 0;
+       u64 values[2];
+       values[n++] = perf_counter_read_value(counter);
+       if (read_format & PERF_FORMAT_ID)
+               values[n++] = primary_counter_id(counter);
+       count = n * sizeof(u64);
+       if (copy_to_user(buf, values, count))
+               return -EFAULT;
+       return count;
+ }
+ static int perf_counter_read_group(struct perf_counter *counter,
+                                  u64 read_format, char __user *buf)
+ {
+       struct perf_counter *leader = counter->group_leader, *sub;
+       int n = 0, size = 0, err = -EFAULT;
+       u64 values[3];
+       values[n++] = 1 + leader->nr_siblings;
+       if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+               values[n++] = leader->total_time_enabled +
+                       atomic64_read(&leader->child_total_time_enabled);
+       }
+       if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+               values[n++] = leader->total_time_running +
+                       atomic64_read(&leader->child_total_time_running);
+       }
+       size = n * sizeof(u64);
+       if (copy_to_user(buf, values, size))
+               return -EFAULT;
+       err = perf_counter_read_entry(leader, read_format, buf + size);
+       if (err < 0)
+               return err;
+       size += err;
+       list_for_each_entry(sub, &leader->sibling_list, list_entry) {
+               err = perf_counter_read_entry(counter, read_format,
+                               buf + size);
+               if (err < 0)
+                       return err;
+               size += err;
+       }
+       return size;
+ }
+ static int perf_counter_read_one(struct perf_counter *counter,
+                                u64 read_format, char __user *buf)
+ {
+       u64 values[4];
+       int n = 0;
+       values[n++] = perf_counter_read_value(counter);
+       if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+               values[n++] = counter->total_time_enabled +
+                       atomic64_read(&counter->child_total_time_enabled);
+       }
+       if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+               values[n++] = counter->total_time_running +
+                       atomic64_read(&counter->child_total_time_running);
+       }
+       if (read_format & PERF_FORMAT_ID)
+               values[n++] = primary_counter_id(counter);
+       if (copy_to_user(buf, values, n * sizeof(u64)))
+               return -EFAULT;
+       return n * sizeof(u64);
+ }
  /*
   * Read the performance counter - simple non blocking version for now
   */
  static ssize_t
  perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
  {
-       u64 values[4];
-       int n;
+       u64 read_format = counter->attr.read_format;
+       int ret;
  
        /*
         * Return end-of-file for a read on a counter that is in
        if (counter->state == PERF_COUNTER_STATE_ERROR)
                return 0;
  
+       if (count < perf_counter_read_size(counter))
+               return -ENOSPC;
        WARN_ON_ONCE(counter->ctx->parent_ctx);
        mutex_lock(&counter->child_mutex);
-       values[0] = perf_counter_read(counter);
-       n = 1;
-       if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
-               values[n++] = counter->total_time_enabled +
-                       atomic64_read(&counter->child_total_time_enabled);
-       if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
-               values[n++] = counter->total_time_running +
-                       atomic64_read(&counter->child_total_time_running);
-       if (counter->attr.read_format & PERF_FORMAT_ID)
-               values[n++] = counter->id;
+       if (read_format & PERF_FORMAT_GROUP)
+               ret = perf_counter_read_group(counter, read_format, buf);
+       else
+               ret = perf_counter_read_one(counter, read_format, buf);
        mutex_unlock(&counter->child_mutex);
  
-       if (count < n * sizeof(u64))
-               return -EINVAL;
-       count = n * sizeof(u64);
-       if (copy_to_user(buf, values, count))
-               return -EFAULT;
-       return count;
+       return ret;
  }
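
As a concrete check of perf_counter_read_size() above: with read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING | PERF_FORMAT_ID and no PERF_FORMAT_GROUP, entry = 8 + 8 and size = 8 + 8, so a single counter needs 16 + 1 * 16 = 32 bytes; add PERF_FORMAT_GROUP with a leader that has two siblings and nr becomes 3, giving 8 + 8 + 8 + 3 * 16 = 72 bytes. The grouped buffer that perf_counter_read_group() then emits is laid out as sketched below (illustration only, not a kernel structure).

/*
 *	u64 nr;					-- 3
 *	u64 time_enabled;
 *	u64 time_running;
 *	struct { u64 value; u64 id; } cntr[3];	-- leader first, then siblings
 */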
  
  static ssize_t
@@@ -1811,8 -1940,6 +1940,6 @@@ static int perf_counter_period(struct p
  
                counter->attr.sample_freq = value;
        } else {
-               perf_log_period(counter, value);
                counter->attr.sample_period = value;
                counter->hw.sample_period = value;
        }
@@@ -2020,7 -2147,7 +2147,7 @@@ fail
  
  static void perf_mmap_free_page(unsigned long addr)
  {
-       struct page *page = virt_to_page(addr);
+       struct page *page = virt_to_page((void *)addr);
  
        page->mapping = NULL;
        __free_page(page);
@@@ -2220,7 -2347,7 +2347,7 @@@ static void perf_pending_counter(struc
  
        if (counter->pending_disable) {
                counter->pending_disable = 0;
-               perf_counter_disable(counter);
+               __perf_counter_disable(counter);
        }
  
        if (counter->pending_wakeup) {
@@@ -2605,7 -2732,80 +2732,80 @@@ static u32 perf_counter_tid(struct perf
        return task_pid_nr_ns(p, counter->ns);
  }
  
- static void perf_counter_output(struct perf_counter *counter, int nmi,
+ static void perf_output_read_one(struct perf_output_handle *handle,
+                                struct perf_counter *counter)
+ {
+       u64 read_format = counter->attr.read_format;
+       u64 values[4];
+       int n = 0;
+       values[n++] = atomic64_read(&counter->count);
+       if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+               values[n++] = counter->total_time_enabled +
+                       atomic64_read(&counter->child_total_time_enabled);
+       }
+       if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+               values[n++] = counter->total_time_running +
+                       atomic64_read(&counter->child_total_time_running);
+       }
+       if (read_format & PERF_FORMAT_ID)
+               values[n++] = primary_counter_id(counter);
+       perf_output_copy(handle, values, n * sizeof(u64));
+ }
+ /*
+  * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult.
+  */
+ static void perf_output_read_group(struct perf_output_handle *handle,
+                           struct perf_counter *counter)
+ {
+       struct perf_counter *leader = counter->group_leader, *sub;
+       u64 read_format = counter->attr.read_format;
+       u64 values[5];
+       int n = 0;
+       values[n++] = 1 + leader->nr_siblings;
+       if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+               values[n++] = leader->total_time_enabled;
+       if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+               values[n++] = leader->total_time_running;
+       if (leader != counter)
+               leader->pmu->read(leader);
+       values[n++] = atomic64_read(&leader->count);
+       if (read_format & PERF_FORMAT_ID)
+               values[n++] = primary_counter_id(leader);
+       perf_output_copy(handle, values, n * sizeof(u64));
+       list_for_each_entry(sub, &leader->sibling_list, list_entry) {
+               n = 0;
+               if (sub != counter)
+                       sub->pmu->read(sub);
+               values[n++] = atomic64_read(&sub->count);
+               if (read_format & PERF_FORMAT_ID)
+                       values[n++] = primary_counter_id(sub);
+               perf_output_copy(handle, values, n * sizeof(u64));
+       }
+ }
+ static void perf_output_read(struct perf_output_handle *handle,
+                            struct perf_counter *counter)
+ {
+       if (counter->attr.read_format & PERF_FORMAT_GROUP)
+               perf_output_read_group(handle, counter);
+       else
+               perf_output_read_one(handle, counter);
+ }
+ void perf_counter_output(struct perf_counter *counter, int nmi,
                                struct perf_sample_data *data)
  {
        int ret;
        struct {
                u32 pid, tid;
        } tid_entry;
-       struct {
-               u64 id;
-               u64 counter;
-       } group_entry;
        struct perf_callchain_entry *callchain = NULL;
        int callchain_size = 0;
        u64 time;
        if (sample_type & PERF_SAMPLE_ID)
                header.size += sizeof(u64);
  
+       if (sample_type & PERF_SAMPLE_STREAM_ID)
+               header.size += sizeof(u64);
        if (sample_type & PERF_SAMPLE_CPU) {
                header.size += sizeof(cpu_entry);
  
                cpu_entry.cpu = raw_smp_processor_id();
+               cpu_entry.reserved = 0;
        }
  
        if (sample_type & PERF_SAMPLE_PERIOD)
                header.size += sizeof(u64);
  
-       if (sample_type & PERF_SAMPLE_GROUP) {
-               header.size += sizeof(u64) +
-                       counter->nr_siblings * sizeof(group_entry);
-       }
+       if (sample_type & PERF_SAMPLE_READ)
+               header.size += perf_counter_read_size(counter);
  
        if (sample_type & PERF_SAMPLE_CALLCHAIN) {
                callchain = perf_callchain(data->regs);
                        header.size += sizeof(u64);
        }
  
+       if (sample_type & PERF_SAMPLE_RAW) {
+               int size = sizeof(u32);
+               if (data->raw)
+                       size += data->raw->size;
+               else
+                       size += sizeof(u32);
+               WARN_ON_ONCE(size & (sizeof(u64)-1));
+               header.size += size;
+       }
        ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
        if (ret)
                return;
        if (sample_type & PERF_SAMPLE_ADDR)
                perf_output_put(&handle, data->addr);
  
-       if (sample_type & PERF_SAMPLE_ID)
+       if (sample_type & PERF_SAMPLE_ID) {
+               u64 id = primary_counter_id(counter);
+               perf_output_put(&handle, id);
+       }
+       if (sample_type & PERF_SAMPLE_STREAM_ID)
                perf_output_put(&handle, counter->id);
  
        if (sample_type & PERF_SAMPLE_CPU)
        if (sample_type & PERF_SAMPLE_PERIOD)
                perf_output_put(&handle, data->period);
  
-       /*
-        * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult.
-        */
-       if (sample_type & PERF_SAMPLE_GROUP) {
-               struct perf_counter *leader, *sub;
-               u64 nr = counter->nr_siblings;
-               perf_output_put(&handle, nr);
-               leader = counter->group_leader;
-               list_for_each_entry(sub, &leader->sibling_list, list_entry) {
-                       if (sub != counter)
-                               sub->pmu->read(sub);
-                       group_entry.id = sub->id;
-                       group_entry.counter = atomic64_read(&sub->count);
-                       perf_output_put(&handle, group_entry);
-               }
-       }
+       if (sample_type & PERF_SAMPLE_READ)
+               perf_output_read(&handle, counter);
  
        if (sample_type & PERF_SAMPLE_CALLCHAIN) {
                if (callchain)
                }
        }
  
+       if (sample_type & PERF_SAMPLE_RAW) {
+               if (data->raw) {
+                       perf_output_put(&handle, data->raw->size);
+                       perf_output_copy(&handle, data->raw->data, data->raw->size);
+               } else {
+                       struct {
+                               u32     size;
+                               u32     data;
+                       } raw = {
+                               .size = sizeof(u32),
+                               .data = 0,
+                       };
+                       perf_output_put(&handle, raw);
+               }
+       }
        perf_output_end(&handle);
  }
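
One user-visible detail of the sample output path above: with inherited counters, PERF_SAMPLE_ID is now resolved through primary_counter_id(), so a child reports its parent's id, while the new PERF_SAMPLE_STREAM_ID keeps reporting the individual counter->id. Restated side by side (this is just the two branches from the hunk above, not new code):

	if (sample_type & PERF_SAMPLE_ID)		/* stable across inheritance */
		perf_output_put(&handle, primary_counter_id(counter));

	if (sample_type & PERF_SAMPLE_STREAM_ID)	/* this particular instance */
		perf_output_put(&handle, counter->id);

which presumably lets tools aggregate by id as one logical counter while still telling the underlying streams apart.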
  
@@@ -2754,8 -2968,6 +2968,6 @@@ struct perf_read_event 
  
        u32                             pid;
        u32                             tid;
-       u64                             value;
-       u64                             format[3];
  };
  
  static void
@@@ -2767,87 -2979,74 +2979,74 @@@ perf_counter_read_event(struct perf_cou
                .header = {
                        .type = PERF_EVENT_READ,
                        .misc = 0,
-                       .size = sizeof(event) - sizeof(event.format),
+                       .size = sizeof(event) + perf_counter_read_size(counter),
                },
                .pid = perf_counter_pid(counter, task),
                .tid = perf_counter_tid(counter, task),
-               .value = atomic64_read(&counter->count),
        };
-       int ret, i = 0;
-       if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
-               event.header.size += sizeof(u64);
-               event.format[i++] = counter->total_time_enabled;
-       }
-       if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
-               event.header.size += sizeof(u64);
-               event.format[i++] = counter->total_time_running;
-       }
-       if (counter->attr.read_format & PERF_FORMAT_ID) {
-               u64 id;
-               event.header.size += sizeof(u64);
-               if (counter->parent)
-                       id = counter->parent->id;
-               else
-                       id = counter->id;
-               event.format[i++] = id;
-       }
+       int ret;
  
        ret = perf_output_begin(&handle, counter, event.header.size, 0, 0);
        if (ret)
                return;
  
-       perf_output_copy(&handle, &event, event.header.size);
+       perf_output_put(&handle, event);
+       perf_output_read(&handle, counter);
        perf_output_end(&handle);
  }
  
  /*
-  * fork tracking
+  * task tracking -- fork/exit
+  *
+  * enabled by: attr.comm | attr.mmap | attr.task
   */
  
- struct perf_fork_event {
-       struct task_struct      *task;
+ struct perf_task_event {
+       struct task_struct              *task;
+       struct perf_counter_context     *task_ctx;
  
        struct {
                struct perf_event_header        header;
  
                u32                             pid;
                u32                             ppid;
+               u32                             tid;
+               u32                             ptid;
        } event;
  };
  
- static void perf_counter_fork_output(struct perf_counter *counter,
-                                    struct perf_fork_event *fork_event)
+ static void perf_counter_task_output(struct perf_counter *counter,
+                                    struct perf_task_event *task_event)
  {
        struct perf_output_handle handle;
-       int size = fork_event->event.header.size;
-       struct task_struct *task = fork_event->task;
+       int size = task_event->event.header.size;
+       struct task_struct *task = task_event->task;
        int ret = perf_output_begin(&handle, counter, size, 0, 0);
  
        if (ret)
                return;
  
-       fork_event->event.pid = perf_counter_pid(counter, task);
-       fork_event->event.ppid = perf_counter_pid(counter, task->real_parent);
+       task_event->event.pid = perf_counter_pid(counter, task);
+       task_event->event.ppid = perf_counter_pid(counter, current);
+       task_event->event.tid = perf_counter_tid(counter, task);
+       task_event->event.ptid = perf_counter_tid(counter, current);
  
-       perf_output_put(&handle, fork_event->event);
+       perf_output_put(&handle, task_event->event);
        perf_output_end(&handle);
  }
  
- static int perf_counter_fork_match(struct perf_counter *counter)
+ static int perf_counter_task_match(struct perf_counter *counter)
  {
-       if (counter->attr.comm || counter->attr.mmap)
+       if (counter->attr.comm || counter->attr.mmap || counter->attr.task)
                return 1;
  
        return 0;
  }
  
- static void perf_counter_fork_ctx(struct perf_counter_context *ctx,
-                                 struct perf_fork_event *fork_event)
+ static void perf_counter_task_ctx(struct perf_counter_context *ctx,
+                                 struct perf_task_event *task_event)
  {
        struct perf_counter *counter;
  
  
        rcu_read_lock();
        list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
-               if (perf_counter_fork_match(counter))
-                       perf_counter_fork_output(counter, fork_event);
+               if (perf_counter_task_match(counter))
+                       perf_counter_task_output(counter, task_event);
        }
        rcu_read_unlock();
  }
  
- static void perf_counter_fork_event(struct perf_fork_event *fork_event)
+ static void perf_counter_task_event(struct perf_task_event *task_event)
  {
        struct perf_cpu_context *cpuctx;
-       struct perf_counter_context *ctx;
+       struct perf_counter_context *ctx = task_event->task_ctx;
  
        cpuctx = &get_cpu_var(perf_cpu_context);
-       perf_counter_fork_ctx(&cpuctx->ctx, fork_event);
+       perf_counter_task_ctx(&cpuctx->ctx, task_event);
        put_cpu_var(perf_cpu_context);
  
        rcu_read_lock();
-       /*
-        * doesn't really matter which of the child contexts the
-        * events ends up in.
-        */
-       ctx = rcu_dereference(current->perf_counter_ctxp);
+       if (!ctx)
+               ctx = rcu_dereference(task_event->task->perf_counter_ctxp);
        if (ctx)
-               perf_counter_fork_ctx(ctx, fork_event);
+               perf_counter_task_ctx(ctx, task_event);
        rcu_read_unlock();
  }
  
- void perf_counter_fork(struct task_struct *task)
+ static void perf_counter_task(struct task_struct *task,
+                             struct perf_counter_context *task_ctx,
+                             int new)
  {
-       struct perf_fork_event fork_event;
+       struct perf_task_event task_event;
  
        if (!atomic_read(&nr_comm_counters) &&
-           !atomic_read(&nr_mmap_counters))
+           !atomic_read(&nr_mmap_counters) &&
+           !atomic_read(&nr_task_counters))
                return;
  
-       fork_event = (struct perf_fork_event){
-               .task   = task,
-               .event  = {
+       task_event = (struct perf_task_event){
+               .task     = task,
+               .task_ctx = task_ctx,
+               .event    = {
                        .header = {
-                               .type = PERF_EVENT_FORK,
-                               .size = sizeof(fork_event.event),
+                               .type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT,
+                               .misc = 0,
+                               .size = sizeof(task_event.event),
                        },
+                       /* .pid  */
+                       /* .ppid */
+                       /* .tid  */
+                       /* .ptid */
                },
        };
  
-       perf_counter_fork_event(&fork_event);
+       perf_counter_task_event(&task_event);
+ }
+ void perf_counter_fork(struct task_struct *task)
+ {
+       perf_counter_task(task, NULL, 1);
  }
  
  /*
@@@ -2968,8 -3178,10 +3178,10 @@@ static void perf_counter_comm_event(str
        struct perf_cpu_context *cpuctx;
        struct perf_counter_context *ctx;
        unsigned int size;
-       char *comm = comm_event->task->comm;
+       char comm[TASK_COMM_LEN];
  
+       memset(comm, 0, sizeof(comm));
+       strncpy(comm, comm_event->task->comm, sizeof(comm));
        size = ALIGN(strlen(comm)+1, sizeof(u64));
  
        comm_event->comm = comm;
@@@ -3004,8 -3216,16 +3216,16 @@@ void perf_counter_comm(struct task_stru
  
        comm_event = (struct perf_comm_event){
                .task   = task,
+               /* .comm      */
+               /* .comm_size */
                .event  = {
-                       .header = { .type = PERF_EVENT_COMM, },
+                       .header = {
+                               .type = PERF_EVENT_COMM,
+                               .misc = 0,
+                               /* .size */
+                       },
+                       /* .pid */
+                       /* .tid */
                },
        };
  
@@@ -3088,8 -3308,15 +3308,15 @@@ static void perf_counter_mmap_event(str
        char *buf = NULL;
        const char *name;
  
+       memset(tmp, 0, sizeof(tmp));
        if (file) {
-               buf = kzalloc(PATH_MAX, GFP_KERNEL);
+               /*
+                * d_path works from the end of the buffer backwards, so we
+                * need to add enough zero bytes after the string to handle
+                * the 64bit alignment we do later.
+                */
+               buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
                if (!buf) {
                        name = strncpy(tmp, "//enomem", sizeof(tmp));
                        goto got_name;
                        goto got_name;
                }
        } else {
-               name = arch_vma_name(mmap_event->vma);
-               if (name)
+               if (arch_vma_name(mmap_event->vma)) {
+                       name = strncpy(tmp, arch_vma_name(mmap_event->vma),
+                                      sizeof(tmp));
                        goto got_name;
+               }
  
                if (!vma->vm_mm) {
                        name = strncpy(tmp, "[vdso]", sizeof(tmp));
@@@ -3147,8 -3376,16 +3376,16 @@@ void __perf_counter_mmap(struct vm_area
  
        mmap_event = (struct perf_mmap_event){
                .vma    = vma,
+               /* .file_name */
+               /* .file_size */
                .event  = {
-                       .header = { .type = PERF_EVENT_MMAP, },
+                       .header = {
+                               .type = PERF_EVENT_MMAP,
+                               .misc = 0,
+                               /* .size */
+                       },
+                       /* .pid */
+                       /* .tid */
                        .start  = vma->vm_start,
                        .len    = vma->vm_end - vma->vm_start,
                        .pgoff  = vma->vm_pgoff,
  }
  
  /*
-  * Log sample_period changes so that analyzing tools can re-normalize the
-  * event flow.
-  */
- struct freq_event {
-       struct perf_event_header        header;
-       u64                             time;
-       u64                             id;
-       u64                             period;
- };
- static void perf_log_period(struct perf_counter *counter, u64 period)
- {
-       struct perf_output_handle handle;
-       struct freq_event event;
-       int ret;
-       if (counter->hw.sample_period == period)
-               return;
-       if (counter->attr.sample_type & PERF_SAMPLE_PERIOD)
-               return;
-       event = (struct freq_event) {
-               .header = {
-                       .type = PERF_EVENT_PERIOD,
-                       .misc = 0,
-                       .size = sizeof(event),
-               },
-               .time = sched_clock(),
-               .id = counter->id,
-               .period = period,
-       };
-       ret = perf_output_begin(&handle, counter, sizeof(event), 1, 0);
-       if (ret)
-               return;
-       perf_output_put(&handle, event);
-       perf_output_end(&handle);
- }
- /*
   * IRQ throttle logging
   */
  
@@@ -3214,16 -3408,21 +3408,21 @@@ static void perf_log_throttle(struct pe
                struct perf_event_header        header;
                u64                             time;
                u64                             id;
+               u64                             stream_id;
        } throttle_event = {
                .header = {
-                       .type = PERF_EVENT_THROTTLE + 1,
+                       .type = PERF_EVENT_THROTTLE,
                        .misc = 0,
                        .size = sizeof(throttle_event),
                },
-               .time   = sched_clock(),
-               .id     = counter->id,
+               .time           = sched_clock(),
+               .id             = primary_counter_id(counter),
+               .stream_id      = counter->id,
        };
  
+       if (enable)
+               throttle_event.header.type = PERF_EVENT_UNTHROTTLE;
        ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0);
        if (ret)
                return;
@@@ -3300,125 -3499,111 +3499,111 @@@ int perf_counter_overflow(struct perf_c
   * Generic software counter infrastructure
   */
  
- static void perf_swcounter_update(struct perf_counter *counter)
+ /*
+  * We directly increment counter->count and keep a second value in
+  * counter->hw.period_left to count intervals. This period counter
+  * is kept in the range [-sample_period, 0] so that we can use the
+  * sign as trigger.
+  */
+ static u64 perf_swcounter_set_period(struct perf_counter *counter)
  {
        struct hw_perf_counter *hwc = &counter->hw;
-       u64 prev, now;
-       s64 delta;
+       u64 period = hwc->last_period;
+       u64 nr, offset;
+       s64 old, val;
+       hwc->last_period = hwc->sample_period;
  
  again:
-       prev = atomic64_read(&hwc->prev_count);
-       now = atomic64_read(&hwc->count);
-       if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev)
-               goto again;
+       old = val = atomic64_read(&hwc->period_left);
+       if (val < 0)
+               return 0;
  
-       delta = now - prev;
+       nr = div64_u64(period + val, period);
+       offset = nr * period;
+       val -= offset;
+       if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
+               goto again;
  
-       atomic64_add(delta, &counter->count);
-       atomic64_sub(delta, &hwc->period_left);
+       return nr;
  }
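
To make the arithmetic above concrete: period_left is kept in [-sample_period, 0], perf_swcounter_add() below adds each event count to it, and the overflow path runs once it turns non-negative, after which perf_swcounter_set_period() reports one overflow per whole period consumed and pulls period_left back below zero. A worked example with invented numbers:

/*
 *	sample_period = 100, period_left = -70
 *	an event adds nr = 100  ->  period_left = +30 (non-negative: overflow path)
 *	perf_swcounter_set_period():
 *		nr     = (100 + 30) / 100 = 1	-> one overflow reported
 *		offset = 1 * 100
 *		period_left = 30 - 100 = -70	-> back in [-sample_period, 0]
 *	had period_left reached +230 instead, nr would be 3 and the
 *	remainder would still be -70.
 */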
  
- static void perf_swcounter_set_period(struct perf_counter *counter)
+ static void perf_swcounter_overflow(struct perf_counter *counter,
+                                   int nmi, struct perf_sample_data *data)
  {
        struct hw_perf_counter *hwc = &counter->hw;
-       s64 left = atomic64_read(&hwc->period_left);
-       s64 period = hwc->sample_period;
+       u64 overflow;
  
-       if (unlikely(left <= -period)) {
-               left = period;
-               atomic64_set(&hwc->period_left, left);
-               hwc->last_period = period;
-       }
+       data->period = counter->hw.last_period;
+       overflow = perf_swcounter_set_period(counter);
  
-       if (unlikely(left <= 0)) {
-               left += period;
-               atomic64_add(period, &hwc->period_left);
-               hwc->last_period = period;
-       }
+       if (hwc->interrupts == MAX_INTERRUPTS)
+               return;
  
-       atomic64_set(&hwc->prev_count, -left);
-       atomic64_set(&hwc->count, -left);
+       for (; overflow; overflow--) {
+               if (perf_counter_overflow(counter, nmi, data)) {
+                       /*
+                        * We inhibit the overflow from happening when
+                        * hwc->interrupts == MAX_INTERRUPTS.
+                        */
+                       break;
+               }
+       }
  }
  
- static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
+ static void perf_swcounter_unthrottle(struct perf_counter *counter)
  {
-       enum hrtimer_restart ret = HRTIMER_RESTART;
-       struct perf_sample_data data;
-       struct perf_counter *counter;
-       u64 period;
-       counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
-       counter->pmu->read(counter);
-       data.addr = 0;
-       data.regs = get_irq_regs();
        /*
-        * In case we exclude kernel IPs or are somehow not in interrupt
-        * context, provide the next best thing, the user IP.
+        * Nothing to do, we already reset hwc->interrupts.
         */
-       if ((counter->attr.exclude_kernel || !data.regs) &&
-                       !counter->attr.exclude_user)
-               data.regs = task_pt_regs(current);
+ }
  
-       if (data.regs) {
-               if (perf_counter_overflow(counter, 0, &data))
-                       ret = HRTIMER_NORESTART;
-       }
+ static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
+                              int nmi, struct perf_sample_data *data)
+ {
+       struct hw_perf_counter *hwc = &counter->hw;
  
-       period = max_t(u64, 10000, counter->hw.sample_period);
-       hrtimer_forward_now(hrtimer, ns_to_ktime(period));
+       atomic64_add(nr, &counter->count);
  
-       return ret;
- }
+       if (!hwc->sample_period)
+               return;
  
- static void perf_swcounter_overflow(struct perf_counter *counter,
-                                   int nmi, struct perf_sample_data *data)
- {
-       data->period = counter->hw.last_period;
+       if (!data->regs)
+               return;
  
-       perf_swcounter_update(counter);
-       perf_swcounter_set_period(counter);
-       if (perf_counter_overflow(counter, nmi, data))
-               /* soft-disable the counter */
-               ;
+       if (!atomic64_add_negative(nr, &hwc->period_left))
+               perf_swcounter_overflow(counter, nmi, data);
  }
  
  static int perf_swcounter_is_counting(struct perf_counter *counter)
  {
-       struct perf_counter_context *ctx;
-       unsigned long flags;
-       int count;
+       /*
+        * The counter is active, we're good!
+        */
        if (counter->state == PERF_COUNTER_STATE_ACTIVE)
                return 1;
  
+       /*
+        * The counter is off/error, not counting.
+        */
        if (counter->state != PERF_COUNTER_STATE_INACTIVE)
                return 0;
  
        /*
-        * If the counter is inactive, it could be just because
-        * its task is scheduled out, or because it's in a group
-        * which could not go on the PMU.  We want to count in
-        * the first case but not the second.  If the context is
-        * currently active then an inactive software counter must
-        * be the second case.  If it's not currently active then
-        * we need to know whether the counter was active when the
-        * context was last active, which we can determine by
-        * comparing counter->tstamp_stopped with ctx->time.
-        *
-        * We are within an RCU read-side critical section,
-        * which protects the existence of *ctx.
+        * The counter is inactive, if the context is active
+        * we're part of a group that didn't make it on the 'pmu',
+        * not counting.
         */
-       ctx = counter->ctx;
-       spin_lock_irqsave(&ctx->lock, flags);
-       count = 1;
-       /* Re-check state now we have the lock */
-       if (counter->state < PERF_COUNTER_STATE_INACTIVE ||
-           counter->ctx->is_active ||
-           counter->tstamp_stopped < ctx->time)
-               count = 0;
-       spin_unlock_irqrestore(&ctx->lock, flags);
-       return count;
+       if (counter->ctx->is_active)
+               return 0;
+       /*
+        * We're inactive and the context is too; this means the
+        * task is scheduled out and we're counting events that
+        * happen to us, like migration events.
+        */
+       return 1;
  }
  
  static int perf_swcounter_match(struct perf_counter *counter,
        return 1;
  }
  
- static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
-                              int nmi, struct perf_sample_data *data)
- {
-       int neg = atomic64_add_negative(nr, &counter->hw.count);
-       if (counter->hw.sample_period && !neg && data->regs)
-               perf_swcounter_overflow(counter, nmi, data);
- }
  static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
                                     enum perf_type_id type,
                                     u32 event, u64 nr, int nmi,
@@@ -3531,27 -3707,66 +3707,66 @@@ void __perf_swcounter_event(u32 event, 
  
  static void perf_swcounter_read(struct perf_counter *counter)
  {
-       perf_swcounter_update(counter);
  }
  
  static int perf_swcounter_enable(struct perf_counter *counter)
  {
-       perf_swcounter_set_period(counter);
+       struct hw_perf_counter *hwc = &counter->hw;
+       if (hwc->sample_period) {
+               hwc->last_period = hwc->sample_period;
+               perf_swcounter_set_period(counter);
+       }
        return 0;
  }
  
  static void perf_swcounter_disable(struct perf_counter *counter)
  {
-       perf_swcounter_update(counter);
  }
  
  static const struct pmu perf_ops_generic = {
        .enable         = perf_swcounter_enable,
        .disable        = perf_swcounter_disable,
        .read           = perf_swcounter_read,
+       .unthrottle     = perf_swcounter_unthrottle,
  };
  
  /*
+  * hrtimer based swcounter callback
+  */
+ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
+ {
+       enum hrtimer_restart ret = HRTIMER_RESTART;
+       struct perf_sample_data data;
+       struct perf_counter *counter;
+       u64 period;
+       counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
+       counter->pmu->read(counter);
+       data.addr = 0;
+       data.regs = get_irq_regs();
+       /*
+        * In case we exclude kernel IPs or are somehow not in interrupt
+        * context, provide the next best thing, the user IP.
+        */
+       if ((counter->attr.exclude_kernel || !data.regs) &&
+                       !counter->attr.exclude_user)
+               data.regs = task_pt_regs(current);
+       if (data.regs) {
+               if (perf_counter_overflow(counter, 0, &data))
+                       ret = HRTIMER_NORESTART;
+       }
+       period = max_t(u64, 10000, counter->hw.sample_period);
+       hrtimer_forward_now(hrtimer, ns_to_ktime(period));
+       return ret;
+ }
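
/*
 * Illustration only, not part of the patch: a software counter using the
 * callback above would arm the hrtimer when it is enabled, roughly as
 * sketched here.  perf_swcounter_start_hrtimer() is a hypothetical name;
 * hrtimer_init()/hrtimer_start() are the stock hrtimer APIs.
 */
static void perf_swcounter_start_hrtimer(struct perf_counter *counter)
{
	struct hw_perf_counter *hwc = &counter->hw;

	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hwc->hrtimer.function = perf_swcounter_hrtimer;
	if (hwc->sample_period) {
		/* same 10us floor that the callback applies when re-arming */
		u64 period = max_t(u64, 10000, hwc->sample_period);

		hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
			      HRTIMER_MODE_REL);
	}
}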
+ /*
   * Software counter: cpu wall time clock
   */
  
@@@ -3668,17 -3883,24 +3883,24 @@@ static const struct pmu perf_ops_task_c
  };
  
  #ifdef CONFIG_EVENT_PROFILE
- void perf_tpcounter_event(int event_id)
+ void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record,
+                         int entry_size)
  {
+       struct perf_raw_record raw = {
+               .size = entry_size,
+               .data = record,
+       };
        struct perf_sample_data data = {
-               .regs = get_irq_regs();
-               .addr = 0,
+               .regs = get_irq_regs(),
+               .addr = addr,
+               .raw = &raw,
        };
  
        if (!data.regs)
                data.regs = task_pt_regs(current);
  
-       do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, &data);
+       do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data);
  }
  EXPORT_SYMBOL_GPL(perf_tpcounter_event);
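
/*
 * Illustration only, not part of the patch: a tracepoint profiling hook
 * would feed its raw entry to perf_tpcounter_event() via the new
 * signature above.  example_profile_probe() is a hypothetical name.
 */
static void example_profile_probe(int event_id, void *entry, int entry_size)
{
	/* no address, a single occurrence, raw record passed through */
	perf_tpcounter_event(event_id, 0, 1, entry, entry_size);
}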
  
@@@ -3687,16 -3909,20 +3909,20 @@@ extern void ftrace_profile_disable(int)
  
  static void tp_perf_counter_destroy(struct perf_counter *counter)
  {
-       ftrace_profile_disable(perf_event_id(&counter->attr));
+       ftrace_profile_disable(counter->attr.config);
  }
  
  static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
  {
-       int event_id = perf_event_id(&counter->attr);
-       int ret;
+       /*
+        * Raw tracepoint data is a severe data leak; only allow root to
+        * have these.
+        */
+       if ((counter->attr.sample_type & PERF_SAMPLE_RAW) &&
+                       !capable(CAP_SYS_ADMIN))
+               return ERR_PTR(-EPERM);
  
-       ret = ftrace_profile_enable(event_id);
-       if (ret)
+       if (ftrace_profile_enable(counter->attr.config))
                return NULL;
  
        counter->destroy = tp_perf_counter_destroy;
@@@ -3829,9 -4055,9 +4055,9 @@@ perf_counter_alloc(struct perf_counter_
        atomic64_set(&hwc->period_left, hwc->sample_period);
  
        /*
-        * we currently do not support PERF_SAMPLE_GROUP on inherited counters
+        * we currently do not support PERF_FORMAT_GROUP on inherited counters
         */
-       if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP))
+       if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
                goto done;
  
        switch (attr->type) {
@@@ -3874,6 -4100,8 +4100,8 @@@ done
                        atomic_inc(&nr_mmap_counters);
                if (counter->attr.comm)
                        atomic_inc(&nr_comm_counters);
+               if (counter->attr.task)
+                       atomic_inc(&nr_task_counters);
        }
  
        return counter;
@@@ -4235,8 -4463,10 +4463,10 @@@ void perf_counter_exit_task(struct task
        struct perf_counter_context *child_ctx;
        unsigned long flags;
  
-       if (likely(!child->perf_counter_ctxp))
+       if (likely(!child->perf_counter_ctxp)) {
+               perf_counter_task(child, NULL, 0);
                return;
+       }
  
        local_irq_save(flags);
        /*
         */
        spin_lock(&child_ctx->lock);
        child->perf_counter_ctxp = NULL;
-       if (child_ctx->parent_ctx) {
-               /*
-                * This context is a clone; unclone it so it can't get
-                * swapped to another process while we're removing all
-                * the counters from it.
-                */
-               put_ctx(child_ctx->parent_ctx);
-               child_ctx->parent_ctx = NULL;
-       }
-       spin_unlock(&child_ctx->lock);
-       local_irq_restore(flags);
+       /*
+        * If this context is a clone, unclone it so it can't get
+        * swapped to another process while we're removing all
+        * the counters from it.
+        */
+       unclone_ctx(child_ctx);
+       spin_unlock_irqrestore(&child_ctx->lock, flags);
+       /*
+        * Report the task dead after unscheduling the counters so that we
+        * won't get any samples after PERF_EVENT_EXIT. We can, however, still
+        * get a few PERF_EVENT_READ events.
+        */
+       perf_counter_task(child, child_ctx, 0);
  
        /*
         * We can recurse on the same lock type through:
@@@ -4486,6 -4719,11 +4719,11 @@@ perf_cpu_notify(struct notifier_block *
                perf_counter_init_cpu(cpu);
                break;
  
+       case CPU_ONLINE:
+       case CPU_ONLINE_FROZEN:
+               hw_perf_counter_setup_online(cpu);
+               break;
        case CPU_DOWN_PREPARE:
        case CPU_DOWN_PREPARE_FROZEN:
                perf_counter_exit_cpu(cpu);
@@@ -4510,6 -4748,8 +4748,8 @@@ void __init perf_counter_init(void
  {
        perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
                        (void *)(long)smp_processor_id());
+       perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
+                       (void *)(long)smp_processor_id());
        register_cpu_notifier(&perf_cpu_nb);
  }
  
diff --combined kernel/sched.c
@@@ -318,12 -318,12 +318,12 @@@ struct task_group root_task_group
  /* Default task group's sched entity on each cpu */
  static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
  /* Default task group's cfs_rq on each cpu */
 -static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
 +static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_cfs_rq);
  #endif /* CONFIG_FAIR_GROUP_SCHED */
  
  #ifdef CONFIG_RT_GROUP_SCHED
  static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
 -static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
 +static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq);
  #endif /* CONFIG_RT_GROUP_SCHED */
  #else /* !CONFIG_USER_SCHED */
  #define root_task_group init_task_group
@@@ -493,6 -493,7 +493,7 @@@ struct rt_rq 
  #endif
  #ifdef CONFIG_SMP
        unsigned long rt_nr_migratory;
+       unsigned long rt_nr_total;
        int overloaded;
        struct plist_head pushable_tasks;
  #endif
@@@ -2571,15 -2572,37 +2572,37 @@@ static void __sched_fork(struct task_st
        p->se.avg_wakeup                = sysctl_sched_wakeup_granularity;
  
  #ifdef CONFIG_SCHEDSTATS
-       p->se.wait_start                = 0;
-       p->se.sum_sleep_runtime         = 0;
-       p->se.sleep_start               = 0;
-       p->se.block_start               = 0;
-       p->se.sleep_max                 = 0;
-       p->se.block_max                 = 0;
-       p->se.exec_max                  = 0;
-       p->se.slice_max                 = 0;
-       p->se.wait_max                  = 0;
+       p->se.wait_start                        = 0;
+       p->se.wait_max                          = 0;
+       p->se.wait_count                        = 0;
+       p->se.wait_sum                          = 0;
+       p->se.sleep_start                       = 0;
+       p->se.sleep_max                         = 0;
+       p->se.sum_sleep_runtime                 = 0;
+       p->se.block_start                       = 0;
+       p->se.block_max                         = 0;
+       p->se.exec_max                          = 0;
+       p->se.slice_max                         = 0;
+       p->se.nr_migrations_cold                = 0;
+       p->se.nr_failed_migrations_affine       = 0;
+       p->se.nr_failed_migrations_running      = 0;
+       p->se.nr_failed_migrations_hot          = 0;
+       p->se.nr_forced_migrations              = 0;
+       p->se.nr_forced2_migrations             = 0;
+       p->se.nr_wakeups                        = 0;
+       p->se.nr_wakeups_sync                   = 0;
+       p->se.nr_wakeups_migrate                = 0;
+       p->se.nr_wakeups_local                  = 0;
+       p->se.nr_wakeups_remote                 = 0;
+       p->se.nr_wakeups_affine                 = 0;
+       p->se.nr_wakeups_affine_attempts        = 0;
+       p->se.nr_wakeups_passive                = 0;
+       p->se.nr_wakeups_idle                   = 0;
  #endif
  
        INIT_LIST_HEAD(&p->rt.run_list);
@@@ -6541,6 -6564,11 +6564,11 @@@ SYSCALL_DEFINE0(sched_yield
        return 0;
  }
  
+ static inline int should_resched(void)
+ {
+       return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
+ }
  static void __cond_resched(void)
  {
  #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
  
  int __sched _cond_resched(void)
  {
-       if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
-                                       system_state == SYSTEM_RUNNING) {
+       if (should_resched()) {
                __cond_resched();
                return 1;
        }
@@@ -6579,12 -6606,12 +6606,12 @@@ EXPORT_SYMBOL(_cond_resched)
   */
  int cond_resched_lock(spinlock_t *lock)
  {
-       int resched = need_resched() && system_state == SYSTEM_RUNNING;
+       int resched = should_resched();
        int ret = 0;
  
        if (spin_needbreak(lock) || resched) {
                spin_unlock(lock);
-               if (resched && need_resched())
+               if (resched)
                        __cond_resched();
                else
                        cpu_relax();
@@@ -6599,7 -6626,7 +6626,7 @@@ int __sched cond_resched_softirq(void
  {
        BUG_ON(!in_softirq());
  
-       if (need_resched() && system_state == SYSTEM_RUNNING) {
+       if (should_resched()) {
                local_bh_enable();
                __cond_resched();
                local_bh_disable();
@@@ -7262,6 -7289,7 +7289,7 @@@ static void migrate_dead_tasks(unsigne
  static void calc_global_load_remove(struct rq *rq)
  {
        atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+       rq->calc_load_active = 0;
  }
  #endif /* CONFIG_HOTPLUG_CPU */
  
@@@ -7488,6 -7516,7 +7516,7 @@@ migration_call(struct notifier_block *n
                task_rq_unlock(rq, &flags);
                get_task_struct(p);
                cpu_rq(cpu)->migration_thread = p;
+               rq->calc_load_update = calc_load_update;
                break;
  
        case CPU_ONLINE:
                /* Update our root-domain */
                rq = cpu_rq(cpu);
                spin_lock_irqsave(&rq->lock, flags);
-               rq->calc_load_update = calc_load_update;
-               rq->calc_load_active = 0;
                if (rq->rd) {
                        BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
  
@@@ -9070,7 -9097,7 +9097,7 @@@ static void init_rt_rq(struct rt_rq *rt
  #ifdef CONFIG_SMP
        rt_rq->rt_nr_migratory = 0;
        rt_rq->overloaded = 0;
-       plist_head_init(&rq->rt.pushable_tasks, &rq->lock);
+       plist_head_init(&rt_rq->pushable_tasks, &rq->lock);
  #endif
  
        rt_rq->rt_time = 0;
@@@ -376,7 -376,7 +376,7 @@@ ftrace_event_seq_open(struct inode *ino
        const struct seq_operations *seq_ops;
  
        if ((file->f_mode & FMODE_WRITE) &&
-           !(file->f_flags & O_APPEND))
+           (file->f_flags & O_TRUNC))
                ftrace_clear_events();
  
        seq_ops = inode->i_private;
@@@ -940,7 -940,7 +940,7 @@@ event_create_dir(struct ftrace_event_ca
                entry = trace_create_file("enable", 0644, call->dir, call,
                                          enable);
  
-       if (call->id)
+       if (call->id && call->profile_enable)
                entry = trace_create_file("id", 0444, call->dir, call,
                                          id);
  
@@@ -1334,7 -1334,7 +1334,7 @@@ static __init void event_trace_self_tes
  
  #ifdef CONFIG_FUNCTION_TRACER
  
 -static DEFINE_PER_CPU(atomic_t, test_event_disable);
 +static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable);
  
  static void
  function_test_events_call(unsigned long ip, unsigned long parent_ip)
        pc = preempt_count();
        resched = ftrace_preempt_disable();
        cpu = raw_smp_processor_id();
 -      disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu));
 +      disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu));
  
        if (disabled != 1)
                goto out;
        trace_nowake_buffer_unlock_commit(event, flags, pc);
  
   out:
 -      atomic_dec(&per_cpu(test_event_disable, cpu));
 +      atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
        ftrace_preempt_enable(resched);
  }
  
diff --combined mm/page-writeback.c
@@@ -575,7 -575,7 +575,7 @@@ static void balance_dirty_pages(struct 
                if (pages_written >= write_chunk)
                        break;          /* We've done our duty */
  
-               congestion_wait(WRITE, HZ/10);
+               congestion_wait(BLK_RW_ASYNC, HZ/10);
        }
  
        if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
@@@ -610,8 -610,6 +610,8 @@@ void set_page_dirty_balance(struct pag
        }
  }
  
 +static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0;
 +
  /**
   * balance_dirty_pages_ratelimited_nr - balance dirty memory state
   * @mapping: address_space which was dirtied
  void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
                                        unsigned long nr_pages_dirtied)
  {
 -      static DEFINE_PER_CPU(unsigned long, ratelimits) = 0;
        unsigned long ratelimit;
        unsigned long *p;
  
         * tasks in balance_dirty_pages(). Period.
         */
        preempt_disable();
 -      p =  &__get_cpu_var(ratelimits);
 +      p =  &__get_cpu_var(bdp_ratelimits);
        *p += nr_pages_dirtied;
        if (unlikely(*p >= ratelimit)) {
                *p = 0;
@@@ -670,7 -669,7 +670,7 @@@ void throttle_vm_writeout(gfp_t gfp_mas
                  if (global_page_state(NR_UNSTABLE_NFS) +
                        global_page_state(NR_WRITEBACK) <= dirty_thresh)
                                break;
-                 congestion_wait(WRITE, HZ/10);
+                 congestion_wait(BLK_RW_ASYNC, HZ/10);
  
                /*
                 * The caller might hold locks which can prevent IO completion
@@@ -716,7 -715,7 +716,7 @@@ static void background_writeout(unsigne
                if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
                        /* Wrote less than expected */
                        if (wbc.encountered_congestion || wbc.more_io)
-                               congestion_wait(WRITE, HZ/10);
+                               congestion_wait(BLK_RW_ASYNC, HZ/10);
                        else
                                break;
                }
@@@ -788,7 -787,7 +788,7 @@@ static void wb_kupdate(unsigned long ar
                writeback_inodes(&wbc);
                if (wbc.nr_to_write > 0) {
                        if (wbc.encountered_congestion || wbc.more_io)
-                               congestion_wait(WRITE, HZ/10);
+                               congestion_wait(BLK_RW_ASYNC, HZ/10);
                        else
                                break;  /* All the old data is written */
                }
diff --combined mm/percpu.c
@@@ -8,13 -8,12 +8,13 @@@
   *
   * This is the percpu allocator which can handle both static and dynamic
   * areas.  Percpu areas are allocated in chunks in vmalloc area.  Each
 - * chunk is consisted of nr_cpu_ids units and the first chunk is used
 - * for static percpu variables in the kernel image (special boot time
 - * alloc/init handling necessary as these areas need to be brought up
 - * before allocation services are running).  Unit grows as necessary
 - * and all units grow or shrink in unison.  When a chunk is filled up,
 - * another chunk is allocated.  ie. in vmalloc area
 + * chunk consists of a boot-time determined number of units and the
 + * first chunk is used for static percpu variables in the kernel image
 + * (special boot time alloc/init handling necessary as these areas
 + * need to be brought up before allocation services are running).
 + * Unit grows as necessary and all units grow or shrink in unison.
 + * When a chunk is filled up, another chunk is allocated, again in
 + * the vmalloc area.
   *
   *  c0                           c1                         c2
   *  -------------------          -------------------        ------------
   *
   * Allocation is done in offset-size areas of single unit space.  Ie,
   * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
 - * c1:u1, c1:u2 and c1:u3.  Percpu access can be done by configuring
 - * percpu base registers pcpu_unit_size apart.
 + * c1:u1, c1:u2 and c1:u3.  On UMA, units correspond directly to
 + * cpus.  On NUMA, the mapping can be non-linear and even sparse.
 + * Percpu access can be done by configuring percpu base registers
 + * according to cpu to unit mapping and pcpu_unit_size.
   *
 - * There are usually many small percpu allocations many of them as
 - * small as 4 bytes.  The allocator organizes chunks into lists
 + * There are usually many small percpu allocations, many of them being
 + * as small as 4 bytes.  The allocator organizes chunks into lists
   * according to free size and tries to allocate from the fullest one.
   * Each chunk keeps the maximum contiguous area size hint which is
   * guaranteed to be equal to or larger than the maximum contiguous
@@@ -46,7 -43,7 +46,7 @@@
   *
   * To use this allocator, arch code should do the following.
   *
 - * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
 + * - drop CONFIG_HAVE_LEGACY_PER_CPU_AREA
   *
   * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
   *   regular address to percpu pointer and back if they need to be
@@@ -59,7 -56,6 +59,7 @@@
  #include <linux/bitmap.h>
  #include <linux/bootmem.h>
  #include <linux/list.h>
 +#include <linux/log2.h>
  #include <linux/mm.h>
  #include <linux/module.h>
  #include <linux/mutex.h>
@@@ -98,27 -94,20 +98,27 @@@ struct pcpu_chunk 
        int                     map_alloc;      /* # of map entries allocated */
        int                     *map;           /* allocation map */
        bool                    immutable;      /* no [de]population allowed */
 -      struct page             **page;         /* points to page array */
 -      struct page             *page_ar[];     /* #cpus * UNIT_PAGES */
 +      unsigned long           populated[];    /* populated bitmap */
  };
  
  static int pcpu_unit_pages __read_mostly;
  static int pcpu_unit_size __read_mostly;
 +static int pcpu_nr_units __read_mostly;
  static int pcpu_chunk_size __read_mostly;
  static int pcpu_nr_slots __read_mostly;
  static size_t pcpu_chunk_struct_size __read_mostly;
  
 +/* cpus with the lowest and highest unit numbers */
 +static unsigned int pcpu_first_unit_cpu __read_mostly;
 +static unsigned int pcpu_last_unit_cpu __read_mostly;
 +
  /* the address of the first chunk which starts with the kernel static area */
  void *pcpu_base_addr __read_mostly;
  EXPORT_SYMBOL_GPL(pcpu_base_addr);
  
 +/* cpu -> unit map */
 +const int *pcpu_unit_map __read_mostly;
 +
  /*
   * The first chunk which always exists.  Note that unlike other
   * chunks, this one can be allocated and mapped in several different
@@@ -140,9 -129,9 +140,9 @@@ static int pcpu_reserved_chunk_limit
   * Synchronization rules.
   *
   * There are two locks - pcpu_alloc_mutex and pcpu_lock.  The former
 - * protects allocation/reclaim paths, chunks and chunk->page arrays.
 - * The latter is a spinlock and protects the index data structures -
 - * chunk slots, chunks and area maps in chunks.
 + * protects allocation/reclaim paths, chunks, populated bitmap and
 + * vmalloc mapping.  The latter is a spinlock and protects the index
 + * data structures - chunk slots, chunks and area maps in chunks.
   *
   * During allocation, pcpu_alloc_mutex is kept locked all the time and
   * pcpu_lock is grabbed and released as necessary.  All actual memory
@@@ -189,7 -178,13 +189,7 @@@ static int pcpu_chunk_slot(const struc
  
  static int pcpu_page_idx(unsigned int cpu, int page_idx)
  {
 -      return cpu * pcpu_unit_pages + page_idx;
 -}
 -
 -static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk,
 -                                    unsigned int cpu, int page_idx)
 -{
 -      return &chunk->page[pcpu_page_idx(cpu, page_idx)];
 +      return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
  }
  
  static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
                (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT);
  }
  
 -static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
 -                                   int page_idx)
 +static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
 +                                  unsigned int cpu, int page_idx)
  {
 -      return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL;
 +      /* must not be used on pre-mapped chunk */
 +      WARN_ON(chunk->immutable);
 +
 +      return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx));
  }
  
  /* set the pointer to a chunk in a page struct */
@@@ -220,34 -212,6 +220,34 @@@ static struct pcpu_chunk *pcpu_get_page
        return (struct pcpu_chunk *)page->index;
  }
  
 +static void pcpu_next_unpop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
 +{
 +      *rs = find_next_zero_bit(chunk->populated, end, *rs);
 +      *re = find_next_bit(chunk->populated, end, *rs + 1);
 +}
 +
 +static void pcpu_next_pop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
 +{
 +      *rs = find_next_bit(chunk->populated, end, *rs);
 +      *re = find_next_zero_bit(chunk->populated, end, *rs + 1);
 +}
 +
 +/*
 + * (Un)populated page region iterators.  Iterate over (un)populated
 + * page regions between @start and @end in @chunk.  @rs and @re should
 + * be integer variables and will be set to the start and end page index
 + * of the current region.
 + */
 +#define pcpu_for_each_unpop_region(chunk, rs, re, start, end)             \
 +      for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \
 +           (rs) < (re);                                                   \
 +           (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end)))
 +
 +#define pcpu_for_each_pop_region(chunk, rs, re, start, end)               \
 +      for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end));   \
 +           (rs) < (re);                                                   \
 +           (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end)))
 +
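
/*
 * Usage sketch, illustration only (not part of the patch): counting the
 * populated pages of one unit with the iterator above.  The helper name
 * is hypothetical; the real users are pcpu_populate_chunk() and
 * pcpu_depopulate_chunk() further down.
 */
static int pcpu_example_nr_populated(struct pcpu_chunk *chunk)
{
	int rs, re, nr = 0;

	pcpu_for_each_pop_region(chunk, rs, re, 0, pcpu_unit_pages)
		nr += re - rs;

	return nr;
}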
  /**
   * pcpu_mem_alloc - allocate memory
   * @size: bytes to allocate
@@@ -326,21 -290,13 +326,21 @@@ static struct pcpu_chunk *pcpu_chunk_ad
        void *first_start = pcpu_first_chunk->vm->addr;
  
        /* is it in the first chunk? */
 -      if (addr >= first_start && addr < first_start + pcpu_chunk_size) {
 +      if (addr >= first_start && addr < first_start + pcpu_unit_size) {
                /* is it in the reserved area? */
                if (addr < first_start + pcpu_reserved_chunk_limit)
                        return pcpu_reserved_chunk;
                return pcpu_first_chunk;
        }
  
 +      /*
 +       * The address is relative to unit0 which might be unused and
 +       * thus unmapped.  Offset the address to the unit space of the
 +       * current processor before looking it up in the vmalloc
 +       * space.  Note that any possible cpu id can be used here, so
 +       * there's no need to worry about preemption or cpu hotplug.
 +       */
 +      addr += pcpu_unit_map[smp_processor_id()] * pcpu_unit_size;
        return pcpu_get_page_chunk(vmalloc_to_page(addr));
  }
  
@@@ -589,327 -545,125 +589,327 @@@ static void pcpu_free_area(struct pcpu_
  }
  
  /**
 - * pcpu_unmap - unmap pages out of a pcpu_chunk
 + * pcpu_get_pages_and_bitmap - get temp pages array and bitmap
   * @chunk: chunk of interest
 - * @page_start: page index of the first page to unmap
 - * @page_end: page index of the last page to unmap + 1
 - * @flush_tlb: whether to flush tlb or not
 + * @bitmapp: output parameter for bitmap
 + * @may_alloc: may allocate the array
   *
 - * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
 - * If @flush is true, vcache is flushed before unmapping and tlb
 - * after.
 + * Returns pointer to array of pointers to struct page and bitmap,
 + * both of which can be indexed with pcpu_page_idx().  The returned
 + * array is cleared to zero and *@bitmapp is copied from
 + * @chunk->populated.  Note that there is only one array and bitmap
 + * and access exclusion is the caller's responsibility.
 + *
 + * CONTEXT:
 + * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc.
 + * Otherwise, don't care.
 + *
 + * RETURNS:
 + * Pointer to temp pages array on success, NULL on failure.
   */
 -static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
 -                     bool flush_tlb)
 +static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk,
 +                                             unsigned long **bitmapp,
 +                                             bool may_alloc)
  {
 -      unsigned int last = nr_cpu_ids - 1;
 -      unsigned int cpu;
 +      static struct page **pages;
 +      static unsigned long *bitmap;
 +      size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]);
 +      size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) *
 +                           sizeof(unsigned long);
 +
 +      if (!pages || !bitmap) {
 +              if (may_alloc && !pages)
 +                      pages = pcpu_mem_alloc(pages_size);
 +              if (may_alloc && !bitmap)
 +                      bitmap = pcpu_mem_alloc(bitmap_size);
 +              if (!pages || !bitmap)
 +                      return NULL;
 +      }
  
 -      /* unmap must not be done on immutable chunk */
 -      WARN_ON(chunk->immutable);
 +      memset(pages, 0, pages_size);
 +      bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages);
  
 -      /*
 -       * Each flushing trial can be very expensive, issue flush on
 -       * the whole region at once rather than doing it for each cpu.
 -       * This could be an overkill but is more scalable.
 -       */
 -      flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
 -                         pcpu_chunk_addr(chunk, last, page_end));
 +      *bitmapp = bitmap;
 +      return pages;
 +}
  
 -      for_each_possible_cpu(cpu)
 -              unmap_kernel_range_noflush(
 -                              pcpu_chunk_addr(chunk, cpu, page_start),
 -                              (page_end - page_start) << PAGE_SHIFT);
 -
 -      /* ditto as flush_cache_vunmap() */
 -      if (flush_tlb)
 -              flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start),
 -                                     pcpu_chunk_addr(chunk, last, page_end));
 +/**
 + * pcpu_free_pages - free pages which were allocated for @chunk
 + * @chunk: chunk pages were allocated for
 + * @pages: array of pages to be freed, indexed by pcpu_page_idx()
 + * @populated: populated bitmap
 + * @page_start: page index of the first page to be freed
 + * @page_end: page index of the last page to be freed + 1
 + *
 + * Free pages [@page_start, @page_end) in @pages for all units.
 + * The pages were allocated for @chunk.
 + */
 +static void pcpu_free_pages(struct pcpu_chunk *chunk,
 +                          struct page **pages, unsigned long *populated,
 +                          int page_start, int page_end)
 +{
 +      unsigned int cpu;
 +      int i;
 +
 +      for_each_possible_cpu(cpu) {
 +              for (i = page_start; i < page_end; i++) {
 +                      struct page *page = pages[pcpu_page_idx(cpu, i)];
 +
 +                      if (page)
 +                              __free_page(page);
 +              }
 +      }
  }
  
  /**
 - * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
 - * @chunk: chunk to depopulate
 - * @off: offset to the area to depopulate
 - * @size: size of the area to depopulate in bytes
 - * @flush: whether to flush cache and tlb or not
 - *
 - * For each cpu, depopulate and unmap pages [@page_start,@page_end)
 - * from @chunk.  If @flush is true, vcache is flushed before unmapping
 - * and tlb after.
 - *
 - * CONTEXT:
 - * pcpu_alloc_mutex.
 + * pcpu_alloc_pages - allocates pages for @chunk
 + * @chunk: target chunk
 + * @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
 + * @populated: populated bitmap
 + * @page_start: page index of the first page to be allocated
 + * @page_end: page index of the last page to be allocated + 1
 + *
 + * Allocate pages [@page_start,@page_end) into @pages for all units.
 + * The allocation is for @chunk.  Percpu core doesn't care about the
 + * content of @pages and will pass it verbatim to pcpu_map_pages().
   */
 -static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
 -                                bool flush)
 +static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
 +                          struct page **pages, unsigned long *populated,
 +                          int page_start, int page_end)
  {
 -      int page_start = PFN_DOWN(off);
 -      int page_end = PFN_UP(off + size);
 -      int unmap_start = -1;
 -      int uninitialized_var(unmap_end);
 +      const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
        unsigned int cpu;
        int i;
  
 -      for (i = page_start; i < page_end; i++) {
 -              for_each_possible_cpu(cpu) {
 -                      struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
 +      for_each_possible_cpu(cpu) {
 +              for (i = page_start; i < page_end; i++) {
 +                      struct page **pagep = &pages[pcpu_page_idx(cpu, i)];
 +
 +                      *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
 +                      if (!*pagep) {
 +                              pcpu_free_pages(chunk, pages, populated,
 +                                              page_start, page_end);
 +                              return -ENOMEM;
 +                      }
 +              }
 +      }
 +      return 0;
 +}
  
 -                      if (!*pagep)
 -                              continue;
 +/**
 + * pcpu_pre_unmap_flush - flush cache prior to unmapping
 + * @chunk: chunk the regions to be flushed belongs to
 + * @page_start: page index of the first page to be flushed
 + * @page_end: page index of the last page to be flushed + 1
 + *
 + * Pages in [@page_start,@page_end) of @chunk are about to be
 + * unmapped.  Flush cache.  As each flushing trial can be very
 + * expensive, issue flush on the whole region at once rather than
 + * doing it for each cpu.  This could be an overkill but is more
 + * scalable.
 + */
 +static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
 +                               int page_start, int page_end)
 +{
 +      flush_cache_vunmap(
 +              pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
 +              pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
 +}
  
 -                      __free_page(*pagep);
 +static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
 +{
 +      unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT);
 +}
  
 -                      /*
 -                       * If it's partial depopulation, it might get
 -                       * populated or depopulated again.  Mark the
 -                       * page gone.
 -                       */
 -                      *pagep = NULL;
 +/**
 + * pcpu_unmap_pages - unmap pages out of a pcpu_chunk
 + * @chunk: chunk of interest
 + * @pages: pages array which can be used to pass information to free
 + * @populated: populated bitmap
 + * @page_start: page index of the first page to unmap
 + * @page_end: page index of the last page to unmap + 1
 + *
 + * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
 + * Corresponding elements in @pages were cleared by the caller and can
 + * be used to carry information to pcpu_free_pages() which will be
 + * called after all unmaps are finished.  The caller should call
 + * proper pre/post flush functions.
 + */
 +static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
 +                           struct page **pages, unsigned long *populated,
 +                           int page_start, int page_end)
 +{
 +      unsigned int cpu;
 +      int i;
  
 -                      unmap_start = unmap_start < 0 ? i : unmap_start;
 -                      unmap_end = i + 1;
 +      for_each_possible_cpu(cpu) {
 +              for (i = page_start; i < page_end; i++) {
 +                      struct page *page;
 +
 +                      page = pcpu_chunk_page(chunk, cpu, i);
 +                      WARN_ON(!page);
 +                      pages[pcpu_page_idx(cpu, i)] = page;
                }
 +              __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start),
 +                                 page_end - page_start);
        }
  
 -      if (unmap_start >= 0)
 -              pcpu_unmap(chunk, unmap_start, unmap_end, flush);
 +      for (i = page_start; i < page_end; i++)
 +              __clear_bit(i, populated);
  }
  
  /**
 - * pcpu_map - map pages into a pcpu_chunk
 + * pcpu_post_unmap_tlb_flush - flush TLB after unmapping
 + * @chunk: pcpu_chunk the regions to be flushed belong to
 + * @page_start: page index of the first page to be flushed
 + * @page_end: page index of the last page to be flushed + 1
 + *
 + * Pages [@page_start,@page_end) of @chunk have been unmapped.  Flush
 + * TLB for the regions.  This can be skipped if the area is to be
 + * returned to vmalloc as vmalloc will handle TLB flushing lazily.
 + *
 + * As with pcpu_pre_unmap_flush(), TLB flushing is also done at once
 + * for the whole region.
 + */
 +static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
 +                                    int page_start, int page_end)
 +{
 +      flush_tlb_kernel_range(
 +              pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
 +              pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
 +}
 +
 +static int __pcpu_map_pages(unsigned long addr, struct page **pages,
 +                          int nr_pages)
 +{
 +      return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT,
 +                                      PAGE_KERNEL, pages);
 +}
 +
 +/**
 + * pcpu_map_pages - map pages into a pcpu_chunk
   * @chunk: chunk of interest
 + * @pages: pages array containing pages to be mapped
 + * @populated: populated bitmap
   * @page_start: page index of the first page to map
   * @page_end: page index of the last page to map + 1
   *
 - * For each cpu, map pages [@page_start,@page_end) into @chunk.
 - * vcache is flushed afterwards.
 + * For each cpu, map pages [@page_start,@page_end) into @chunk.  The
 + * caller is responsible for calling pcpu_post_map_flush() after all
 + * mappings are complete.
 + *
 + * This function is responsible for setting corresponding bits in
 + * @chunk->populated bitmap and whatever is necessary for reverse
 + * lookup (addr -> chunk).
   */
 -static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
 +static int pcpu_map_pages(struct pcpu_chunk *chunk,
 +                        struct page **pages, unsigned long *populated,
 +                        int page_start, int page_end)
  {
 -      unsigned int last = nr_cpu_ids - 1;
 -      unsigned int cpu;
 -      int err;
 -
 -      /* map must not be done on immutable chunk */
 -      WARN_ON(chunk->immutable);
 +      unsigned int cpu, tcpu;
 +      int i, err;
  
        for_each_possible_cpu(cpu) {
 -              err = map_kernel_range_noflush(
 -                              pcpu_chunk_addr(chunk, cpu, page_start),
 -                              (page_end - page_start) << PAGE_SHIFT,
 -                              PAGE_KERNEL,
 -                              pcpu_chunk_pagep(chunk, cpu, page_start));
 +              err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start),
 +                                     &pages[pcpu_page_idx(cpu, page_start)],
 +                                     page_end - page_start);
                if (err < 0)
 -                      return err;
 +                      goto err;
 +      }
 +
 +      /* mapping successful, link chunk and mark populated */
 +      for (i = page_start; i < page_end; i++) {
 +              for_each_possible_cpu(cpu)
 +                      pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)],
 +                                          chunk);
 +              __set_bit(i, populated);
        }
  
 -      /* flush at once, please read comments in pcpu_unmap() */
 -      flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start),
 -                       pcpu_chunk_addr(chunk, last, page_end));
        return 0;
 +
 +err:
 +      for_each_possible_cpu(tcpu) {
 +              if (tcpu == cpu)
 +                      break;
 +              __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start),
 +                                 page_end - page_start);
 +      }
 +      return err;
 +}
 +
 +/**
 + * pcpu_post_map_flush - flush cache after mapping
 + * @chunk: pcpu_chunk the regions to be flushed belong to
 + * @page_start: page index of the first page to be flushed
 + * @page_end: page index of the last page to be flushed + 1
 + *
 + * Pages [@page_start,@page_end) of @chunk have been mapped.  Flush
 + * cache.
 + *
 + * As with pcpu_pre_unmap_flush(), the cache flush is done at once
 + * for the whole region.
 + */
 +static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
 +                              int page_start, int page_end)
 +{
 +      flush_cache_vmap(
 +              pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
 +              pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
 +}
 +
 +/**
 + * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
 + * @chunk: chunk to depopulate
 + * @off: offset to the area to depopulate
 + * @size: size of the area to depopulate in bytes
 + *
 + * For each cpu, depopulate and unmap pages [@page_start,@page_end)
 + * from @chunk.  The cache is flushed before unmapping; TLB flushing
 + * is left to vmalloc's lazy reclaim.
 + *
 + * CONTEXT:
 + * pcpu_alloc_mutex.
 + */
 +static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
 +{
 +      int page_start = PFN_DOWN(off);
 +      int page_end = PFN_UP(off + size);
 +      struct page **pages;
 +      unsigned long *populated;
 +      int rs, re;
 +
 +      /* quick path, check whether it's empty already */
 +      pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
 +              if (rs == page_start && re == page_end)
 +                      return;
 +              break;
 +      }
 +
 +      /* immutable chunks can't be depopulated */
 +      WARN_ON(chunk->immutable);
 +
 +      /*
 +       * If control reaches here, there must have been at least one
 +       * successful population attempt so the temp pages array must
 +       * be available now.
 +       */
 +      pages = pcpu_get_pages_and_bitmap(chunk, &populated, false);
 +      BUG_ON(!pages);
 +
 +      /* unmap and free */
 +      pcpu_pre_unmap_flush(chunk, page_start, page_end);
 +
 +      pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
 +              pcpu_unmap_pages(chunk, pages, populated, rs, re);
 +
 +      /* no need to flush tlb, vmalloc will handle it lazily */
 +
 +      pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
 +              pcpu_free_pages(chunk, pages, populated, rs, re);
 +
 +      /* commit new bitmap */
 +      bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
  }
  
  /**
   */
  static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
  {
 -      const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
        int page_start = PFN_DOWN(off);
        int page_end = PFN_UP(off + size);
 -      int map_start = -1;
 -      int uninitialized_var(map_end);
 +      int free_end = page_start, unmap_end = page_start;
 +      struct page **pages;
 +      unsigned long *populated;
        unsigned int cpu;
 -      int i;
 +      int rs, re, rc;
  
 -      for (i = page_start; i < page_end; i++) {
 -              if (pcpu_chunk_page_occupied(chunk, i)) {
 -                      if (map_start >= 0) {
 -                              if (pcpu_map(chunk, map_start, map_end))
 -                                      goto err;
 -                              map_start = -1;
 -                      }
 -                      continue;
 -              }
 +      /* quick path, check whether all pages are already there */
 +      pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) {
 +              if (rs == page_start && re == page_end)
 +                      goto clear;
 +              break;
 +      }
  
 -              map_start = map_start < 0 ? i : map_start;
 -              map_end = i + 1;
 +      /* need to allocate and map pages, this chunk can't be immutable */
 +      WARN_ON(chunk->immutable);
  
 -              for_each_possible_cpu(cpu) {
 -                      struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
 +      pages = pcpu_get_pages_and_bitmap(chunk, &populated, true);
 +      if (!pages)
 +              return -ENOMEM;
  
 -                      *pagep = alloc_pages_node(cpu_to_node(cpu),
 -                                                alloc_mask, 0);
 -                      if (!*pagep)
 -                              goto err;
 -                      pcpu_set_page_chunk(*pagep, chunk);
 -              }
 +      /* alloc and map */
 +      pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
 +              rc = pcpu_alloc_pages(chunk, pages, populated, rs, re);
 +              if (rc)
 +                      goto err_free;
 +              free_end = re;
        }
  
 -      if (map_start >= 0 && pcpu_map(chunk, map_start, map_end))
 -              goto err;
 +      pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
 +              rc = pcpu_map_pages(chunk, pages, populated, rs, re);
 +              if (rc)
 +                      goto err_unmap;
 +              unmap_end = re;
 +      }
 +      pcpu_post_map_flush(chunk, page_start, page_end);
  
 +      /* commit new bitmap */
 +      bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
 +clear:
        for_each_possible_cpu(cpu)
 -              memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0,
 -                     size);
 -
 +              memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
        return 0;
 -err:
 -      /* likely under heavy memory pressure, give memory back */
 -      pcpu_depopulate_chunk(chunk, off, size, true);
 -      return -ENOMEM;
 +
 +err_unmap:
 +      pcpu_pre_unmap_flush(chunk, page_start, unmap_end);
 +      pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end)
 +              pcpu_unmap_pages(chunk, pages, populated, rs, re);
 +      pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end);
 +err_free:
 +      pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end)
 +              pcpu_free_pages(chunk, pages, populated, rs, re);
 +      return rc;
  }
  
  static void free_pcpu_chunk(struct pcpu_chunk *chunk)
@@@ -1003,8 -747,9 +1003,8 @@@ static struct pcpu_chunk *alloc_pcpu_ch
        chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
        chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
        chunk->map[chunk->map_used++] = pcpu_unit_size;
 -      chunk->page = chunk->page_ar;
  
-       chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL);
+       chunk->vm = get_vm_area(pcpu_chunk_size, VM_ALLOC);
        if (!chunk->vm) {
                free_pcpu_chunk(chunk);
                return NULL;
@@@ -1102,7 -847,6 +1102,7 @@@ area_found
  
        mutex_unlock(&pcpu_alloc_mutex);
  
 +      /* return address relative to unit0 */
        return __addr_to_pcpu_ptr(chunk->vm->addr + off);
  
  fail_unlock:
@@@ -1184,7 -928,7 +1184,7 @@@ static void pcpu_reclaim(struct work_st
        mutex_unlock(&pcpu_alloc_mutex);
  
        list_for_each_entry_safe(chunk, next, &todo, list) {
 -              pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
 +              pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size);
                free_pcpu_chunk(chunk);
        }
  }
@@@ -1232,16 -976,26 +1232,16 @@@ EXPORT_SYMBOL_GPL(free_percpu)
  
  /**
   * pcpu_setup_first_chunk - initialize the first percpu chunk
 - * @get_page_fn: callback to fetch page pointer
   * @static_size: the size of static percpu area in bytes
 - * @reserved_size: the size of reserved percpu area in bytes
 + * @reserved_size: the size of reserved percpu area in bytes, 0 for none
   * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
 - * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto
 - * @base_addr: mapped address, NULL for auto
 - * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary
 + * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE
 + * @base_addr: mapped address
 + * @unit_map: cpu -> unit map, NULL for sequential mapping
   *
   * Initialize the first percpu chunk which contains the kernel static
   * percpu area.  This function is to be called from arch percpu area
 - * setup path.  The first two parameters are mandatory.  The rest are
 - * optional.
 - *
 - * @get_page_fn() should return pointer to percpu page given cpu
 - * number and page number.  It should at least return enough pages to
 - * cover the static area.  The returned pages for static area should
 - * have been initialized with valid data.  If @unit_size is specified,
 - * it can also return pages after the static area.  NULL return
 - * indicates end of pages for the cpu.  Note that @get_page_fn() must
 - * return the same number of pages for all cpus.
 + * setup path.
   *
   * @reserved_size, if non-zero, specifies the amount of bytes to
   * reserve after the static area in the first chunk.  This reserves
   * non-negative value makes percpu leave alone the area beyond
   * @static_size + @reserved_size + @dyn_size.
   *
 - * @unit_size, if non-negative, specifies unit size and must be
 - * aligned to PAGE_SIZE and equal to or larger than @static_size +
 - * @reserved_size + if non-negative, @dyn_size.
 - *
 - * Non-null @base_addr means that the caller already allocated virtual
 - * region for the first chunk and mapped it.  percpu must not mess
 - * with the chunk.  Note that @base_addr with 0 @unit_size or non-NULL
 - * @populate_pte_fn doesn't make any sense.
 + * @unit_size specifies unit size and must be aligned to PAGE_SIZE and
 + * equal to or larger than @static_size + @reserved_size + if
 + * non-negative, @dyn_size.
   *
 - * @populate_pte_fn is used to populate the pagetable.  NULL means the
 - * caller already populated the pagetable.
 + * The caller should have mapped the first chunk at @base_addr and
 + * copied static data to each unit.
   *
   * If the first chunk ends up with both reserved and dynamic areas, it
   * is served by two chunks - one to serve the core static and reserved
   * The determined pcpu_unit_size which can be used to initialize
   * percpu access.
   */
 -size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 -                                   size_t static_size, size_t reserved_size,
 -                                   ssize_t dyn_size, ssize_t unit_size,
 -                                   void *base_addr,
 -                                   pcpu_populate_pte_fn_t populate_pte_fn)
 +size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size,
 +                                   ssize_t dyn_size, size_t unit_size,
 +                                   void *base_addr, const int *unit_map)
  {
        static struct vm_struct first_vm;
        static int smap[2], dmap[2];
        size_t size_sum = static_size + reserved_size +
                          (dyn_size >= 0 ? dyn_size : 0);
        struct pcpu_chunk *schunk, *dchunk = NULL;
 -      unsigned int cpu;
 -      int nr_pages;
 -      int err, i;
 +      unsigned int cpu, tcpu;
 +      int i;
  
 -      /* santiy checks */
 +      /* sanity checks */
        BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
                     ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
        BUG_ON(!static_size);
 -      if (unit_size >= 0) {
 -              BUG_ON(unit_size < size_sum);
 -              BUG_ON(unit_size & ~PAGE_MASK);
 -              BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE);
 -      } else
 -              BUG_ON(base_addr);
 -      BUG_ON(base_addr && populate_pte_fn);
 -
 -      if (unit_size >= 0)
 -              pcpu_unit_pages = unit_size >> PAGE_SHIFT;
 -      else
 -              pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT,
 -                                      PFN_UP(size_sum));
 +      BUG_ON(!base_addr);
 +      BUG_ON(unit_size < size_sum);
 +      BUG_ON(unit_size & ~PAGE_MASK);
 +      BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE);
 +
 +      /* determine number of units and verify and initialize pcpu_unit_map */
 +      if (unit_map) {
 +              int first_unit = INT_MAX, last_unit = INT_MIN;
 +
 +              for_each_possible_cpu(cpu) {
 +                      int unit = unit_map[cpu];
 +
 +                      BUG_ON(unit < 0);
 +                      for_each_possible_cpu(tcpu) {
 +                              if (tcpu == cpu)
 +                                      break;
 +                              /* the mapping should be one-to-one */
 +                              BUG_ON(unit_map[tcpu] == unit);
 +                      }
 +
 +                      if (unit < first_unit) {
 +                              pcpu_first_unit_cpu = cpu;
 +                              first_unit = unit;
 +                      }
 +                      if (unit > last_unit) {
 +                              pcpu_last_unit_cpu = cpu;
 +                              last_unit = unit;
 +                      }
 +              }
 +              pcpu_nr_units = last_unit + 1;
 +              pcpu_unit_map = unit_map;
 +      } else {
 +              int *identity_map;
 +
 +              /* #units == #cpus, identity mapped */
-               identity_map = alloc_bootmem(num_possible_cpus() *
++              identity_map = alloc_bootmem(nr_cpu_ids *
 +                                           sizeof(identity_map[0]));
  
-               pcpu_nr_units = num_possible_cpus();
 +              for_each_possible_cpu(cpu)
 +                      identity_map[cpu] = cpu;
 +
 +              pcpu_first_unit_cpu = 0;
 +              pcpu_last_unit_cpu = pcpu_nr_units - 1;
++              pcpu_nr_units = nr_cpu_ids;
 +              pcpu_unit_map = identity_map;
 +      }
 +
 +      /* determine basic parameters */
 +      pcpu_unit_pages = unit_size >> PAGE_SHIFT;
        pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
 -      pcpu_chunk_size = nr_cpu_ids * pcpu_unit_size;
 -      pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
 -              + nr_cpu_ids * pcpu_unit_pages * sizeof(struct page *);
 +      pcpu_chunk_size = pcpu_nr_units * pcpu_unit_size;
 +      pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
 +              BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
  
        if (dyn_size < 0)
                dyn_size = pcpu_unit_size - static_size - reserved_size;
  
 +      first_vm.flags = VM_ALLOC;
 +      first_vm.size = pcpu_chunk_size;
 +      first_vm.addr = base_addr;
 +
        /*
         * Allocate chunk slots.  The additional last slot is for
         * empty chunks.
        schunk->vm = &first_vm;
        schunk->map = smap;
        schunk->map_alloc = ARRAY_SIZE(smap);
 -      schunk->page = schunk->page_ar;
 +      schunk->immutable = true;
 +      bitmap_fill(schunk->populated, pcpu_unit_pages);
  
        if (reserved_size) {
                schunk->free_size = reserved_size;
  
        /* init dynamic chunk if necessary */
        if (dyn_size) {
 -              dchunk = alloc_bootmem(sizeof(struct pcpu_chunk));
 +              dchunk = alloc_bootmem(pcpu_chunk_struct_size);
                INIT_LIST_HEAD(&dchunk->list);
                dchunk->vm = &first_vm;
                dchunk->map = dmap;
                dchunk->map_alloc = ARRAY_SIZE(dmap);
 -              dchunk->page = schunk->page_ar; /* share page map with schunk */
 +              dchunk->immutable = true;
 +              bitmap_fill(dchunk->populated, pcpu_unit_pages);
  
                dchunk->contig_hint = dchunk->free_size = dyn_size;
                dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit;
                dchunk->map[dchunk->map_used++] = dchunk->free_size;
        }
  
 -      /* allocate vm address */
 -      first_vm.flags = VM_ALLOC;
 -      first_vm.size = pcpu_chunk_size;
 -
 -      if (!base_addr)
 -              vm_area_register_early(&first_vm, PAGE_SIZE);
 -      else {
 -              /*
 -               * Pages already mapped.  No need to remap into
 -               * vmalloc area.  In this case the first chunks can't
 -               * be mapped or unmapped by percpu and are marked
 -               * immutable.
 -               */
 -              first_vm.addr = base_addr;
 -              schunk->immutable = true;
 -              if (dchunk)
 -                      dchunk->immutable = true;
 -      }
 -
 -      /* assign pages */
 -      nr_pages = -1;
 -      for_each_possible_cpu(cpu) {
 -              for (i = 0; i < pcpu_unit_pages; i++) {
 -                      struct page *page = get_page_fn(cpu, i);
 -
 -                      if (!page)
 -                              break;
 -                      *pcpu_chunk_pagep(schunk, cpu, i) = page;
 -              }
 -
 -              BUG_ON(i < PFN_UP(static_size));
 -
 -              if (nr_pages < 0)
 -                      nr_pages = i;
 -              else
 -                      BUG_ON(nr_pages != i);
 -      }
 -
 -      /* map them */
 -      if (populate_pte_fn) {
 -              for_each_possible_cpu(cpu)
 -                      for (i = 0; i < nr_pages; i++)
 -                              populate_pte_fn(pcpu_chunk_addr(schunk,
 -                                                              cpu, i));
 -
 -              err = pcpu_map(schunk, 0, nr_pages);
 -              if (err)
 -                      panic("failed to setup static percpu area, err=%d\n",
 -                            err);
 -      }
 -
        /* link the first chunk in */
        pcpu_first_chunk = dchunk ?: schunk;
        pcpu_chunk_relocate(pcpu_first_chunk, -1);
  
        /* we're done */
 -      pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0);
 +      pcpu_base_addr = schunk->vm->addr;
        return pcpu_unit_size;
  }
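To make the basic-parameter arithmetic above concrete, a minimal standalone sketch; the 4k page, 64k unit and 4-unit figures are assumed purely for illustration:

#include <stdio.h>

#define PAGE_SHIFT	12
#define BITS_PER_LONG	(8 * sizeof(long))
#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

int main(void)
{
	unsigned long unit_size = 64 * 1024, nr_units = 4;
	unsigned long unit_pages = unit_size >> PAGE_SHIFT;
	unsigned long chunk_size = nr_units * unit_size;

	/* mirrors pcpu_unit_pages/pcpu_chunk_size and the populated bitmap */
	printf("unit_pages=%lu chunk_size=%lu bitmap_longs=%lu\n",
	       unit_pages, chunk_size,
	       (unsigned long)BITS_TO_LONGS(unit_pages));
	return 0;
}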
  
 -/*
 - * Embedding first chunk setup helper.
 - */
 -static void *pcpue_ptr __initdata;
 -static size_t pcpue_size __initdata;
 -static size_t pcpue_unit_size __initdata;
 -
 -static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
 +static size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size,
 +                               ssize_t *dyn_sizep)
  {
 -      size_t off = (size_t)pageno << PAGE_SHIFT;
 +      size_t size_sum;
  
 -      if (off >= pcpue_size)
 -              return NULL;
 +      size_sum = PFN_ALIGN(static_size + reserved_size +
 +                           (*dyn_sizep >= 0 ? *dyn_sizep : 0));
 +      if (*dyn_sizep != 0)
 +              *dyn_sizep = size_sum - static_size - reserved_size;
  
 -      return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off);
 +      return size_sum;
  }
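A standalone sketch of what this helper computes, assuming a 4096-byte page and made-up sizes; when the dynamic size is -1 (auto) it simply absorbs the padding needed for page alignment:

#include <stdio.h>
#include <stddef.h>

#define PAGE_SIZE	4096UL
#define PFN_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int main(void)
{
	size_t static_size = 45000, reserved_size = 8192;
	long dyn_size = -1;				/* -1 means auto */

	size_t size_sum = PFN_ALIGN(static_size + reserved_size +
				    (dyn_size >= 0 ? dyn_size : 0));
	if (dyn_size != 0)
		dyn_size = size_sum - static_size - reserved_size;

	/* prints size_sum=53248 dyn_size=56 */
	printf("size_sum=%zu dyn_size=%ld\n", size_sum, dyn_size);
	return 0;
}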
  
  /**
   * @static_size: the size of static percpu area in bytes
   * @reserved_size: the size of reserved percpu area in bytes
   * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
 - * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto
   *
   * This is a helper to ease setting up embedded first percpu chunk and
   * can be called where pcpu_setup_first_chunk() is expected.
   * page size.
   *
   * When @dyn_size is positive, dynamic area might be larger than
 - * specified to fill page alignment.  Also, when @dyn_size is auto,
 - * @dyn_size does not fill the whole first chunk but only what's
 - * necessary for page alignment after static and reserved areas.
 + * specified to fill page alignment.  When @dyn_size is auto,
 + * @dyn_size is just big enough to fill page alignment after static
 + * and reserved areas.
   *
   * If the needed size is smaller than the minimum or specified unit
   * size, the leftover is returned to the bootmem allocator.
   * percpu access on success, -errno on failure.
   */
  ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
 -                                    ssize_t dyn_size, ssize_t unit_size)
 +                                    ssize_t dyn_size)
  {
 -      size_t chunk_size;
 +      size_t size_sum, unit_size, chunk_size;
 +      void *base;
        unsigned int cpu;
  
        /* determine parameters and allocate */
 -      pcpue_size = PFN_ALIGN(static_size + reserved_size +
 -                             (dyn_size >= 0 ? dyn_size : 0));
 -      if (dyn_size != 0)
 -              dyn_size = pcpue_size - static_size - reserved_size;
 -
 -      if (unit_size >= 0) {
 -              BUG_ON(unit_size < pcpue_size);
 -              pcpue_unit_size = unit_size;
 -      } else
 -              pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
 -
 -      chunk_size = pcpue_unit_size * nr_cpu_ids;
 -
 -      pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
 -                                          __pa(MAX_DMA_ADDRESS));
 -      if (!pcpue_ptr) {
 +      size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
 +
 +      unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
-       chunk_size = unit_size * num_possible_cpus();
++      chunk_size = unit_size * nr_cpu_ids;
 +
 +      base = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
 +                                     __pa(MAX_DMA_ADDRESS));
 +      if (!base) {
                pr_warning("PERCPU: failed to allocate %zu bytes for "
                           "embedding\n", chunk_size);
                return -ENOMEM;
        }
  
        /* return the leftover and copy */
-       for_each_possible_cpu(cpu) {
+       for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
 -              void *ptr = pcpue_ptr + cpu * pcpue_unit_size;
 +              void *ptr = base + cpu * unit_size;
  
-               free_bootmem(__pa(ptr + size_sum), unit_size - size_sum);
-               memcpy(ptr, __per_cpu_load, static_size);
+               if (cpu_possible(cpu)) {
 -                      free_bootmem(__pa(ptr + pcpue_size),
 -                                   pcpue_unit_size - pcpue_size);
++                      free_bootmem(__pa(ptr + size_sum),
++                                   unit_size - size_sum);
+                       memcpy(ptr, __per_cpu_load, static_size);
+               } else
 -                      free_bootmem(__pa(ptr), pcpue_unit_size);
++                      free_bootmem(__pa(ptr), unit_size);
        }
  
        /* we're ready, commit */
        pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
 -              pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size);
 +              size_sum >> PAGE_SHIFT, base, static_size);
 +
 +      return pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
 +                                    unit_size, base, NULL);
 +}
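To illustrate how the embedded chunk is carved up and the leftover handed back, a standalone sketch mirroring the copy/trim loop above; nr_cpu_ids, the possible-CPU hole and all sizes are invented for the example:

#include <stdio.h>
#include <stdbool.h>
#include <stddef.h>

#define NR_CPU_IDS	4

/* pretend cpu 2 is a hole in the possible map */
static bool cpu_possible(unsigned int cpu)
{
	return cpu != 2;
}

int main(void)
{
	size_t unit_size = 64 * 1024, size_sum = 53248, freed = 0;
	unsigned int cpu;

	for (cpu = 0; cpu < NR_CPU_IDS; cpu++)
		freed += cpu_possible(cpu) ? unit_size - size_sum : unit_size;

	printf("chunk=%zu bytes, returned to bootmem=%zu bytes\n",
	       (size_t)NR_CPU_IDS * unit_size, freed);
	return 0;
}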
 +
 +/**
 + * pcpu_4k_first_chunk - map the first chunk using PAGE_SIZE pages
 + * @static_size: the size of static percpu area in bytes
 + * @reserved_size: the size of reserved percpu area in bytes
 + * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
 + * @free_fn: function to free percpu page, always called with PAGE_SIZE
 + * @populate_pte_fn: function to populate pte
 + *
 + * This is a helper to ease setting up a page-mapped first percpu
 + * chunk and can be called where pcpu_setup_first_chunk() is expected.
 + *
 + * This is the basic allocator.  The static percpu area is allocated
 + * page-by-page into the vmalloc area.
 + *
 + * RETURNS:
 + * The determined pcpu_unit_size which can be used to initialize
 + * percpu access on success, -errno on failure.
 + */
 +ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size,
 +                                 pcpu_fc_alloc_fn_t alloc_fn,
 +                                 pcpu_fc_free_fn_t free_fn,
 +                                 pcpu_fc_populate_pte_fn_t populate_pte_fn)
 +{
 +      static struct vm_struct vm;
 +      int unit_pages;
 +      size_t pages_size;
 +      struct page **pages;
 +      unsigned int cpu;
 +      int i, j;
 +      ssize_t ret;
 +
 +      unit_pages = PFN_UP(max_t(size_t, static_size + reserved_size,
 +                                PCPU_MIN_UNIT_SIZE));
 +
 +      /* unaligned allocations can't be freed, round up to page size */
-       pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
-                              sizeof(pages[0]));
++      pages_size = PFN_ALIGN(unit_pages * nr_cpu_ids * sizeof(pages[0]));
 +      pages = alloc_bootmem(pages_size);
 +
 +      /* allocate pages */
 +      j = 0;
 +      for_each_possible_cpu(cpu)
 +              for (i = 0; i < unit_pages; i++) {
 +                      void *ptr;
 +
 +                      ptr = alloc_fn(cpu, PAGE_SIZE);
 +                      if (!ptr) {
 +                              pr_warning("PERCPU: failed to allocate "
 +                                         "4k page for cpu%u\n", cpu);
 +                              goto enomem;
 +                      }
 +                      pages[j++] = virt_to_page(ptr);
 +              }
 +
 +      /* allocate vm area, map the pages and copy static data */
 +      vm.flags = VM_ALLOC;
-       vm.size = num_possible_cpus() * unit_pages << PAGE_SHIFT;
++      vm.size = nr_cpu_ids * unit_pages << PAGE_SHIFT;
 +      vm_area_register_early(&vm, PAGE_SIZE);
 +
 +      for_each_possible_cpu(cpu) {
 +              unsigned long unit_addr = (unsigned long)vm.addr +
 +                      (cpu * unit_pages << PAGE_SHIFT);
 +
 +              for (i = 0; i < unit_pages; i++)
 +                      populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
 +
 +              /* pte already populated, the following shouldn't fail */
 +              ret = __pcpu_map_pages(unit_addr, &pages[cpu * unit_pages],
 +                                     unit_pages);
 +              if (ret < 0)
 +                      panic("failed to map percpu area, err=%zd\n", ret);
 +
 +              /*
 +               * FIXME: Archs with virtual cache should flush local
 +               * cache for the linear mapping here - something
 +               * equivalent to flush_cache_vmap() on the local cpu.
 +               * flush_cache_vmap() can't be used as most supporting
 +               * data structures are not set up yet.
 +               */
 +
 +              /* copy static data */
 +              memcpy((void *)unit_addr, __per_cpu_load, static_size);
 +      }
 +
 +      /* we're ready, commit */
 +      pr_info("PERCPU: %d 4k pages per cpu, static data %zu bytes\n",
 +              unit_pages, static_size);
 +
 +      ret = pcpu_setup_first_chunk(static_size, reserved_size, -1,
 +                                   unit_pages << PAGE_SHIFT, vm.addr, NULL);
 +      goto out_free_ar;
 +
 +enomem:
 +      while (--j >= 0)
 +              free_fn(page_address(pages[j]), PAGE_SIZE);
 +      ret = -ENOMEM;
 +out_free_ar:
 +      free_bootmem(__pa(pages), pages_size);
 +      return ret;
 +}
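A hedged sketch of the arch glue this helper expects; the callback signatures follow the pcpu_fc_*_fn_t types used above, but the pcpu4k_* names and bodies (node-blind bootmem, the x86 populate_extra_pte() helper) are simplified placeholders, not any real architecture's implementation:

/* hypothetical arch glue; pcpu4k_* names and bodies are illustrative */
static void * __init pcpu4k_alloc(unsigned int cpu, size_t size)
{
	/* node-blind bootmem allocation, one page at a time */
	return __alloc_bootmem(size, size, __pa(MAX_DMA_ADDRESS));
}

static void __init pcpu4k_free(void *ptr, size_t size)
{
	free_bootmem(__pa(ptr), size);
}

static void __init pcpu4k_populate_pte(unsigned long addr)
{
	populate_extra_pte(addr);	/* x86 helper, shown as an example */
}

static ssize_t __init pcpu4k_setup(void)
{
	return pcpu_4k_first_chunk(__per_cpu_end - __per_cpu_start,
				   PERCPU_MODULE_RESERVE,
				   pcpu4k_alloc, pcpu4k_free,
				   pcpu4k_populate_pte);
}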
 +
 +/*
 + * Large page remapping first chunk setup helper
 + */
 +#ifdef CONFIG_NEED_MULTIPLE_NODES
 +
 +/**
 + * pcpu_lpage_build_unit_map - build unit_map for large page remapping
 + * @static_size: the size of static percpu area in bytes
 + * @reserved_size: the size of reserved percpu area in bytes
 + * @dyn_sizep: in/out parameter for dynamic size, -1 for auto
 + * @unit_sizep: out parameter for unit size
 + * @unit_map: unit_map to be filled
 + * @cpu_distance_fn: callback to determine distance between cpus
 + *
 + * This function builds the cpu -> unit map and determines other
 + * parameters considering the needed percpu size, the large page size
 + * and the NUMA distances between CPUs.
 + *
 + * CPUs whose distance is LOCAL_DISTANCE in both directions are grouped
 + * together and may share units in the same large page.  The returned
 + * configuration is guaranteed to put CPUs on different nodes on
 + * different large pages and to use >=75% of the allocated virtual
 + * address space.
 + *
 + * RETURNS:
 + * On success, fills in @unit_map, sets *@dyn_sizep, *@unit_sizep and
 + * returns the number of units to be allocated.  -errno on failure.
 + */
 +int __init pcpu_lpage_build_unit_map(size_t static_size, size_t reserved_size,
 +                                   ssize_t *dyn_sizep, size_t *unit_sizep,
 +                                   size_t lpage_size, int *unit_map,
 +                                   pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
 +{
 +      static int group_map[NR_CPUS] __initdata;
 +      static int group_cnt[NR_CPUS] __initdata;
 +      int group_cnt_max = 0;
 +      size_t size_sum, min_unit_size, alloc_size;
 +      int upa, max_upa, uninitialized_var(best_upa);  /* units_per_alloc */
 +      int last_allocs;
 +      unsigned int cpu, tcpu;
 +      int group, unit;
 +
 +      /*
 +       * Determine min_unit_size, alloc_size and max_upa such that
 +       * alloc_size is a multiple of lpage_size and is the smallest
 +       * size which can accommodate 4k aligned segments equal to or
 +       * larger than min_unit_size.
 +       */
 +      size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, dyn_sizep);
 +      min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
 +
 +      alloc_size = roundup(min_unit_size, lpage_size);
 +      upa = alloc_size / min_unit_size;
 +      while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
 +              upa--;
 +      max_upa = upa;
 +
 +      /* group cpus according to their proximity */
 +      for_each_possible_cpu(cpu) {
 +              group = 0;
 +      next_group:
 +              for_each_possible_cpu(tcpu) {
 +                      if (cpu == tcpu)
 +                              break;
 +                      if (group_map[tcpu] == group &&
 +                          (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
 +                           cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
 +                              group++;
 +                              goto next_group;
 +                      }
 +              }
 +              group_map[cpu] = group;
 +              group_cnt[group]++;
 +              group_cnt_max = max(group_cnt_max, group_cnt[group]);
 +      }
 +
 +      /*
 +       * Expand unit size until address space usage goes over 75%
 +       * and then as much as possible without using more address
 +       * space.
 +       */
 +      last_allocs = INT_MAX;
 +      for (upa = max_upa; upa; upa--) {
 +              int allocs = 0, wasted = 0;
 +
 +              if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
 +                      continue;
 +
 +              for (group = 0; group_cnt[group]; group++) {
 +                      int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
 +                      allocs += this_allocs;
 +                      wasted += this_allocs * upa - group_cnt[group];
 +              }
 +
 +              /*
 +               * Don't accept if wastage is over 25%.  The
 +               * greater-than comparison ensures upa==1 always
 +               * passes the following check.
 +               */
 +              if (wasted > num_possible_cpus() / 3)
 +                      continue;
 +
 +              /* and then don't consume more memory */
 +              if (allocs > last_allocs)
 +                      break;
 +              last_allocs = allocs;
 +              best_upa = upa;
 +      }
 +      *unit_sizep = alloc_size / best_upa;
  
 -      return pcpu_setup_first_chunk(pcpue_get_page, static_size,
 -                                    reserved_size, dyn_size,
 -                                    pcpue_unit_size, pcpue_ptr, NULL);
 +      /* assign units to cpus accordingly */
 +      unit = 0;
 +      for (group = 0; group_cnt[group]; group++) {
 +              for_each_possible_cpu(cpu)
 +                      if (group_map[cpu] == group)
 +                              unit_map[cpu] = unit++;
 +              unit = roundup(unit, best_upa);
 +      }
 +
 +      return unit;    /* unit contains aligned number of units */
 +}
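A hypothetical cpu_distance_fn for the builder above; it simply reuses the NUMA node distance of the two CPUs' nodes, with early_cpu_to_node() standing in for whatever early CPU-to-node mapping the arch provides:

/* hypothetical distance callback; early_cpu_to_node() is the x86 early
 * mapping, other archs would substitute their own */
static int __init pcpul_cpu_distance(unsigned int from, unsigned int to)
{
	return node_distance(early_cpu_to_node(from),
			     early_cpu_to_node(to));
}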
 +
 +struct pcpul_ent {
 +      void            *ptr;
 +      void            *map_addr;
 +};
 +
 +static size_t pcpul_size;
 +static size_t pcpul_lpage_size;
 +static int pcpul_nr_lpages;
 +static struct pcpul_ent *pcpul_map;
 +
 +static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map,
 +                                   unsigned int *cpup)
 +{
 +      unsigned int cpu;
 +
 +      for_each_possible_cpu(cpu)
 +              if (unit_map[cpu] == unit) {
 +                      if (cpup)
 +                              *cpup = cpu;
 +                      return true;
 +              }
 +
 +      return false;
 +}
 +
 +static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size,
 +                                      size_t reserved_size, size_t dyn_size,
 +                                      size_t unit_size, size_t lpage_size,
 +                                      const int *unit_map, int nr_units)
 +{
 +      int width = 1, v = nr_units;
 +      char empty_str[] = "--------";
 +      int upl, lpl;   /* units per lpage, lpage per line */
 +      unsigned int cpu;
 +      int lpage, unit;
 +
 +      while (v /= 10)
 +              width++;
 +      empty_str[min_t(int, width, sizeof(empty_str) - 1)] = '\0';
 +
 +      upl = max_t(int, lpage_size / unit_size, 1);
 +      lpl = rounddown_pow_of_two(max_t(int, 60 / (upl * (width + 1) + 2), 1));
 +
 +      printk("%spcpu-lpage: sta/res/dyn=%zu/%zu/%zu unit=%zu lpage=%zu", lvl,
 +             static_size, reserved_size, dyn_size, unit_size, lpage_size);
 +
 +      for (lpage = 0, unit = 0; unit < nr_units; unit++) {
 +              if (!(unit % upl)) {
 +                      if (!(lpage++ % lpl)) {
 +                              printk("\n");
 +                              printk("%spcpu-lpage: ", lvl);
 +                      } else
 +                              printk("| ");
 +              }
 +              if (pcpul_unit_to_cpu(unit, unit_map, &cpu))
 +                      printk("%0*d ", width, cpu);
 +              else
 +                      printk("%s ", empty_str);
 +      }
 +      printk("\n");
 +}
 +
 +/**
 + * pcpu_lpage_first_chunk - remap the first percpu chunk using large page
 + * @static_size: the size of static percpu area in bytes
 + * @reserved_size: the size of reserved percpu area in bytes
 + * @dyn_size: free size for dynamic allocation in bytes
 + * @unit_size: unit size in bytes
 + * @lpage_size: the size of a large page
 + * @unit_map: cpu -> unit mapping
 + * @nr_units: the number of units
 + * @alloc_fn: function to allocate percpu lpage, always called with lpage_size
 + * @free_fn: function to free percpu memory, @size <= lpage_size
 + * @map_fn: function to map percpu lpage, always called with lpage_size
 + *
 + * This allocator uses large page to build and map the first chunk.
 + * Unlike other helpers, the caller should always specify @dyn_size
 + * and @unit_size.  These parameters along with @unit_map and
 + * @nr_units can be determined using pcpu_lpage_build_unit_map().
 + * This two-stage initialization allows arch code to evaluate the
 + * parameters before committing to them.
 + *
 + * Large pages are allocated as directed by @unit_map and other
 + * parameters and mapped into vmalloc space.  Unused holes are returned
 + * to the page allocator.  Note that these holes end up mapped twice -
 + * once in the linear physical mapping and once in the vmalloc area of
 + * the first percpu chunk.  Depending on the architecture, this might
 + * cause problems when changing page attributes of the returned area.
 + * These double mapped areas can be detected using
 + * pcpu_lpage_remapped().
 + *
 + * RETURNS:
 + * The determined pcpu_unit_size which can be used to initialize
 + * percpu access on success, -errno on failure.
 + */
 +ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size,
 +                                    size_t dyn_size, size_t unit_size,
 +                                    size_t lpage_size, const int *unit_map,
 +                                    int nr_units,
 +                                    pcpu_fc_alloc_fn_t alloc_fn,
 +                                    pcpu_fc_free_fn_t free_fn,
 +                                    pcpu_fc_map_fn_t map_fn)
 +{
 +      static struct vm_struct vm;
 +      size_t chunk_size = unit_size * nr_units;
 +      size_t map_size;
 +      unsigned int cpu;
 +      ssize_t ret;
 +      int i, j, unit;
 +
 +      pcpul_lpage_dump_cfg(KERN_DEBUG, static_size, reserved_size, dyn_size,
 +                           unit_size, lpage_size, unit_map, nr_units);
 +
 +      BUG_ON(chunk_size % lpage_size);
 +
 +      pcpul_size = static_size + reserved_size + dyn_size;
 +      pcpul_lpage_size = lpage_size;
 +      pcpul_nr_lpages = chunk_size / lpage_size;
 +
 +      /* allocate pointer array and alloc large pages */
 +      map_size = pcpul_nr_lpages * sizeof(pcpul_map[0]);
 +      pcpul_map = alloc_bootmem(map_size);
 +
 +      /* allocate all pages */
 +      for (i = 0; i < pcpul_nr_lpages; i++) {
 +              size_t offset = i * lpage_size;
 +              int first_unit = offset / unit_size;
 +              int last_unit = (offset + lpage_size - 1) / unit_size;
 +              void *ptr;
 +
 +              /* find out which cpu is mapped to this unit */
 +              for (unit = first_unit; unit <= last_unit; unit++)
 +                      if (pcpul_unit_to_cpu(unit, unit_map, &cpu))
 +                              goto found;
 +              continue;
 +      found:
 +              ptr = alloc_fn(cpu, lpage_size);
 +              if (!ptr) {
 +                      pr_warning("PERCPU: failed to allocate large page "
 +                                 "for cpu%u\n", cpu);
 +                      goto enomem;
 +              }
 +
 +              pcpul_map[i].ptr = ptr;
 +      }
 +
 +      /* return unused holes */
 +      for (unit = 0; unit < nr_units; unit++) {
 +              size_t start = unit * unit_size;
 +              size_t end = start + unit_size;
 +              size_t off, next;
 +
 +              /* don't free used part of occupied unit */
 +              if (pcpul_unit_to_cpu(unit, unit_map, NULL))
 +                      start += pcpul_size;
 +
 +              /* unit can span more than one page, punch the holes */
 +              for (off = start; off < end; off = next) {
 +                      void *ptr = pcpul_map[off / lpage_size].ptr;
 +                      next = min(roundup(off + 1, lpage_size), end);
 +                      if (ptr)
 +                              free_fn(ptr + off % lpage_size, next - off);
 +              }
 +      }
 +
 +      /* allocate address, map and copy */
 +      vm.flags = VM_ALLOC;
 +      vm.size = chunk_size;
 +      vm_area_register_early(&vm, unit_size);
 +
 +      for (i = 0; i < pcpul_nr_lpages; i++) {
 +              if (!pcpul_map[i].ptr)
 +                      continue;
 +              pcpul_map[i].map_addr = vm.addr + i * lpage_size;
 +              map_fn(pcpul_map[i].ptr, lpage_size, pcpul_map[i].map_addr);
 +      }
 +
 +      for_each_possible_cpu(cpu)
 +              memcpy(vm.addr + unit_map[cpu] * unit_size, __per_cpu_load,
 +                     static_size);
 +
 +      /* we're ready, commit */
 +      pr_info("PERCPU: Remapped at %p with large pages, static data "
 +              "%zu bytes\n", vm.addr, static_size);
 +
 +      ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
 +                                   unit_size, vm.addr, unit_map);
 +
 +      /*
 +       * Sort pcpul_map array for pcpu_lpage_remapped().  Unmapped
 +       * lpages are pushed to the end and trimmed.
 +       */
 +      for (i = 0; i < pcpul_nr_lpages - 1; i++)
 +              for (j = i + 1; j < pcpul_nr_lpages; j++) {
 +                      struct pcpul_ent tmp;
 +
 +                      if (!pcpul_map[j].ptr)
 +                              continue;
 +                      if (pcpul_map[i].ptr &&
 +                          pcpul_map[i].ptr < pcpul_map[j].ptr)
 +                              continue;
 +
 +                      tmp = pcpul_map[i];
 +                      pcpul_map[i] = pcpul_map[j];
 +                      pcpul_map[j] = tmp;
 +              }
 +
 +      while (pcpul_nr_lpages && !pcpul_map[pcpul_nr_lpages - 1].ptr)
 +              pcpul_nr_lpages--;
 +
 +      return ret;
 +
 +enomem:
 +      for (i = 0; i < pcpul_nr_lpages; i++)
 +              if (pcpul_map[i].ptr)
 +                      free_fn(pcpul_map[i].ptr, lpage_size);
 +      free_bootmem(__pa(pcpul_map), map_size);
 +      return -ENOMEM;
 +}
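A hedged sketch of the two-stage setup an arch might perform; PMD_SIZE stands in for the large page size, pcpul_cpu_distance is the distance callback sketched earlier, and lpage_alloc/lpage_free/lpage_map are placeholder callbacks rather than real kernel functions:

/* two-stage setup sketch; lpage_alloc/lpage_free/lpage_map are
 * placeholders and PMD_SIZE is assumed to be the large page size */
static int pcpul_unit_map[NR_CPUS] __initdata;

static ssize_t __init pcpul_setup(size_t static_size)
{
	ssize_t dyn_size = PERCPU_DYNAMIC_RESERVE;
	size_t unit_size;
	int nr_units;

	nr_units = pcpu_lpage_build_unit_map(static_size,
					     PERCPU_MODULE_RESERVE,
					     &dyn_size, &unit_size,
					     PMD_SIZE, pcpul_unit_map,
					     pcpul_cpu_distance);
	if (nr_units < 0)
		return nr_units;

	return pcpu_lpage_first_chunk(static_size, PERCPU_MODULE_RESERVE,
				      dyn_size, unit_size, PMD_SIZE,
				      pcpul_unit_map, nr_units,
				      lpage_alloc, lpage_free, lpage_map);
}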
 +
 +/**
 + * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area
 + * @kaddr: the kernel address in question
 + *
 + * Determine whether @kaddr falls in the pcpul recycled area.  This is
 + * used by pageattr to detect VM aliases and break up the pcpu large
 + * page mapping such that the same physical page is not mapped under
 + * different attributes.
 + *
 + * The recycled area is always at the tail of a partially used large
 + * page.
 + *
 + * RETURNS:
 + * Address of corresponding remapped pcpu address if match is found;
 + * otherwise, NULL.
 + */
 +void *pcpu_lpage_remapped(void *kaddr)
 +{
 +      unsigned long lpage_mask = pcpul_lpage_size - 1;
 +      void *lpage_addr = (void *)((unsigned long)kaddr & ~lpage_mask);
 +      unsigned long offset = (unsigned long)kaddr & lpage_mask;
 +      int left = 0, right = pcpul_nr_lpages - 1;
 +      int pos;
 +
 +      /* pcpul in use at all? */
 +      if (!pcpul_map)
 +              return NULL;
 +
 +      /* okay, perform binary search */
 +      while (left <= right) {
 +              pos = (left + right) / 2;
 +
 +              if (pcpul_map[pos].ptr < lpage_addr)
 +                      left = pos + 1;
 +              else if (pcpul_map[pos].ptr > lpage_addr)
 +                      right = pos - 1;
 +              else
 +                      return pcpul_map[pos].map_addr + offset;
 +      }
 +
 +      return NULL;
 +}
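The lookup itself is a plain binary search over the sorted map; a standalone sketch with an invented 2MB large page size and table contents:

#include <stdio.h>
#include <stdint.h>

#define LPAGE_SIZE	(2UL << 20)

struct ent { uintptr_t lpage; uintptr_t map_addr; };	/* sorted by lpage */

static uintptr_t remapped(const struct ent *map, int n, uintptr_t kaddr)
{
	uintptr_t base = kaddr & ~(LPAGE_SIZE - 1);
	uintptr_t off = kaddr & (LPAGE_SIZE - 1);
	int l = 0, r = n - 1;

	while (l <= r) {
		int m = (l + r) / 2;

		if (map[m].lpage < base)
			l = m + 1;
		else if (map[m].lpage > base)
			r = m - 1;
		else
			return map[m].map_addr + off;	/* alias found */
	}
	return 0;
}

int main(void)
{
	const struct ent map[] = {
		{ 0x40000000, 0xf0000000 },
		{ 0x40200000, 0xf0200000 },
	};

	/* prints 0xf0234567 */
	printf("%#lx\n", (unsigned long)remapped(map, 2, 0x40234567));
	return 0;
}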
 +#endif
 +
 +/*
 + * Generic percpu area setup.
 + *
 + * The embedding helper is used because its behavior closely resembles
 + * the original non-dynamic generic percpu area setup.  This is
 + * important because many archs have addressing restrictions and might
 + * fail if the percpu area is located far away from its previous
 + * location.  As an added bonus, in non-NUMA cases, embedding is
 + * generally a good idea TLB-wise because the percpu area can piggyback
 + * on the physical linear memory mapping, which uses large page
 + * mappings on applicable archs.
 + */
 +#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
 +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
 +EXPORT_SYMBOL(__per_cpu_offset);
 +
 +void __init setup_per_cpu_areas(void)
 +{
 +      size_t static_size = __per_cpu_end - __per_cpu_start;
 +      ssize_t unit_size;
 +      unsigned long delta;
 +      unsigned int cpu;
 +
 +      /*
 +       * Always reserve area for module percpu variables.  That's
 +       * what the legacy allocator did.
 +       */
 +      unit_size = pcpu_embed_first_chunk(static_size, PERCPU_MODULE_RESERVE,
 +                                         PERCPU_DYNAMIC_RESERVE);
 +      if (unit_size < 0)
 +              panic("Failed to initialize percpu areas.");
 +
 +      delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
 +      for_each_possible_cpu(cpu)
 +              __per_cpu_offset[cpu] = delta + cpu * unit_size;
  }
 +#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
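To make the offset arithmetic in the loop above concrete, a standalone sketch with made-up addresses; delta rebases the linker-provided static addresses onto the first chunk and each CPU's copy sits unit_size bytes after the previous one:

#include <stdio.h>

int main(void)
{
	unsigned long per_cpu_start = 0xc1600000UL;	/* __per_cpu_start */
	unsigned long base_addr     = 0xf8400000UL;	/* pcpu_base_addr  */
	unsigned long unit_size     = 0x10000UL;	/* 64k per cpu     */
	unsigned long delta = base_addr - per_cpu_start;
	unsigned int cpu;

	for (cpu = 0; cpu < 4; cpu++)
		printf("cpu%u offset = %#lx\n", cpu, delta + cpu * unit_size);
	return 0;
}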
diff --combined mm/slub.c
+++ b/mm/slub.c
@@@ -21,7 -21,6 +21,6 @@@
  #include <linux/kmemcheck.h>
  #include <linux/cpu.h>
  #include <linux/cpuset.h>
- #include <linux/kmemleak.h>
  #include <linux/mempolicy.h>
  #include <linux/ctype.h>
  #include <linux/debugobjects.h>
@@@ -2092,8 -2091,8 +2091,8 @@@ init_kmem_cache_node(struct kmem_cache_
   */
  #define NR_KMEM_CACHE_CPU 100
  
 -static DEFINE_PER_CPU(struct kmem_cache_cpu,
 -                              kmem_cache_cpu)[NR_KMEM_CACHE_CPU];
 +static DEFINE_PER_CPU(struct kmem_cache_cpu [NR_KMEM_CACHE_CPU],
 +                    kmem_cache_cpu);
  
  static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free);
  static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS);
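The per-cpu declaration change above moves the array bound into the type argument; a minimal sketch of the resulting idiom with a hypothetical variable:

/* hypothetical per-cpu array using the same idiom */
DEFINE_PER_CPU(int [4], demo_slots);

static int demo_read_slot0(unsigned int cpu)
{
	return per_cpu(demo_slots, cpu)[0];
}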
@@@ -2595,6 -2594,8 +2594,8 @@@ static inline int kmem_cache_close(stru
   */
  void kmem_cache_destroy(struct kmem_cache *s)
  {
+       if (s->flags & SLAB_DESTROY_BY_RCU)
+               rcu_barrier();
        down_write(&slub_lock);
        s->refcount--;
        if (!s->refcount) {
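The rcu_barrier() added above matters for caches created with SLAB_DESTROY_BY_RCU, whose objects may still be reached by lockless readers until a grace period passes; a hedged sketch of the pattern it protects, with hypothetical cache and module names:

#include <linux/module.h>
#include <linux/slab.h>

struct demo_obj { int id; };

static struct kmem_cache *demo_cachep;

static int __init demo_init(void)
{
	demo_cachep = kmem_cache_create("demo_obj", sizeof(struct demo_obj),
					0, SLAB_DESTROY_BY_RCU, NULL);
	return demo_cachep ? 0 : -ENOMEM;
}

static void __exit demo_exit(void)
{
	/* rcu_barrier() in kmem_cache_destroy() waits for pending RCU
	 * frees before the cache itself is torn down */
	kmem_cache_destroy(demo_cachep);
}

module_init(demo_init);
module_exit(demo_exit);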
@@@ -2833,13 -2834,15 +2834,15 @@@ EXPORT_SYMBOL(__kmalloc)
  static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
  {
        struct page *page;
+       void *ptr = NULL;
  
        flags |= __GFP_COMP | __GFP_NOTRACK;
        page = alloc_pages_node(node, flags, get_order(size));
        if (page)
-               return page_address(page);
-       else
-               return NULL;
+               ptr = page_address(page);
+       kmemleak_alloc(ptr, size, 1, flags);
+       return ptr;
  }
  
  #ifdef CONFIG_NUMA
@@@ -2924,6 -2927,7 +2927,7 @@@ void kfree(const void *x
        page = virt_to_head_page(x);
        if (unlikely(!PageSlab(page))) {
                BUG_ON(!PageCompound(page));
+               kmemleak_free(x);
                put_page(page);
                return;
        }