slub: tid must be retrieved from the percpu area of the current processor

[platform/adaptation/renesas_rcar/renesas_kernel.git] / mm / slub.c
diff --git a/mm/slub.c b/mm/slub.c

index 487f0bd..4df2c0c 100644 (file)
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -31,6 +31,7 @@
  #include <linux/fault-inject.h>
  #include <linux/stacktrace.h>
  #include <linux/prefetch.h>
+#include <linux/memcontrol.h>
  
  #include <trace/events/kmem.h>
  
@@ -112,9 +113,6 @@
   *                     the fast path and disables lockless freelists.
   */
  
-#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
-               SLAB_TRACE | SLAB_DEBUG_FREE)
-
  static inline int kmem_cache_debug(struct kmem_cache *s)
  {
  #ifdef CONFIG_SLUB_DEBUG
@@ -179,8 +177,6 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
  #define __OBJECT_POISON                0x80000000UL /* Poison object */
  #define __CMPXCHG_DOUBLE       0x40000000UL /* Use cmpxchg_double */
  
-static int kmem_size = sizeof(struct kmem_cache);
-
  #ifdef CONFIG_SMP
  static struct notifier_block slab_notifier;
  #endif
@@ -205,13 +201,14 @@ enum track_item { TRACK_ALLOC, TRACK_FREE };
  static int sysfs_slab_add(struct kmem_cache *);
  static int sysfs_slab_alias(struct kmem_cache *, const char *);
  static void sysfs_slab_remove(struct kmem_cache *);
-
+static void memcg_propagate_slab_attrs(struct kmem_cache *s);
  #else
  static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
  static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
                                                         { return 0; }
  static inline void sysfs_slab_remove(struct kmem_cache *s) { }
  
+static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { }
  #endif
  
  static inline void stat(const struct kmem_cache *s, enum stat_item si)
@@ -1008,7 +1005,7 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
          * dilemma by deferring the increment of the count during
          * bootstrap (see early_kmem_cache_node_alloc).
          */
-       if (n) {
+       if (likely(n)) {
                 atomic_long_inc(&n->nr_slabs);
                 atomic_long_add(objects, &n->total_objects);
         }
@@ -1092,11 +1089,11 @@ static noinline struct kmem_cache_node *free_debug_processing(
         if (!check_object(s, page, object, SLUB_RED_ACTIVE))
                 goto out;
  
-       if (unlikely(s != page->slab)) {
+       if (unlikely(s != page->slab_cache)) {
                 if (!PageSlab(page)) {
                         slab_err(s, page, "Attempt to free object(0x%p) "
                                 "outside of slab", object);
-               } else if (!page->slab) {
+               } else if (!page->slab_cache) {
                         printk(KERN_ERR
                                 "SLUB <none>: no slab for object 0x%p.\n",
                                                 object);
@@ -1348,6 +1345,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
         void *start;
         void *last;
         void *p;
+       int order;
  
         BUG_ON(flags & GFP_SLAB_BUG_MASK);
  
@@ -1356,8 +1354,10 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
         if (!page)
                 goto out;
  
+       order = compound_order(page);
         inc_slabs_node(s, page_to_nid(page), page->objects);
-       page->slab = s;
+       memcg_bind_pages(s, order);
+       page->slab_cache = s;
         __SetPageSlab(page);
         if (page->pfmemalloc)
                 SetPageSlabPfmemalloc(page);
@@ -1365,7 +1365,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
         start = page_address(page);
  
         if (unlikely(s->flags & SLAB_POISON))
-               memset(start, POISON_INUSE, PAGE_SIZE << compound_order(page));
+               memset(start, POISON_INUSE, PAGE_SIZE << order);
  
         last = start;
         for_each_object(p, s, start, page->objects) {
@@ -1406,10 +1406,12 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
  
         __ClearPageSlabPfmemalloc(page);
         __ClearPageSlab(page);
+
+       memcg_release_pages(s, order);
         reset_page_mapcount(page);
         if (current->reclaim_state)
                 current->reclaim_state->reclaimed_slab += pages;
-       __free_pages(page, order);
+       __free_memcg_kmem_pages(page, order);
  }
  
  #define need_reserve_slab_rcu                                          \
@@ -1424,7 +1426,7 @@ static void rcu_free_slab(struct rcu_head *h)
         else
                 page = container_of((struct list_head *)h, struct page, lru);
  
-       __free_slab(page->slab, page);
+       __free_slab(page->slab_cache, page);
  }
  
  static void free_slab(struct kmem_cache *s, struct page *page)
@@ -1491,7 +1493,7 @@ static inline void remove_partial(struct kmem_cache_node *n,
   */
  static inline void *acquire_slab(struct kmem_cache *s,
                 struct kmem_cache_node *n, struct page *page,
-               int mode)
+               int mode, int *objects)
  {
         void *freelist;
         unsigned long counters;
@@ -1505,6 +1507,7 @@ static inline void *acquire_slab(struct kmem_cache *s,
         freelist = page->freelist;
         counters = page->counters;
         new.counters = counters;
+       *objects = new.objects - new.inuse;
         if (mode) {
                 new.inuse = page->objects;
                 new.freelist = NULL;
@@ -1526,7 +1529,7 @@ static inline void *acquire_slab(struct kmem_cache *s,
         return freelist;
  }
  
-static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain);
+static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain);
  static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags);
  
  /*
@@ -1537,6 +1540,8 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
  {
         struct page *page, *page2;
         void *object = NULL;
+       int available = 0;
+       int objects;
  
         /*
          * Racy check. If we mistakenly see no partial slabs then we
@@ -1550,22 +1555,21 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
         spin_lock(&n->list_lock);
         list_for_each_entry_safe(page, page2, &n->partial, lru) {
                 void *t;
-               int available;
  
                 if (!pfmemalloc_match(page, flags))
                         continue;
  
-               t = acquire_slab(s, n, page, object == NULL);
+               t = acquire_slab(s, n, page, object == NULL, &objects);
                 if (!t)
                         break;
  
+               available += objects;
                 if (!object) {
                         c->page = page;
                         stat(s, ALLOC_FROM_PARTIAL);
                         object = t;
-                       available =  page->objects - page->inuse;
                 } else {
-                       available = put_cpu_partial(s, page, 0);
+                       put_cpu_partial(s, page, 0);
                         stat(s, CPU_PARTIAL_NODE);
                 }
                 if (kmem_cache_debug(s) || available > s->cpu_partial / 2)
@@ -1872,12 +1876,14 @@ redo:
  /*
   * Unfreeze all the cpu partial slabs.
   *
- * This function must be called with interrupt disabled.
+ * This function must be called with interrupts disabled
+ * for the cpu using c (or with some other mechanism in place
+ * that guarantees no concurrent accesses).
   */
-static void unfreeze_partials(struct kmem_cache *s)
+static void unfreeze_partials(struct kmem_cache *s,
+               struct kmem_cache_cpu *c)
  {
         struct kmem_cache_node *n = NULL, *n2 = NULL;
-       struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab);
         struct page *page, *discard_page = NULL;
  
         while ((page = c->partial)) {
@@ -1942,7 +1948,7 @@ static void unfreeze_partials(struct kmem_cache *s)
   * If we did not find a slot then simply move all the partials to the
   * per node partial list.
   */
-static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
+static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
  {
         struct page *oldpage;
         int pages;
@@ -1963,7 +1969,7 @@ static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
                                  * set to the per node partial list.
                                  */
                                 local_irq_save(flags);
-                               unfreeze_partials(s);
+                               unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
                                 local_irq_restore(flags);
                                 oldpage = NULL;
                                 pobjects = 0;
@@ -1980,7 +1986,6 @@ static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
                 page->next = oldpage;
  
         } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage);
-       return pobjects;
  }
  
  static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
@@ -2006,7 +2011,7 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
                 if (c->page)
                         flush_slab(s, c);
  
-               unfreeze_partials(s);
+               unfreeze_partials(s, c);
         }
  }
  
@@ -2037,7 +2042,7 @@ static void flush_all(struct kmem_cache *s)
  static inline int node_match(struct page *page, int node)
  {
  #ifdef CONFIG_NUMA
-       if (node != NUMA_NO_NODE && page_to_nid(page) != node)
+       if (!page || (node != NUMA_NO_NODE && page_to_nid(page) != node))
                 return 0;
  #endif
         return 1;
@@ -2325,14 +2330,20 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s,
         if (slab_pre_alloc_hook(s, gfpflags))
                 return NULL;
  
+       s = memcg_kmem_get_cache(s, gfpflags);
  redo:
-
         /*
          * Must read kmem_cache cpu data via this cpu ptr. Preemption is
          * enabled. We may switch back and forth between cpus while
          * reading from one cpu area. That does not matter as long
          * as we end up on the original cpu again when doing the cmpxchg.
+        *
+        * Preemption is disabled for the retrieval of the tid because that
+        * must occur from the current processor. We cannot allow rescheduling
+        * on a different processor between the determination of the pointer
+        * and the retrieval of the tid.
          */
+       preempt_disable();
         c = __this_cpu_ptr(s->cpu_slab);
  
         /*
@@ -2342,7 +2353,7 @@ redo:
          * linked list in between.
          */
         tid = c->tid;
-       barrier();
+       preempt_enable();
  
         object = c->freelist;
         page = c->page;
@@ -2459,7 +2470,6 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
         void *prior;
         void **object = (void *)x;
         int was_frozen;
-       int inuse;
         struct page new;
         unsigned long counters;
         struct kmem_cache_node *n = NULL;
@@ -2472,13 +2482,17 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
                 return;
  
         do {
+               if (unlikely(n)) {
+                       spin_unlock_irqrestore(&n->list_lock, flags);
+                       n = NULL;
+               }
                 prior = page->freelist;
                 counters = page->counters;
                 set_freepointer(s, object, prior);
                 new.counters = counters;
                 was_frozen = new.frozen;
                 new.inuse--;
-               if ((!new.inuse || !prior) && !was_frozen && !n) {
+               if ((!new.inuse || !prior) && !was_frozen) {
  
                         if (!kmem_cache_debug(s) && !prior)
  
@@ -2503,7 +2517,6 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
  
                         }
                 }
-               inuse = new.inuse;
  
         } while (!cmpxchg_double_slab(s, page,
                 prior, counters,
@@ -2529,25 +2542,17 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
                  return;
          }
  
+       if (unlikely(!new.inuse && n->nr_partial > s->min_partial))
+               goto slab_empty;
+
         /*
-        * was_frozen may have been set after we acquired the list_lock in
-        * an earlier loop. So we need to check it here again.
+        * Objects left in the slab. If it was not on the partial list before
+        * then add it.
          */
-       if (was_frozen)
-               stat(s, FREE_FROZEN);
-       else {
-               if (unlikely(!inuse && n->nr_partial > s->min_partial))
-                        goto slab_empty;
-
-               /*
-                * Objects left in the slab. If it was not on the partial list before
-                * then add it.
-                */
-               if (unlikely(!prior)) {
-                       remove_full(s, page);
-                       add_partial(n, page, DEACTIVATE_TO_TAIL);
-                       stat(s, FREE_ADD_PARTIAL);
-               }
+       if (kmem_cache_debug(s) && unlikely(!prior)) {
+               remove_full(s, page);
+               add_partial(n, page, DEACTIVATE_TO_TAIL);
+               stat(s, FREE_ADD_PARTIAL);
         }
         spin_unlock_irqrestore(&n->list_lock, flags);
         return;
@@ -2595,10 +2600,11 @@ redo:
          * data is retrieved via this pointer. If we are on the same cpu
          * during the cmpxchg then the free will succedd.
          */
+       preempt_disable();
         c = __this_cpu_ptr(s->cpu_slab);
  
         tid = c->tid;
-       barrier();
+       preempt_enable();
  
         if (likely(page == c->page)) {
                 set_freepointer(s, object, c->freelist);
@@ -2619,19 +2625,10 @@ redo:
  
  void kmem_cache_free(struct kmem_cache *s, void *x)
  {
-       struct page *page;
-
-       page = virt_to_head_page(x);
-
-       if (kmem_cache_debug(s) && page->slab != s) {
-               pr_err("kmem_cache_free: Wrong slab cache. %s but object"
-                       " is from  %s\n", page->slab->name, s->name);
-               WARN_ON_ONCE(1);
+       s = cache_from_obj(s, x);
+       if (!s)
                 return;
-       }
-
-       slab_free(s, page, x, _RET_IP_);
-
+       slab_free(s, virt_to_head_page(x), x, _RET_IP_);
         trace_kmem_cache_free(_RET_IP_, x);
  }
  EXPORT_SYMBOL(kmem_cache_free);
@@ -2769,32 +2766,6 @@ static inline int calculate_order(int size, int reserved)
         return -ENOSYS;
  }
  
-/*
- * Figure out what the alignment of the objects will be.
- */
-static unsigned long calculate_alignment(unsigned long flags,
-               unsigned long align, unsigned long size)
-{
-       /*
-        * If the user wants hardware cache aligned objects then follow that
-        * suggestion if the object is sufficiently large.
-        *
-        * The hardware cache alignment cannot override the specified
-        * alignment though. If that is greater then use it.
-        */
-       if (flags & SLAB_HWCACHE_ALIGN) {
-               unsigned long ralign = cache_line_size();
-               while (size <= ralign / 2)
-                       ralign /= 2;
-               align = max(align, ralign);
-       }
-
-       if (align < ARCH_SLAB_MINALIGN)
-               align = ARCH_SLAB_MINALIGN;
-
-       return ALIGN(align, sizeof(void *));
-}
-
  static void
  init_kmem_cache_node(struct kmem_cache_node *n)
  {
@@ -2811,7 +2782,7 @@ init_kmem_cache_node(struct kmem_cache_node *n)
  static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
  {
         BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
-                       SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu));
+                       KMALLOC_SHIFT_HIGH * sizeof(struct kmem_cache_cpu));
  
         /*
          * Must align to double word boundary for the double cmpxchg
@@ -2928,7 +2899,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
  {
         unsigned long flags = s->flags;
         unsigned long size = s->object_size;
-       unsigned long align = s->align;
         int order;
  
         /*
@@ -3000,19 +2970,11 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
  #endif
  
         /*
-        * Determine the alignment based on various parameters that the
-        * user specified and the dynamic determination of cache line size
-        * on bootup.
-        */
-       align = calculate_alignment(flags, align, s->object_size);
-       s->align = align;
-
-       /*
          * SLUB stores one object immediately after another beginning from
          * offset 0. In order to align the objects we have to simply size
          * each object to conform to the alignment.
          */
-       size = ALIGN(size, align);
+       size = ALIGN(size, s->align);
         s->size = size;
         if (forced_order >= 0)
                 order = forced_order;
@@ -3027,7 +2989,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
                 s->allocflags |= __GFP_COMP;
  
         if (s->flags & SLAB_CACHE_DMA)
-               s->allocflags |= SLUB_DMA;
+               s->allocflags |= GFP_DMA;
  
         if (s->flags & SLAB_RECLAIM_ACCOUNT)
                 s->allocflags |= __GFP_RECLAIMABLE;
@@ -3041,7 +3003,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
                 s->max = s->oo;
  
         return !!oo_objects(s->oo);
-
  }
  
  static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
@@ -3127,15 +3088,6 @@ error:
         return -EINVAL;
  }
  
-/*
- * Determine the size of a slab object
- */
-unsigned int kmem_cache_size(struct kmem_cache *s)
-{
-       return s->object_size;
-}
-EXPORT_SYMBOL(kmem_cache_size);
-
  static void list_slab_objects(struct kmem_cache *s, struct page *page,
                                                         const char *text)
  {
@@ -3208,8 +3160,19 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
  {
         int rc = kmem_cache_close(s);
  
-       if (!rc)
+       if (!rc) {
+               /*
+                * We use the same lock strategy as for sysfs_slab_add, see
+                * __kmem_cache_create. Because this is pretty much the last
+                * operation we do and the lock will be released shortly after
+                * that in slab_common.c, we could just move sysfs_slab_remove
+                * to a later point in common code. We should do that when we
+                * have a common sysfs framework for all allocators.
+                */
+               mutex_unlock(&slab_mutex);
                 sysfs_slab_remove(s);
+               mutex_lock(&slab_mutex);
+       }
  
         return rc;
  }
@@ -3218,13 +3181,6 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
   *             Kmalloc subsystem
   *******************************************************************/
  
-struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT];
-EXPORT_SYMBOL(kmalloc_caches);
-
-#ifdef CONFIG_ZONE_DMA
-static struct kmem_cache *kmalloc_dma_caches[SLUB_PAGE_SHIFT];
-#endif
-
  static int __init setup_slub_min_order(char *str)
  {
         get_option(&str, &slub_min_order);
@@ -3261,99 +3217,15 @@ static int __init setup_slub_nomerge(char *str)
  
  __setup("slub_nomerge", setup_slub_nomerge);
  
-static struct kmem_cache *__init create_kmalloc_cache(const char *name,
-                                               int size, unsigned int flags)
-{
-       struct kmem_cache *s;
-
-       s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
-
-       s->name = name;
-       s->size = s->object_size = size;
-       s->align = ARCH_KMALLOC_MINALIGN;
-
-       /*
-        * This function is called with IRQs disabled during early-boot on
-        * single CPU so there's no need to take slab_mutex here.
-        */
-       if (kmem_cache_open(s, flags))
-               goto panic;
-
-       list_add(&s->list, &slab_caches);
-       return s;
-
-panic:
-       panic("Creation of kmalloc slab %s size=%d failed.\n", name, size);
-       return NULL;
-}
-
-/*
- * Conversion table for small slabs sizes / 8 to the index in the
- * kmalloc array. This is necessary for slabs < 192 since we have non power
- * of two cache sizes there. The size of larger slabs can be determined using
- * fls.
- */
-static s8 size_index[24] = {
-       3,      /* 8 */
-       4,      /* 16 */
-       5,      /* 24 */
-       5,      /* 32 */
-       6,      /* 40 */
-       6,      /* 48 */
-       6,      /* 56 */
-       6,      /* 64 */
-       1,      /* 72 */
-       1,      /* 80 */
-       1,      /* 88 */
-       1,      /* 96 */
-       7,      /* 104 */
-       7,      /* 112 */
-       7,      /* 120 */
-       7,      /* 128 */
-       2,      /* 136 */
-       2,      /* 144 */
-       2,      /* 152 */
-       2,      /* 160 */
-       2,      /* 168 */
-       2,      /* 176 */
-       2,      /* 184 */
-       2       /* 192 */
-};
-
-static inline int size_index_elem(size_t bytes)
-{
-       return (bytes - 1) / 8;
-}
-
-static struct kmem_cache *get_slab(size_t size, gfp_t flags)
-{
-       int index;
-
-       if (size <= 192) {
-               if (!size)
-                       return ZERO_SIZE_PTR;
-
-               index = size_index[size_index_elem(size)];
-       } else
-               index = fls(size - 1);
-
-#ifdef CONFIG_ZONE_DMA
-       if (unlikely((flags & SLUB_DMA)))
-               return kmalloc_dma_caches[index];
-
-#endif
-       return kmalloc_caches[index];
-}
-
  void *__kmalloc(size_t size, gfp_t flags)
  {
         struct kmem_cache *s;
         void *ret;
  
-       if (unlikely(size > SLUB_MAX_SIZE))
+       if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
                 return kmalloc_large(size, flags);
  
-       s = get_slab(size, flags);
+       s = kmalloc_slab(size, flags);
  
         if (unlikely(ZERO_OR_NULL_PTR(s)))
                 return s;
@@ -3372,7 +3244,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
         struct page *page;
         void *ptr = NULL;
  
-       flags |= __GFP_COMP | __GFP_NOTRACK;
+       flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG;
         page = alloc_pages_node(node, flags, get_order(size));
         if (page)
                 ptr = page_address(page);
@@ -3386,7 +3258,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
         struct kmem_cache *s;
         void *ret;
  
-       if (unlikely(size > SLUB_MAX_SIZE)) {
+       if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
                 ret = kmalloc_large_node(size, flags, node);
  
                 trace_kmalloc_node(_RET_IP_, ret,
@@ -3396,7 +3268,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
                 return ret;
         }
  
-       s = get_slab(size, flags);
+       s = kmalloc_slab(size, flags);
  
         if (unlikely(ZERO_OR_NULL_PTR(s)))
                 return s;
@@ -3424,7 +3296,7 @@ size_t ksize(const void *object)
                 return PAGE_SIZE << compound_order(page);
         }
  
-       return slab_ksize(page->slab);
+       return slab_ksize(page->slab_cache);
  }
  EXPORT_SYMBOL(ksize);
  
@@ -3449,8 +3321,8 @@ bool verify_mem_not_deleted(const void *x)
         }
  
         slab_lock(page);
-       if (on_freelist(page->slab, page, object)) {
-               object_err(page->slab, page, object, "Object is on free-list");
+       if (on_freelist(page->slab_cache, page, object)) {
+               object_err(page->slab_cache, page, object, "Object is on free-list");
                 rv = false;
         } else {
                 rv = true;
@@ -3478,10 +3350,10 @@ void kfree(const void *x)
         if (unlikely(!PageSlab(page))) {
                 BUG_ON(!PageCompound(page));
                 kmemleak_free(x);
-               __free_pages(page, compound_order(page));
+               __free_memcg_kmem_pages(page, compound_order(page));
                 return;
         }
-       slab_free(page->slab, page, object, _RET_IP_);
+       slab_free(page->slab_cache, page, object, _RET_IP_);
  }
  EXPORT_SYMBOL(kfree);
  
@@ -3676,193 +3548,85 @@ static int slab_memory_callback(struct notifier_block *self,
  
  /*
   * Used for early kmem_cache structures that were allocated using
- * the page allocator
+ * the page allocator. Allocate them properly then fix up the pointers
+ * that may be pointing to the wrong kmem_cache structure.
   */
  
-static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s)
+static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
  {
         int node;
+       struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
  
-       list_add(&s->list, &slab_caches);
-       s->refcount = -1;
+       memcpy(s, static_cache, kmem_cache->object_size);
  
+       /*
+        * This runs very early, and only the boot processor is supposed to be
+        * up.  Even if that weren't true, IRQs are not up, so we couldn't
+        * send IPIs around anyway.
+        */
+       __flush_cpu_slab(s, smp_processor_id());
         for_each_node_state(node, N_NORMAL_MEMORY) {
                 struct kmem_cache_node *n = get_node(s, node);
                 struct page *p;
  
                 if (n) {
                         list_for_each_entry(p, &n->partial, lru)
-                               p->slab = s;
+                               p->slab_cache = s;
  
  #ifdef CONFIG_SLUB_DEBUG
                         list_for_each_entry(p, &n->full, lru)
-                               p->slab = s;
+                               p->slab_cache = s;
  #endif
                 }
         }
+       list_add(&s->list, &slab_caches);
+       return s;
  }
  
  void __init kmem_cache_init(void)
  {
-       int i;
-       int caches = 0;
-       struct kmem_cache *temp_kmem_cache;
-       int order;
-       struct kmem_cache *temp_kmem_cache_node;
-       unsigned long kmalloc_size;
+       static __initdata struct kmem_cache boot_kmem_cache,
+               boot_kmem_cache_node;
  
         if (debug_guardpage_minorder())
                 slub_max_order = 0;
  
-       kmem_size = offsetof(struct kmem_cache, node) +
-                       nr_node_ids * sizeof(struct kmem_cache_node *);
-
-       /* Allocate two kmem_caches from the page allocator */
-       kmalloc_size = ALIGN(kmem_size, cache_line_size());
-       order = get_order(2 * kmalloc_size);
-       kmem_cache = (void *)__get_free_pages(GFP_NOWAIT | __GFP_ZERO, order);
+       kmem_cache_node = &boot_kmem_cache_node;
+       kmem_cache = &boot_kmem_cache;
  
-       /*
-        * Must first have the slab cache available for the allocations of the
-        * struct kmem_cache_node's. There is special bootstrap code in
-        * kmem_cache_open for slab_state == DOWN.
-        */
-       kmem_cache_node = (void *)kmem_cache + kmalloc_size;
-
-       kmem_cache_node->name = "kmem_cache_node";
-       kmem_cache_node->size = kmem_cache_node->object_size =
-               sizeof(struct kmem_cache_node);
-       kmem_cache_open(kmem_cache_node, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
+       create_boot_cache(kmem_cache_node, "kmem_cache_node",
+               sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN);
  
         hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
  
         /* Able to allocate the per node structures */
         slab_state = PARTIAL;
  
-       temp_kmem_cache = kmem_cache;
-       kmem_cache->name = "kmem_cache";
-       kmem_cache->size = kmem_cache->object_size = kmem_size;
-       kmem_cache_open(kmem_cache, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
+       create_boot_cache(kmem_cache, "kmem_cache",
+                       offsetof(struct kmem_cache, node) +
+                               nr_node_ids * sizeof(struct kmem_cache_node *),
+                      SLAB_HWCACHE_ALIGN);
  
-       kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
-       memcpy(kmem_cache, temp_kmem_cache, kmem_size);
+       kmem_cache = bootstrap(&boot_kmem_cache);
  
         /*
          * Allocate kmem_cache_node properly from the kmem_cache slab.
          * kmem_cache_node is separately allocated so no need to
          * update any list pointers.
          */
-       temp_kmem_cache_node = kmem_cache_node;
-
-       kmem_cache_node = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
-       memcpy(kmem_cache_node, temp_kmem_cache_node, kmem_size);
-
-       kmem_cache_bootstrap_fixup(kmem_cache_node);
-
-       caches++;
-       kmem_cache_bootstrap_fixup(kmem_cache);
-       caches++;
-       /* Free temporary boot structure */
-       free_pages((unsigned long)temp_kmem_cache, order);
+       kmem_cache_node = bootstrap(&boot_kmem_cache_node);
  
         /* Now we can use the kmem_cache to allocate kmalloc slabs */
-
-       /*
-        * Patch up the size_index table if we have strange large alignment
-        * requirements for the kmalloc array. This is only the case for
-        * MIPS it seems. The standard arches will not generate any code here.
-        *
-        * Largest permitted alignment is 256 bytes due to the way we
-        * handle the index determination for the smaller caches.
-        *
-        * Make sure that nothing crazy happens if someone starts tinkering
-        * around with ARCH_KMALLOC_MINALIGN
-        */
-       BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
-               (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
-
-       for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) {
-               int elem = size_index_elem(i);
-               if (elem >= ARRAY_SIZE(size_index))
-                       break;
-               size_index[elem] = KMALLOC_SHIFT_LOW;
-       }
-
-       if (KMALLOC_MIN_SIZE == 64) {
-               /*
-                * The 96 byte size cache is not used if the alignment
-                * is 64 byte.
-                */
-               for (i = 64 + 8; i <= 96; i += 8)
-                       size_index[size_index_elem(i)] = 7;
-       } else if (KMALLOC_MIN_SIZE == 128) {
-               /*
-                * The 192 byte sized cache is not used if the alignment
-                * is 128 byte. Redirect kmalloc to use the 256 byte cache
-                * instead.
-                */
-               for (i = 128 + 8; i <= 192; i += 8)
-                       size_index[size_index_elem(i)] = 8;
-       }
-
-       /* Caches that are not of the two-to-the-power-of size */
-       if (KMALLOC_MIN_SIZE <= 32) {
-               kmalloc_caches[1] = create_kmalloc_cache("kmalloc-96", 96, 0);
-               caches++;
-       }
-
-       if (KMALLOC_MIN_SIZE <= 64) {
-               kmalloc_caches[2] = create_kmalloc_cache("kmalloc-192", 192, 0);
-               caches++;
-       }
-
-       for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
-               kmalloc_caches[i] = create_kmalloc_cache("kmalloc", 1 << i, 0);
-               caches++;
-       }
-
-       slab_state = UP;
-
-       /* Provide the correct kmalloc names now that the caches are up */
-       if (KMALLOC_MIN_SIZE <= 32) {
-               kmalloc_caches[1]->name = kstrdup(kmalloc_caches[1]->name, GFP_NOWAIT);
-               BUG_ON(!kmalloc_caches[1]->name);
-       }
-
-       if (KMALLOC_MIN_SIZE <= 64) {
-               kmalloc_caches[2]->name = kstrdup(kmalloc_caches[2]->name, GFP_NOWAIT);
-               BUG_ON(!kmalloc_caches[2]->name);
-       }
-
-       for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
-               char *s = kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i);
-
-               BUG_ON(!s);
-               kmalloc_caches[i]->name = s;
-       }
+       create_kmalloc_caches(0);
  
  #ifdef CONFIG_SMP
         register_cpu_notifier(&slab_notifier);
  #endif
  
-#ifdef CONFIG_ZONE_DMA
-       for (i = 0; i < SLUB_PAGE_SHIFT; i++) {
-               struct kmem_cache *s = kmalloc_caches[i];
-
-               if (s && s->size) {
-                       char *name = kasprintf(GFP_NOWAIT,
-                                "dma-kmalloc-%d", s->object_size);
-
-                       BUG_ON(!name);
-                       kmalloc_dma_caches[i] = create_kmalloc_cache(name,
-                               s->object_size, SLAB_CACHE_DMA);
-               }
-       }
-#endif
         printk(KERN_INFO
-               "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d,"
+               "SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d,"
                 " CPUs=%d, Nodes=%d\n",
-               caches, cache_line_size(),
+               cache_line_size(),
                 slub_min_order, slub_max_order, slub_min_objects,
                 nr_cpu_ids, nr_node_ids);
  }
@@ -3891,7 +3655,7 @@ static int slab_unmergeable(struct kmem_cache *s)
         return 0;
  }
  
-static struct kmem_cache *find_mergeable(size_t size,
+static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size,
                 size_t align, unsigned long flags, const char *name,
                 void (*ctor)(void *))
  {
@@ -3927,17 +3691,21 @@ static struct kmem_cache *find_mergeable(size_t size,
                 if (s->size - size >= sizeof(void *))
                         continue;
  
+               if (!cache_match_memcg(s, memcg))
+                       continue;
+
                 return s;
         }
         return NULL;
  }
  
-struct kmem_cache *__kmem_cache_alias(const char *name, size_t size,
-               size_t align, unsigned long flags, void (*ctor)(void *))
+struct kmem_cache *
+__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
+                  size_t align, unsigned long flags, void (*ctor)(void *))
  {
         struct kmem_cache *s;
  
-       s = find_mergeable(size, align, flags, name, ctor);
+       s = find_mergeable(memcg, size, align, flags, name, ctor);
         if (s) {
                 s->refcount++;
                 /*
@@ -3964,6 +3732,11 @@ int __kmem_cache_create(struct kmem_cache *s, unsigned long flags)
         if (err)
                 return err;
  
+       /* Mutex is not taken during early boot */
+       if (slab_state <= UP)
+               return 0;
+
+       memcg_propagate_slab_attrs(s);
         mutex_unlock(&slab_mutex);
         err = sysfs_slab_add(s);
         mutex_lock(&slab_mutex);
@@ -4016,10 +3789,10 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
         struct kmem_cache *s;
         void *ret;
  
-       if (unlikely(size > SLUB_MAX_SIZE))
+       if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
                 return kmalloc_large(size, gfpflags);
  
-       s = get_slab(size, gfpflags);
+       s = kmalloc_slab(size, gfpflags);
  
         if (unlikely(ZERO_OR_NULL_PTR(s)))
                 return s;
@@ -4039,7 +3812,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
         struct kmem_cache *s;
         void *ret;
  
-       if (unlikely(size > SLUB_MAX_SIZE)) {
+       if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
                 ret = kmalloc_large_node(size, gfpflags, node);
  
                 trace_kmalloc_node(caller, ret,
@@ -4049,7 +3822,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
                 return ret;
         }
  
-       s = get_slab(size, gfpflags);
+       s = kmalloc_slab(size, gfpflags);
  
         if (unlikely(ZERO_OR_NULL_PTR(s)))
                 return s;
@@ -4398,7 +4171,7 @@ static void resiliency_test(void)
  {
         u8 *p;
  
-       BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || SLUB_PAGE_SHIFT < 10);
+       BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10);
  
         printk(KERN_ERR "SLUB resiliency testing\n");
         printk(KERN_ERR "-----------------------\n");
@@ -5197,10 +4970,95 @@ static ssize_t slab_attr_store(struct kobject *kobj,
                 return -EIO;
  
         err = attribute->store(s, buf, len);
+#ifdef CONFIG_MEMCG_KMEM
+       if (slab_state >= FULL && err >= 0 && is_root_cache(s)) {
+               int i;
  
+               mutex_lock(&slab_mutex);
+               if (s->max_attr_size < len)
+                       s->max_attr_size = len;
+
+               /*
+                * This is a best effort propagation, so this function's return
+                * value will be determined by the parent cache only. This is
+                * basically because not all attributes will have well-defined
+                * semantics for rollbacks - most of the actions will
+                * have permanent effects.
+                *
+                * Returning the error value of any of the children that fail
+                * is not 100 % defined, in the sense that users seeing the
+                * error code won't be able to know anything about the state of
+                * the cache.
+                *
+                * Only returning the error code for the parent cache at least
+                * has well defined semantics. The cache being written to
+                * directly either failed or succeeded, in which case we loop
+                * through the descendants with best-effort propagation.
+                */
+               for_each_memcg_cache_index(i) {
+                       struct kmem_cache *c = cache_from_memcg(s, i);
+                       if (c)
+                               attribute->store(c, buf, len);
+               }
+               mutex_unlock(&slab_mutex);
+       }
+#endif
         return err;
  }
  
+/*
+ * Seed a freshly created per-memcg cache @s with the current sysfs attribute
+ * values of its root cache. Root caches are left untouched.
+ */
+static void memcg_propagate_slab_attrs(struct kmem_cache *s)
+{
+#ifdef CONFIG_MEMCG_KMEM
+       int i;
+       char *buffer = NULL;
+       struct kmem_cache *root_cache;
+
+       /* Only non-root caches have a root cache to inherit attributes from */
+       if (is_root_cache(s))
+               return;
+
+       root_cache = s->memcg_params->root_cache;
+       /* No attribute was ever written to the root: only defaults to copy */
+       if (!root_cache->max_attr_size)
+               return;
+
+       for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) {
+               char mbuf[64];
+               char *buf;
+               struct slab_attribute *attr = to_slab_attr(slab_attrs[i]);
+
+               if (!attr || !attr->store || !attr->show)
+                       continue;
+
+               /*
+                * Avoid allocating unless the largest value ever written does
+                * not fit on the stack (sysfs permits up to a page). Once a
+                * page is allocated, reuse it for the remaining attributes.
+                */
+               if (buffer)
+                       buf = buffer;
+               else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf))
+                       buf = mbuf;
+               else {
+                       buffer = (char *) get_zeroed_page(GFP_KERNEL);
+                       if (WARN_ON(!buffer))
+                               continue;
+                       buf = buffer;
+               }
+
+               attr->show(root_cache, buf);
+               attr->store(s, buf, strlen(buf));
+       }
+
+       if (buffer)
+               free_page((unsigned long)buffer);
+#endif
+}
+
  static const struct sysfs_ops slab_sysfs_ops = {
         .show = slab_attr_show,
         .store = slab_attr_store,
@@ -5257,6 +5115,12 @@ static char *create_unique_id(struct kmem_cache *s)
         if (p != name + 1)
                 *p++ = '-';
         p += sprintf(p, "%07d", s->size);
+
+#ifdef CONFIG_MEMCG_KMEM
+       if (!is_root_cache(s))
+               p += sprintf(p, "-%08d", memcg_cache_id(s->memcg_params->memcg));
+#endif
+
         BUG_ON(p > name + ID_STR_LENGTH - 1);
         return name;
  }
@@ -5265,13 +5129,8 @@ static int sysfs_slab_add(struct kmem_cache *s)
  {
         int err;
         const char *name;
-       int unmergeable;
-
-       if (slab_state < FULL)
-               /* Defer until later */
-               return 0;
+       int unmergeable = slab_unmergeable(s);
  
-       unmergeable = slab_unmergeable(s);
         if (unmergeable) {
                 /*
                  * Slabcache can never be merged so we can use the name proper.
@@ -5405,49 +5264,14 @@ __initcall(slab_sysfs_init);
   * The /proc/slabinfo ABI
   */
  #ifdef CONFIG_SLABINFO
-static void print_slabinfo_header(struct seq_file *m)
-{
-       seq_puts(m, "slabinfo - version: 2.1\n");
-       seq_puts(m, "# name            <active_objs> <num_objs> <object_size> "
-                "<objperslab> <pagesperslab>");
-       seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
-       seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
-       seq_putc(m, '\n');
-}
-
-static void *s_start(struct seq_file *m, loff_t *pos)
-{
-       loff_t n = *pos;
-
-       mutex_lock(&slab_mutex);
-       if (!n)
-               print_slabinfo_header(m);
-
-       return seq_list_start(&slab_caches, *pos);
-}
-
-static void *s_next(struct seq_file *m, void *p, loff_t *pos)
-{
-       return seq_list_next(p, &slab_caches, pos);
-}
-
-static void s_stop(struct seq_file *m, void *p)
-{
-       mutex_unlock(&slab_mutex);
-}
-
-static int s_show(struct seq_file *m, void *p)
+void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
  {
         unsigned long nr_partials = 0;
         unsigned long nr_slabs = 0;
-       unsigned long nr_inuse = 0;
         unsigned long nr_objs = 0;
         unsigned long nr_free = 0;
-       struct kmem_cache *s;
         int node;
  
-       s = list_entry(p, struct kmem_cache, list);
-
         for_each_online_node(node) {
                 struct kmem_cache_node *n = get_node(s, node);
  
@@ -5460,41 +5284,21 @@ static int s_show(struct seq_file *m, void *p)
                 nr_free += count_partial(n, count_free);
         }
  
-       nr_inuse = nr_objs - nr_free;
-
-       seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, nr_inuse,
-                  nr_objs, s->size, oo_objects(s->oo),
-                  (1 << oo_order(s->oo)));
-       seq_printf(m, " : tunables %4u %4u %4u", 0, 0, 0);
-       seq_printf(m, " : slabdata %6lu %6lu %6lu", nr_slabs, nr_slabs,
-                  0UL);
-       seq_putc(m, '\n');
-       return 0;
+       sinfo->active_objs = nr_objs - nr_free;
+       sinfo->num_objs = nr_objs;
+       sinfo->active_slabs = nr_slabs;
+       sinfo->num_slabs = nr_slabs;
+       sinfo->objects_per_slab = oo_objects(s->oo);
+       sinfo->cache_order = oo_order(s->oo);
  }
  
-static const struct seq_operations slabinfo_op = {
-       .start = s_start,
-       .next = s_next,
-       .stop = s_stop,
-       .show = s_show,
-};
-
-static int slabinfo_open(struct inode *inode, struct file *file)
+void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s)
  {
-       return seq_open(file, &slabinfo_op);
  }
  
-static const struct file_operations proc_slabinfo_operations = {
-       .open           = slabinfo_open,
-       .read           = seq_read,
-       .llseek         = seq_lseek,
-       .release        = seq_release,
-};
-
-static int __init slab_proc_init(void)
+ssize_t slabinfo_write(struct file *file, const char __user *buffer,
+                      size_t count, loff_t *ppos)
  {
-       proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations);
-       return 0;
+       return -EIO;
  }
-module_init(slab_proc_init);
  #endif /* CONFIG_SLABINFO */