mm/slub.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * SLUB: A slab allocator that limits cache line use instead of queuing
   4  * objects in per cpu and per node lists.
   5  *
   6  * The allocator synchronizes using per slab locks or atomic operations
   7  * and only uses a centralized lock to manage a pool of partial slabs.
   8  *
   9  * (C) 2007 SGI, Christoph Lameter
  10  * (C) 2011 Linux Foundation, Christoph Lameter
  11  */
  12
  13 #include <linux/mm.h>
  14 #include <linux/swap.h> /* struct reclaim_state */
  15 #include <linux/module.h>
  16 #include <linux/bit_spinlock.h>
  17 #include <linux/interrupt.h>
  18 #include <linux/swab.h>
  19 #include <linux/bitops.h>
  20 #include <linux/slab.h>
  21 #include "slab.h"
  22 #include <linux/proc_fs.h>
  23 #include <linux/seq_file.h>
  24 #include <linux/kasan.h>
  25 #include <linux/cpu.h>
  26 #include <linux/cpuset.h>
  27 #include <linux/mempolicy.h>
  28 #include <linux/ctype.h>
  29 #include <linux/debugobjects.h>
  30 #include <linux/kallsyms.h>
  31 #include <linux/kfence.h>
  32 #include <linux/memory.h>
  33 #include <linux/math64.h>
  34 #include <linux/fault-inject.h>
  35 #include <linux/stacktrace.h>
  36 #include <linux/prefetch.h>
  37 #include <linux/memcontrol.h>
  38 #include <linux/random.h>
  39 #include <kunit/test.h>
  40
  41 #include <linux/debugfs.h>
  42 #include <trace/events/kmem.h>
  43
  44 #include "internal.h"
  45
  46 /*
  47  * Lock order:
  48  *   1. slab_mutex (Global Mutex)
  49  *   2. node->list_lock
  50  *   3. slab_lock(page) (Only on some arches and for debugging)
  51  *
  52  *   slab_mutex
  53  *
  54  *   The role of the slab_mutex is to protect the list of all the slabs
  55  *   and to synchronize major metadata changes to slab cache structures.
  56  *
  57  *   The slab_lock is only used for debugging and on arches that do not
  58  *   have the ability to do a cmpxchg_double. It only protects:
  59  *      A. page->freelist       -> List of object free in a page
  60  *      B. page->inuse          -> Number of objects in use
  61  *      C. page->objects        -> Number of objects in page
  62  *      D. page->frozen         -> frozen state
  63  *
  64  *   If a slab is frozen then it is exempt from list management. It is not
  65  *   on any list except per cpu partial list. The processor that froze the
  66  *   slab is the one who can perform list operations on the page. Other
  67  *   processors may put objects onto the freelist but the processor that
  68  *   froze the slab is the only one that can retrieve the objects from the
  69  *   page's freelist.
  70  *
  71  *   The list_lock protects the partial and full list on each node and
  72  *   the partial slab counter. If taken then no new slabs may be added or
  73  *   removed from the lists nor make the number of partial slabs be modified.
  74  *   (Note that the total number of slabs is an atomic value that may be
  75  *   modified without taking the list lock).
  76  *
  77  *   The list_lock is a centralized lock and thus we avoid taking it as
  78  *   much as possible. As long as SLUB does not have to handle partial
  79  *   slabs, operations can continue without any centralized lock. F.e.
  80  *   allocating a long series of objects that fill up slabs does not require
  81  *   the list lock.
  82  *   Interrupts are disabled during allocation and deallocation in order to
  83  *   make the slab allocator safe to use in the context of an irq. In addition
  84  *   interrupts are disabled to ensure that the processor does not change
  85  *   while handling per_cpu slabs, due to kernel preemption.
  86  *
  87  * SLUB assigns one slab for allocation to each processor.
  88  * Allocations only occur from these slabs called cpu slabs.
  89  *
  90  * Slabs with free elements are kept on a partial list and during regular
  91  * operations no list for full slabs is used. If an object in a full slab is
  92  * freed then the slab will show up again on the partial lists.
  93  * We track full slabs for debugging purposes though because otherwise we
  94  * cannot scan all objects.
  95  *
  96  * Slabs are freed when they become empty. Teardown and setup is
  97  * minimal so we rely on the page allocators per cpu caches for
  98  * fast frees and allocs.
  99  *
 100  * page->frozen         The slab is frozen and exempt from list processing.
 101  *                      This means that the slab is dedicated to a purpose
 102  *                      such as satisfying allocations for a specific
 103  *                      processor. Objects may be freed in the slab while
 104  *                      it is frozen but slab_free will then skip the usual
 105  *                      list operations. It is up to the processor holding
 106  *                      the slab to integrate the slab into the slab lists
 107  *                      when the slab is no longer needed.
 108  *
 109  *                      One use of this flag is to mark slabs that are
 110  *                      used for allocations. Then such a slab becomes a cpu
 111  *                      slab. The cpu slab may be equipped with an additional
 112  *                      freelist that allows lockless access to
 113  *                      free objects in addition to the regular freelist
 114  *                      that requires the slab lock.
 115  *
 116  * SLAB_DEBUG_FLAGS     Slab requires special handling due to debug
 117  *                      options set. This moves slab handling out of
 118  *                      the fast path and disables lockless freelists.
 119  */
 120
 121 #ifdef CONFIG_SLUB_DEBUG
 122 #ifdef CONFIG_SLUB_DEBUG_ON
 123 DEFINE_STATIC_KEY_TRUE(slub_debug_enabled);
 124 #else
 125 DEFINE_STATIC_KEY_FALSE(slub_debug_enabled);
 126 #endif
 127 #endif          /* CONFIG_SLUB_DEBUG */
 128
 129 static inline bool kmem_cache_debug(struct kmem_cache *s)
 130 {
 131         return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS);
 132 }
 133
 134 void *fixup_red_left(struct kmem_cache *s, void *p)
 135 {
 136         if (kmem_cache_debug_flags(s, SLAB_RED_ZONE))
 137                 p += s->red_left_pad;
 138
 139         return p;
 140 }
 141
 142 static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
 143 {
 144 #ifdef CONFIG_SLUB_CPU_PARTIAL
 145         return !kmem_cache_debug(s);
 146 #else
 147         return false;
 148 #endif
 149 }
 150
 151 /*
 152  * Issues still to be resolved:
 153  *
 154  * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
 155  *
 156  * - Variable sizing of the per node arrays
 157  */
 158
 159 /* Enable to log cmpxchg failures */
 160 #undef SLUB_DEBUG_CMPXCHG
 161
 162 /*
 163  * Minimum number of partial slabs. These will be left on the partial
 164  * lists even if they are empty. kmem_cache_shrink may reclaim them.
 165  */
 166 #define MIN_PARTIAL 5
 167
 168 /*
 169  * Maximum number of desirable partial slabs.
 170  * The existence of more partial slabs makes kmem_cache_shrink
 171  * sort the partial list by the number of objects in use.
 172  */
 173 #define MAX_PARTIAL 10
 174
 175 #define DEBUG_DEFAULT_FLAGS (SLAB_CONSISTENCY_CHECKS | SLAB_RED_ZONE | \
 176                                 SLAB_POISON | SLAB_STORE_USER)
 177
 178 /*
 179  * These debug flags cannot use CMPXCHG because there might be consistency
 180  * issues when checking or reading debug information
 181  */
 182 #define SLAB_NO_CMPXCHG (SLAB_CONSISTENCY_CHECKS | SLAB_STORE_USER | \
 183                                 SLAB_TRACE)
 184
 185
 186 /*
 187  * Debugging flags that require metadata to be stored in the slab.  These get
 188  * disabled when slub_debug=O is used and a cache's min order increases with
 189  * metadata.
 190  */
 191 #define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
 192
 193 #define OO_SHIFT        16
 194 #define OO_MASK         ((1 << OO_SHIFT) - 1)
 195 #define MAX_OBJS_PER_PAGE       32767 /* since page.objects is u15 */
 196
 197 /* Internal SLUB flags */
 198 /* Poison object */
 199 #define __OBJECT_POISON         ((slab_flags_t __force)0x80000000U)
 200 /* Use cmpxchg_double */
 201 #define __CMPXCHG_DOUBLE        ((slab_flags_t __force)0x40000000U)
 202
 203 /*
 204  * Tracking user of a slab.
 205  */
 206 #define TRACK_ADDRS_COUNT 16
 207 struct track {
 208         unsigned long addr;     /* Called from address */
 209 #ifdef CONFIG_STACKTRACE
 210         unsigned long addrs[TRACK_ADDRS_COUNT]; /* Called from address */
 211 #endif
 212         int cpu;                /* Was running on cpu */
 213         int pid;                /* Pid context */
 214         unsigned long when;     /* When did the operation occur */
 215 };
 216
 217 enum track_item { TRACK_ALLOC, TRACK_FREE };
 218
 219 #ifdef CONFIG_SYSFS
 220 static int sysfs_slab_add(struct kmem_cache *);
 221 static int sysfs_slab_alias(struct kmem_cache *, const char *);
 222 #else
 223 static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
 224 static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
 225                                                         { return 0; }
 226 #endif
 227
 228 #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_SLUB_DEBUG)
 229 static void debugfs_slab_add(struct kmem_cache *);
 230 #else
 231 static inline void debugfs_slab_add(struct kmem_cache *s) { }
 232 #endif
 233
 234 static inline void stat(const struct kmem_cache *s, enum stat_item si)
 235 {
 236 #ifdef CONFIG_SLUB_STATS
 237         /*
 238          * The rmw is racy on a preemptible kernel but this is acceptable, so
 239          * avoid this_cpu_add()'s irq-disable overhead.
 240          */
 241         raw_cpu_inc(s->cpu_slab->stat[si]);
 242 #endif
 243 }
 244
 245 /*
 246  * Tracks for which NUMA nodes we have kmem_cache_nodes allocated.
 247  * Corresponds to node_state[N_NORMAL_MEMORY], but can temporarily
 248  * differ during memory hotplug/hotremove operations.
 249  * Protected by slab_mutex.
 250  */
 251 static nodemask_t slab_nodes;
 252
 253 /********************************************************************
 254  *                      Core slab cache functions
 255  *******************************************************************/
 256
 257 /*
 258  * Returns freelist pointer (ptr). With hardening, this is obfuscated
 259  * with an XOR of the address where the pointer is held and a per-cache
 260  * random number.
 261  */
 262 static inline void *freelist_ptr(const struct kmem_cache *s, void *ptr,
 263                                  unsigned long ptr_addr)
 264 {
 265 #ifdef CONFIG_SLAB_FREELIST_HARDENED
 266         /*
 267          * When CONFIG_KASAN_SW/HW_TAGS is enabled, ptr_addr might be tagged.
 268          * Normally, this doesn't cause any issues, as both set_freepointer()
 269          * and get_freepointer() are called with a pointer with the same tag.
 270          * However, there are some issues with CONFIG_SLUB_DEBUG code. For
 271          * example, when __free_slub() iterates over objects in a cache, it
 272          * passes untagged pointers to check_object(). check_object() in turns
 273          * calls get_freepointer() with an untagged pointer, which causes the
 274          * freepointer to be restored incorrectly.
 275          */
 276         return (void *)((unsigned long)ptr ^ s->random ^
 277                         swab((unsigned long)kasan_reset_tag((void *)ptr_addr)));
 278 #else
 279         return ptr;
 280 #endif
 281 }
 282
 283 /* Returns the freelist pointer recorded at location ptr_addr. */
 284 static inline void *freelist_dereference(const struct kmem_cache *s,
 285                                          void *ptr_addr)
 286 {
 287         return freelist_ptr(s, (void *)*(unsigned long *)(ptr_addr),
 288                             (unsigned long)ptr_addr);
 289 }
 290
 291 static inline void *get_freepointer(struct kmem_cache *s, void *object)
 292 {
 293         object = kasan_reset_tag(object);
 294         return freelist_dereference(s, object + s->offset);
 295 }
 296
 297 static void prefetch_freepointer(const struct kmem_cache *s, void *object)
 298 {
 299         prefetch(object + s->offset);
 300 }
 301
 302 static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
 303 {
 304         unsigned long freepointer_addr;
 305         void *p;
 306
 307         if (!debug_pagealloc_enabled_static())
 308                 return get_freepointer(s, object);
 309
 310         object = kasan_reset_tag(object);
 311         freepointer_addr = (unsigned long)object + s->offset;
 312         copy_from_kernel_nofault(&p, (void **)freepointer_addr, sizeof(p));
 313         return freelist_ptr(s, p, freepointer_addr);
 314 }
 315
 316 static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
 317 {
 318         unsigned long freeptr_addr = (unsigned long)object + s->offset;
 319
 320 #ifdef CONFIG_SLAB_FREELIST_HARDENED
 321         BUG_ON(object == fp); /* naive detection of double free or corruption */
 322 #endif
 323
 324         freeptr_addr = (unsigned long)kasan_reset_tag((void *)freeptr_addr);
 325         *(void **)freeptr_addr = freelist_ptr(s, fp, freeptr_addr);
 326 }
 327
 328 /* Loop over all objects in a slab */
 329 #define for_each_object(__p, __s, __addr, __objects) \
 330         for (__p = fixup_red_left(__s, __addr); \
 331                 __p < (__addr) + (__objects) * (__s)->size; \
 332                 __p += (__s)->size)
 333
 334 static inline unsigned int order_objects(unsigned int order, unsigned int size)
 335 {
 336         return ((unsigned int)PAGE_SIZE << order) / size;
 337 }
 338
 339 static inline struct kmem_cache_order_objects oo_make(unsigned int order,
 340                 unsigned int size)
 341 {
 342         struct kmem_cache_order_objects x = {
 343                 (order << OO_SHIFT) + order_objects(order, size)
 344         };
 345
 346         return x;
 347 }
 348
 349 static inline unsigned int oo_order(struct kmem_cache_order_objects x)
 350 {
 351         return x.x >> OO_SHIFT;
 352 }
 353
 354 static inline unsigned int oo_objects(struct kmem_cache_order_objects x)
 355 {
 356         return x.x & OO_MASK;
 357 }
 358
 359 /*
 360  * Per slab locking using the pagelock
 361  */
 362 static __always_inline void __slab_lock(struct page *page)
 363 {
 364         VM_BUG_ON_PAGE(PageTail(page), page);
 365         bit_spin_lock(PG_locked, &page->flags);
 366 }
 367
 368 static __always_inline void __slab_unlock(struct page *page)
 369 {
 370         VM_BUG_ON_PAGE(PageTail(page), page);
 371         __bit_spin_unlock(PG_locked, &page->flags);
 372 }
 373
 374 static __always_inline void slab_lock(struct page *page, unsigned long *flags)
 375 {
 376         if (IS_ENABLED(CONFIG_PREEMPT_RT))
 377                 local_irq_save(*flags);
 378         __slab_lock(page);
 379 }
 380
 381 static __always_inline void slab_unlock(struct page *page, unsigned long *flags)
 382 {
 383         __slab_unlock(page);
 384         if (IS_ENABLED(CONFIG_PREEMPT_RT))
 385                 local_irq_restore(*flags);
 386 }
 387
 388 /*
 389  * Interrupts must be disabled (for the fallback code to work right), typically
 390  * by an _irqsave() lock variant. Except on PREEMPT_RT where locks are different
 391  * so we disable interrupts as part of slab_[un]lock().
 392  */
 393 static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
 394                 void *freelist_old, unsigned long counters_old,
 395                 void *freelist_new, unsigned long counters_new,
 396                 const char *n)
 397 {
 398         if (!IS_ENABLED(CONFIG_PREEMPT_RT))
 399                 lockdep_assert_irqs_disabled();
 400 #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
 401     defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
 402         if (s->flags & __CMPXCHG_DOUBLE) {
 403                 if (cmpxchg_double(&page->freelist, &page->counters,
 404                                    freelist_old, counters_old,
 405                                    freelist_new, counters_new))
 406                         return true;
 407         } else
 408 #endif
 409         {
 410                 /* init to 0 to prevent spurious warnings */
 411                 unsigned long flags = 0;
 412
 413                 slab_lock(page, &flags);
 414                 if (page->freelist == freelist_old &&
 415                                         page->counters == counters_old) {
 416                         page->freelist = freelist_new;
 417                         page->counters = counters_new;
 418                         slab_unlock(page, &flags);
 419                         return true;
 420                 }
 421                 slab_unlock(page, &flags);
 422         }
 423
 424         cpu_relax();
 425         stat(s, CMPXCHG_DOUBLE_FAIL);
 426
 427 #ifdef SLUB_DEBUG_CMPXCHG
 428         pr_info("%s %s: cmpxchg double redo ", n, s->name);
 429 #endif
 430
 431         return false;
 432 }
 433
 434 static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
 435                 void *freelist_old, unsigned long counters_old,
 436                 void *freelist_new, unsigned long counters_new,
 437                 const char *n)
 438 {
 439 #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
 440     defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
 441         if (s->flags & __CMPXCHG_DOUBLE) {
 442                 if (cmpxchg_double(&page->freelist, &page->counters,
 443                                    freelist_old, counters_old,
 444                                    freelist_new, counters_new))
 445                         return true;
 446         } else
 447 #endif
 448         {
 449                 unsigned long flags;
 450
 451                 local_irq_save(flags);
 452                 __slab_lock(page);
 453                 if (page->freelist == freelist_old &&
 454                                         page->counters == counters_old) {
 455                         page->freelist = freelist_new;
 456                         page->counters = counters_new;
 457                         __slab_unlock(page);
 458                         local_irq_restore(flags);
 459                         return true;
 460                 }
 461                 __slab_unlock(page);
 462                 local_irq_restore(flags);
 463         }
 464
 465         cpu_relax();
 466         stat(s, CMPXCHG_DOUBLE_FAIL);
 467
 468 #ifdef SLUB_DEBUG_CMPXCHG
 469         pr_info("%s %s: cmpxchg double redo ", n, s->name);
 470 #endif
 471
 472         return false;
 473 }
 474
 475 #ifdef CONFIG_SLUB_DEBUG
 476 static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)];
 477 static DEFINE_RAW_SPINLOCK(object_map_lock);
 478
 479 static void __fill_map(unsigned long *obj_map, struct kmem_cache *s,
 480                        struct page *page)
 481 {
 482         void *addr = page_address(page);
 483         void *p;
 484
 485         bitmap_zero(obj_map, page->objects);
 486
 487         for (p = page->freelist; p; p = get_freepointer(s, p))
 488                 set_bit(__obj_to_index(s, addr, p), obj_map);
 489 }
 490
 491 #if IS_ENABLED(CONFIG_KUNIT)
 492 static bool slab_add_kunit_errors(void)
 493 {
 494         struct kunit_resource *resource;
 495
 496         if (likely(!current->kunit_test))
 497                 return false;
 498
 499         resource = kunit_find_named_resource(current->kunit_test, "slab_errors");
 500         if (!resource)
 501                 return false;
 502
 503         (*(int *)resource->data)++;
 504         kunit_put_resource(resource);
 505         return true;
 506 }
 507 #else
 508 static inline bool slab_add_kunit_errors(void) { return false; }
 509 #endif
 510
 511 /*
 512  * Determine a map of object in use on a page.
 513  *
 514  * Node listlock must be held to guarantee that the page does
 515  * not vanish from under us.
 516  */
 517 static unsigned long *get_map(struct kmem_cache *s, struct page *page)
 518         __acquires(&object_map_lock)
 519 {
 520         VM_BUG_ON(!irqs_disabled());
 521
 522         raw_spin_lock(&object_map_lock);
 523
 524         __fill_map(object_map, s, page);
 525
 526         return object_map;
 527 }
 528
 529 static void put_map(unsigned long *map) __releases(&object_map_lock)
 530 {
 531         VM_BUG_ON(map != object_map);
 532         raw_spin_unlock(&object_map_lock);
 533 }
 534
 535 static inline unsigned int size_from_object(struct kmem_cache *s)
 536 {
 537         if (s->flags & SLAB_RED_ZONE)
 538                 return s->size - s->red_left_pad;
 539
 540         return s->size;
 541 }
 542
 543 static inline void *restore_red_left(struct kmem_cache *s, void *p)
 544 {
 545         if (s->flags & SLAB_RED_ZONE)
 546                 p -= s->red_left_pad;
 547
 548         return p;
 549 }
 550
 551 /*
 552  * Debug settings:
 553  */
 554 #if defined(CONFIG_SLUB_DEBUG_ON)
 555 static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS;
 556 #else
 557 static slab_flags_t slub_debug;
 558 #endif
 559
 560 static char *slub_debug_string;
 561 static int disable_higher_order_debug;
 562
 563 /*
 564  * slub is about to manipulate internal object metadata.  This memory lies
 565  * outside the range of the allocated object, so accessing it would normally
 566  * be reported by kasan as a bounds error.  metadata_access_enable() is used
 567  * to tell kasan that these accesses are OK.
 568  */
 569 static inline void metadata_access_enable(void)
 570 {
 571         kasan_disable_current();
 572 }
 573
 574 static inline void metadata_access_disable(void)
 575 {
 576         kasan_enable_current();
 577 }
 578
 579 /*
 580  * Object debugging
 581  */
 582
 583 /* Verify that a pointer has an address that is valid within a slab page */
 584 static inline int check_valid_pointer(struct kmem_cache *s,
 585                                 struct page *page, void *object)
 586 {
 587         void *base;
 588
 589         if (!object)
 590                 return 1;
 591
 592         base = page_address(page);
 593         object = kasan_reset_tag(object);
 594         object = restore_red_left(s, object);
 595         if (object < base || object >= base + page->objects * s->size ||
 596                 (object - base) % s->size) {
 597                 return 0;
 598         }
 599
 600         return 1;
 601 }
 602
 603 static void print_section(char *level, char *text, u8 *addr,
 604                           unsigned int length)
 605 {
 606         metadata_access_enable();
 607         print_hex_dump(level, text, DUMP_PREFIX_ADDRESS,
 608                         16, 1, kasan_reset_tag((void *)addr), length, 1);
 609         metadata_access_disable();
 610 }
 611
 612 /*
 613  * See comment in calculate_sizes().
 614  */
 615 static inline bool freeptr_outside_object(struct kmem_cache *s)
 616 {
 617         return s->offset >= s->inuse;
 618 }
 619
 620 /*
 621  * Return offset of the end of info block which is inuse + free pointer if
 622  * not overlapping with object.
 623  */
 624 static inline unsigned int get_info_end(struct kmem_cache *s)
 625 {
 626         if (freeptr_outside_object(s))
 627                 return s->inuse + sizeof(void *);
 628         else
 629                 return s->inuse;
 630 }
 631
 632 static struct track *get_track(struct kmem_cache *s, void *object,
 633         enum track_item alloc)
 634 {
 635         struct track *p;
 636
 637         p = object + get_info_end(s);
 638
 639         return kasan_reset_tag(p + alloc);
 640 }
 641
 642 static void set_track(struct kmem_cache *s, void *object,
 643                         enum track_item alloc, unsigned long addr)
 644 {
 645         struct track *p = get_track(s, object, alloc);
 646
 647         if (addr) {
 648 #ifdef CONFIG_STACKTRACE
 649                 unsigned int nr_entries;
 650
 651                 metadata_access_enable();
 652                 nr_entries = stack_trace_save(kasan_reset_tag(p->addrs),
 653                                               TRACK_ADDRS_COUNT, 3);
 654                 metadata_access_disable();
 655
 656                 if (nr_entries < TRACK_ADDRS_COUNT)
 657                         p->addrs[nr_entries] = 0;
 658 #endif
 659                 p->addr = addr;
 660                 p->cpu = smp_processor_id();
 661                 p->pid = current->pid;
 662                 p->when = jiffies;
 663         } else {
 664                 memset(p, 0, sizeof(struct track));
 665         }
 666 }
 667
 668 static void init_tracking(struct kmem_cache *s, void *object)
 669 {
 670         if (!(s->flags & SLAB_STORE_USER))
 671                 return;
 672
 673         set_track(s, object, TRACK_FREE, 0UL);
 674         set_track(s, object, TRACK_ALLOC, 0UL);
 675 }
 676
 677 static void print_track(const char *s, struct track *t, unsigned long pr_time)
 678 {
 679         if (!t->addr)
 680                 return;
 681
 682         pr_err("%s in %pS age=%lu cpu=%u pid=%d\n",
 683                s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid);
 684 #ifdef CONFIG_STACKTRACE
 685         {
 686                 int i;
 687                 for (i = 0; i < TRACK_ADDRS_COUNT; i++)
 688                         if (t->addrs[i])
 689                                 pr_err("\t%pS\n", (void *)t->addrs[i]);
 690                         else
 691                                 break;
 692         }
 693 #endif
 694 }
 695
 696 void print_tracking(struct kmem_cache *s, void *object)
 697 {
 698         unsigned long pr_time = jiffies;
 699         if (!(s->flags & SLAB_STORE_USER))
 700                 return;
 701
 702         print_track("Allocated", get_track(s, object, TRACK_ALLOC), pr_time);
 703         print_track("Freed", get_track(s, object, TRACK_FREE), pr_time);
 704 }
 705
 706 static void print_page_info(struct page *page)
 707 {
 708         pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n",
 709                page, page->objects, page->inuse, page->freelist,
 710                page->flags, &page->flags);
 711
 712 }
 713
 714 static void slab_bug(struct kmem_cache *s, char *fmt, ...)
 715 {
 716         struct va_format vaf;
 717         va_list args;
 718
 719         va_start(args, fmt);
 720         vaf.fmt = fmt;
 721         vaf.va = &args;
 722         pr_err("=============================================================================\n");
 723         pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf);
 724         pr_err("-----------------------------------------------------------------------------\n\n");
 725         va_end(args);
 726 }
 727
 728 __printf(2, 3)
 729 static void slab_fix(struct kmem_cache *s, char *fmt, ...)
 730 {
 731         struct va_format vaf;
 732         va_list args;
 733
 734         if (slab_add_kunit_errors())
 735                 return;
 736
 737         va_start(args, fmt);
 738         vaf.fmt = fmt;
 739         vaf.va = &args;
 740         pr_err("FIX %s: %pV\n", s->name, &vaf);
 741         va_end(args);
 742 }
 743
 744 static bool freelist_corrupted(struct kmem_cache *s, struct page *page,
 745                                void **freelist, void *nextfree)
 746 {
 747         if ((s->flags & SLAB_CONSISTENCY_CHECKS) &&
 748             !check_valid_pointer(s, page, nextfree) && freelist) {
 749                 object_err(s, page, *freelist, "Freechain corrupt");
 750                 *freelist = NULL;
 751                 slab_fix(s, "Isolate corrupted freechain");
 752                 return true;
 753         }
 754
 755         return false;
 756 }
 757
 758 static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
 759 {
 760         unsigned int off;       /* Offset of last byte */
 761         u8 *addr = page_address(page);
 762
 763         print_tracking(s, p);
 764
 765         print_page_info(page);
 766
 767         pr_err("Object 0x%p @offset=%tu fp=0x%p\n\n",
 768                p, p - addr, get_freepointer(s, p));
 769
 770         if (s->flags & SLAB_RED_ZONE)
 771                 print_section(KERN_ERR, "Redzone  ", p - s->red_left_pad,
 772                               s->red_left_pad);
 773         else if (p > addr + 16)
 774                 print_section(KERN_ERR, "Bytes b4 ", p - 16, 16);
 775
 776         print_section(KERN_ERR,         "Object   ", p,
 777                       min_t(unsigned int, s->object_size, PAGE_SIZE));
 778         if (s->flags & SLAB_RED_ZONE)
 779                 print_section(KERN_ERR, "Redzone  ", p + s->object_size,
 780                         s->inuse - s->object_size);
 781
 782         off = get_info_end(s);
 783
 784         if (s->flags & SLAB_STORE_USER)
 785                 off += 2 * sizeof(struct track);
 786
 787         off += kasan_metadata_size(s);
 788
 789         if (off != size_from_object(s))
 790                 /* Beginning of the filler is the free pointer */
 791                 print_section(KERN_ERR, "Padding  ", p + off,
 792                               size_from_object(s) - off);
 793
 794         dump_stack();
 795 }
 796
 797 void object_err(struct kmem_cache *s, struct page *page,
 798                         u8 *object, char *reason)
 799 {
 800         if (slab_add_kunit_errors())
 801                 return;
 802
 803         slab_bug(s, "%s", reason);
 804         print_trailer(s, page, object);
 805         add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
 806 }
 807
 808 static __printf(3, 4) void slab_err(struct kmem_cache *s, struct page *page,
 809                         const char *fmt, ...)
 810 {
 811         va_list args;
 812         char buf[100];
 813
 814         if (slab_add_kunit_errors())
 815                 return;
 816
 817         va_start(args, fmt);
 818         vsnprintf(buf, sizeof(buf), fmt, args);
 819         va_end(args);
 820         slab_bug(s, "%s", buf);
 821         print_page_info(page);
 822         dump_stack();
 823         add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
 824 }
 825
 826 static void init_object(struct kmem_cache *s, void *object, u8 val)
 827 {
 828         u8 *p = kasan_reset_tag(object);
 829
 830         if (s->flags & SLAB_RED_ZONE)
 831                 memset(p - s->red_left_pad, val, s->red_left_pad);
 832
 833         if (s->flags & __OBJECT_POISON) {
 834                 memset(p, POISON_FREE, s->object_size - 1);
 835                 p[s->object_size - 1] = POISON_END;
 836         }
 837
 838         if (s->flags & SLAB_RED_ZONE)
 839                 memset(p + s->object_size, val, s->inuse - s->object_size);
 840 }
 841
 842 static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
 843                                                 void *from, void *to)
 844 {
 845         slab_fix(s, "Restoring %s 0x%p-0x%p=0x%x", message, from, to - 1, data);
 846         memset(from, data, to - from);
 847 }
 848
 849 static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
 850                         u8 *object, char *what,
 851                         u8 *start, unsigned int value, unsigned int bytes)
 852 {
 853         u8 *fault;
 854         u8 *end;
 855         u8 *addr = page_address(page);
 856
 857         metadata_access_enable();
 858         fault = memchr_inv(kasan_reset_tag(start), value, bytes);
 859         metadata_access_disable();
 860         if (!fault)
 861                 return 1;
 862
 863         end = start + bytes;
 864         while (end > fault && end[-1] == value)
 865                 end--;
 866
 867         if (slab_add_kunit_errors())
 868                 goto skip_bug_print;
 869
 870         slab_bug(s, "%s overwritten", what);
 871         pr_err("0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n",
 872                                         fault, end - 1, fault - addr,
 873                                         fault[0], value);
 874         print_trailer(s, page, object);
 875         add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
 876
 877 skip_bug_print:
 878         restore_bytes(s, what, value, fault, end);
 879         return 0;
 880 }
 881
 882 /*
 883  * Object layout:
 884  *
 885  * object address
 886  *      Bytes of the object to be managed.
 887  *      If the freepointer may overlay the object then the free
 888  *      pointer is at the middle of the object.
 889  *
 890  *      Poisoning uses 0x6b (POISON_FREE) and the last byte is
 891  *      0xa5 (POISON_END)
 892  *
 893  * object + s->object_size
 894  *      Padding to reach word boundary. This is also used for Redzoning.
 895  *      Padding is extended by another word if Redzoning is enabled and
 896  *      object_size == inuse.
 897  *
 898  *      We fill with 0xbb (RED_INACTIVE) for inactive objects and with
 899  *      0xcc (RED_ACTIVE) for objects in use.
 900  *
 901  * object + s->inuse
 902  *      Meta data starts here.
 903  *
 904  *      A. Free pointer (if we cannot overwrite object on free)
 905  *      B. Tracking data for SLAB_STORE_USER
 906  *      C. Padding to reach required alignment boundary or at minimum
 907  *              one word if debugging is on to be able to detect writes
 908  *              before the word boundary.
 909  *
 910  *      Padding is done using 0x5a (POISON_INUSE)
 911  *
 912  * object + s->size
 913  *      Nothing is used beyond s->size.
 914  *
 915  * If slabcaches are merged then the object_size and inuse boundaries are mostly
 916  * ignored. And therefore no slab options that rely on these boundaries
 917  * may be used with merged slabcaches.
 918  */
 919
 920 static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
 921 {
 922         unsigned long off = get_info_end(s);    /* The end of info */
 923
 924         if (s->flags & SLAB_STORE_USER)
 925                 /* We also have user information there */
 926                 off += 2 * sizeof(struct track);
 927
 928         off += kasan_metadata_size(s);
 929
 930         if (size_from_object(s) == off)
 931                 return 1;
 932
 933         return check_bytes_and_report(s, page, p, "Object padding",
 934                         p + off, POISON_INUSE, size_from_object(s) - off);
 935 }
 936
 937 /* Check the pad bytes at the end of a slab page */
 938 static int slab_pad_check(struct kmem_cache *s, struct page *page)
 939 {
 940         u8 *start;
 941         u8 *fault;
 942         u8 *end;
 943         u8 *pad;
 944         int length;
 945         int remainder;
 946
 947         if (!(s->flags & SLAB_POISON))
 948                 return 1;
 949
 950         start = page_address(page);
 951         length = page_size(page);
 952         end = start + length;
 953         remainder = length % s->size;
 954         if (!remainder)
 955                 return 1;
 956
 957         pad = end - remainder;
 958         metadata_access_enable();
 959         fault = memchr_inv(kasan_reset_tag(pad), POISON_INUSE, remainder);
 960         metadata_access_disable();
 961         if (!fault)
 962                 return 1;
 963         while (end > fault && end[-1] == POISON_INUSE)
 964                 end--;
 965
 966         slab_err(s, page, "Padding overwritten. 0x%p-0x%p @offset=%tu",
 967                         fault, end - 1, fault - start);
 968         print_section(KERN_ERR, "Padding ", pad, remainder);
 969
 970         restore_bytes(s, "slab padding", POISON_INUSE, fault, end);
 971         return 0;
 972 }
 973
 974 static int check_object(struct kmem_cache *s, struct page *page,
 975                                         void *object, u8 val)
 976 {
 977         u8 *p = object;
 978         u8 *endobject = object + s->object_size;
 979
 980         if (s->flags & SLAB_RED_ZONE) {
 981                 if (!check_bytes_and_report(s, page, object, "Left Redzone",
 982                         object - s->red_left_pad, val, s->red_left_pad))
 983                         return 0;
 984
 985                 if (!check_bytes_and_report(s, page, object, "Right Redzone",
 986                         endobject, val, s->inuse - s->object_size))
 987                         return 0;
 988         } else {
 989                 if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) {
 990                         check_bytes_and_report(s, page, p, "Alignment padding",
 991                                 endobject, POISON_INUSE,
 992                                 s->inuse - s->object_size);
 993                 }
 994         }
 995
 996         if (s->flags & SLAB_POISON) {
 997                 if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) &&
 998                         (!check_bytes_and_report(s, page, p, "Poison", p,
 999                                         POISON_FREE, s->object_size - 1) ||
1000                          !check_bytes_and_report(s, page, p, "End Poison",
1001                                 p + s->object_size - 1, POISON_END, 1)))
1002                         return 0;
1003                 /*
1004                  * check_pad_bytes cleans up on its own.
1005                  */
1006                 check_pad_bytes(s, page, p);
1007         }
1008
1009         if (!freeptr_outside_object(s) && val == SLUB_RED_ACTIVE)
1010                 /*
1011                  * Object and freepointer overlap. Cannot check
1012                  * freepointer while object is allocated.
1013                  */
1014                 return 1;
1015
1016         /* Check free pointer validity */
1017         if (!check_valid_pointer(s, page, get_freepointer(s, p))) {
1018                 object_err(s, page, p, "Freepointer corrupt");
1019                 /*
1020                  * No choice but to zap it and thus lose the remainder
1021                  * of the free objects in this slab. May cause
1022                  * another error because the object count is now wrong.
1023                  */
1024                 set_freepointer(s, p, NULL);
1025                 return 0;
1026         }
1027         return 1;
1028 }
1029
1030 static int check_slab(struct kmem_cache *s, struct page *page)
1031 {
1032         int maxobj;
1033
1034         if (!PageSlab(page)) {
1035                 slab_err(s, page, "Not a valid slab page");
1036                 return 0;
1037         }
1038
1039         maxobj = order_objects(compound_order(page), s->size);
1040         if (page->objects > maxobj) {
1041                 slab_err(s, page, "objects %u > max %u",
1042                         page->objects, maxobj);
1043                 return 0;
1044         }
1045         if (page->inuse > page->objects) {
1046                 slab_err(s, page, "inuse %u > max %u",
1047                         page->inuse, page->objects);
1048                 return 0;
1049         }
1050         /* Slab_pad_check fixes things up after itself */
1051         slab_pad_check(s, page);
1052         return 1;
1053 }
1054
1055 /*
1056  * Determine if a certain object on a page is on the freelist. Must hold the
1057  * slab lock to guarantee that the chains are in a consistent state.
1058  */
1059 static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
1060 {
1061         int nr = 0;
1062         void *fp;
1063         void *object = NULL;
1064         int max_objects;
1065
1066         fp = page->freelist;
1067         while (fp && nr <= page->objects) {
1068                 if (fp == search)
1069                         return 1;
1070                 if (!check_valid_pointer(s, page, fp)) {
1071                         if (object) {
1072                                 object_err(s, page, object,
1073                                         "Freechain corrupt");
1074                                 set_freepointer(s, object, NULL);
1075                         } else {
1076                                 slab_err(s, page, "Freepointer corrupt");
1077                                 page->freelist = NULL;
1078                                 page->inuse = page->objects;
1079                                 slab_fix(s, "Freelist cleared");
1080                                 return 0;
1081                         }
1082                         break;
1083                 }
1084                 object = fp;
1085                 fp = get_freepointer(s, object);
1086                 nr++;
1087         }
1088
1089         max_objects = order_objects(compound_order(page), s->size);
1090         if (max_objects > MAX_OBJS_PER_PAGE)
1091                 max_objects = MAX_OBJS_PER_PAGE;
1092
1093         if (page->objects != max_objects) {
1094                 slab_err(s, page, "Wrong number of objects. Found %d but should be %d",
1095                          page->objects, max_objects);
1096                 page->objects = max_objects;
1097                 slab_fix(s, "Number of objects adjusted");
1098         }
1099         if (page->inuse != page->objects - nr) {
1100                 slab_err(s, page, "Wrong object count. Counter is %d but counted were %d",
1101                          page->inuse, page->objects - nr);
1102                 page->inuse = page->objects - nr;
1103                 slab_fix(s, "Object count adjusted");
1104         }
1105         return search == NULL;
1106 }
1107
1108 static void trace(struct kmem_cache *s, struct page *page, void *object,
1109                                                                 int alloc)
1110 {
1111         if (s->flags & SLAB_TRACE) {
1112                 pr_info("TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
1113                         s->name,
1114                         alloc ? "alloc" : "free",
1115                         object, page->inuse,
1116                         page->freelist);
1117
1118                 if (!alloc)
1119                         print_section(KERN_INFO, "Object ", (void *)object,
1120                                         s->object_size);
1121
1122                 dump_stack();
1123         }
1124 }
1125
1126 /*
1127  * Tracking of fully allocated slabs for debugging purposes.
1128  */
1129 static void add_full(struct kmem_cache *s,
1130         struct kmem_cache_node *n, struct page *page)
1131 {
1132         if (!(s->flags & SLAB_STORE_USER))
1133                 return;
1134
1135         lockdep_assert_held(&n->list_lock);
1136         list_add(&page->slab_list, &n->full);
1137 }
1138
1139 static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page)
1140 {
1141         if (!(s->flags & SLAB_STORE_USER))
1142                 return;
1143
1144         lockdep_assert_held(&n->list_lock);
1145         list_del(&page->slab_list);
1146 }
1147
1148 /* Tracking of the number of slabs for debugging purposes */
1149 static inline unsigned long slabs_node(struct kmem_cache *s, int node)
1150 {
1151         struct kmem_cache_node *n = get_node(s, node);
1152
1153         return atomic_long_read(&n->nr_slabs);
1154 }
1155
1156 static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
1157 {
1158         return atomic_long_read(&n->nr_slabs);
1159 }
1160
1161 static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
1162 {
1163         struct kmem_cache_node *n = get_node(s, node);
1164
1165         /*
1166          * May be called early in order to allocate a slab for the
1167          * kmem_cache_node structure. Solve the chicken-egg
1168          * dilemma by deferring the increment of the count during
1169          * bootstrap (see early_kmem_cache_node_alloc).
1170          */
1171         if (likely(n)) {
1172                 atomic_long_inc(&n->nr_slabs);
1173                 atomic_long_add(objects, &n->total_objects);
1174         }
1175 }
1176 static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects)
1177 {
1178         struct kmem_cache_node *n = get_node(s, node);
1179
1180         atomic_long_dec(&n->nr_slabs);
1181         atomic_long_sub(objects, &n->total_objects);
1182 }
1183
1184 /* Object debug checks for alloc/free paths */
1185 static void setup_object_debug(struct kmem_cache *s, struct page *page,
1186                                                                 void *object)
1187 {
1188         if (!kmem_cache_debug_flags(s, SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))
1189                 return;
1190
1191         init_object(s, object, SLUB_RED_INACTIVE);
1192         init_tracking(s, object);
1193 }
1194
1195 static
1196 void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr)
1197 {
1198         if (!kmem_cache_debug_flags(s, SLAB_POISON))
1199                 return;
1200
1201         metadata_access_enable();
1202         memset(kasan_reset_tag(addr), POISON_INUSE, page_size(page));
1203         metadata_access_disable();
1204 }
1205
1206 static inline int alloc_consistency_checks(struct kmem_cache *s,
1207                                         struct page *page, void *object)
1208 {
1209         if (!check_slab(s, page))
1210                 return 0;
1211
1212         if (!check_valid_pointer(s, page, object)) {
1213                 object_err(s, page, object, "Freelist Pointer check fails");
1214                 return 0;
1215         }
1216
1217         if (!check_object(s, page, object, SLUB_RED_INACTIVE))
1218                 return 0;
1219
1220         return 1;
1221 }
1222
1223 static noinline int alloc_debug_processing(struct kmem_cache *s,
1224                                         struct page *page,
1225                                         void *object, unsigned long addr)
1226 {
1227         if (s->flags & SLAB_CONSISTENCY_CHECKS) {
1228                 if (!alloc_consistency_checks(s, page, object))
1229                         goto bad;
1230         }
1231
1232         /* Success perform special debug activities for allocs */
1233         if (s->flags & SLAB_STORE_USER)
1234                 set_track(s, object, TRACK_ALLOC, addr);
1235         trace(s, page, object, 1);
1236         init_object(s, object, SLUB_RED_ACTIVE);
1237         return 1;
1238
1239 bad:
1240         if (PageSlab(page)) {
1241                 /*
1242                  * If this is a slab page then lets do the best we can
1243                  * to avoid issues in the future. Marking all objects
1244                  * as used avoids touching the remaining objects.
1245                  */
1246                 slab_fix(s, "Marking all objects used");
1247                 page->inuse = page->objects;
1248                 page->freelist = NULL;
1249         }
1250         return 0;
1251 }
1252
1253 static inline int free_consistency_checks(struct kmem_cache *s,
1254                 struct page *page, void *object, unsigned long addr)
1255 {
1256         if (!check_valid_pointer(s, page, object)) {
1257                 slab_err(s, page, "Invalid object pointer 0x%p", object);
1258                 return 0;
1259         }
1260
1261         if (on_freelist(s, page, object)) {
1262                 object_err(s, page, object, "Object already free");
1263                 return 0;
1264         }
1265
1266         if (!check_object(s, page, object, SLUB_RED_ACTIVE))
1267                 return 0;
1268
1269         if (unlikely(s != page->slab_cache)) {
1270                 if (!PageSlab(page)) {
1271                         slab_err(s, page, "Attempt to free object(0x%p) outside of slab",
1272                                  object);
1273                 } else if (!page->slab_cache) {
1274                         pr_err("SLUB <none>: no slab for object 0x%p.\n",
1275                                object);
1276                         dump_stack();
1277                 } else
1278                         object_err(s, page, object,
1279                                         "page slab pointer corrupt.");
1280                 return 0;
1281         }
1282         return 1;
1283 }
1284
1285 /* Supports checking bulk free of a constructed freelist */
1286 static noinline int free_debug_processing(
1287         struct kmem_cache *s, struct page *page,
1288         void *head, void *tail, int bulk_cnt,
1289         unsigned long addr)
1290 {
1291         struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1292         void *object = head;
1293         int cnt = 0;
1294         unsigned long flags, flags2;
1295         int ret = 0;
1296
1297         spin_lock_irqsave(&n->list_lock, flags);
1298         slab_lock(page, &flags2);
1299
1300         if (s->flags & SLAB_CONSISTENCY_CHECKS) {
1301                 if (!check_slab(s, page))
1302                         goto out;
1303         }
1304
1305 next_object:
1306         cnt++;
1307
1308         if (s->flags & SLAB_CONSISTENCY_CHECKS) {
1309                 if (!free_consistency_checks(s, page, object, addr))
1310                         goto out;
1311         }
1312
1313         if (s->flags & SLAB_STORE_USER)
1314                 set_track(s, object, TRACK_FREE, addr);
1315         trace(s, page, object, 0);
1316         /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */
1317         init_object(s, object, SLUB_RED_INACTIVE);
1318
1319         /* Reached end of constructed freelist yet? */
1320         if (object != tail) {
1321                 object = get_freepointer(s, object);
1322                 goto next_object;
1323         }
1324         ret = 1;
1325
1326 out:
1327         if (cnt != bulk_cnt)
1328                 slab_err(s, page, "Bulk freelist count(%d) invalid(%d)\n",
1329                          bulk_cnt, cnt);
1330
1331         slab_unlock(page, &flags2);
1332         spin_unlock_irqrestore(&n->list_lock, flags);
1333         if (!ret)
1334                 slab_fix(s, "Object at 0x%p not freed", object);
1335         return ret;
1336 }
1337
1338 /*
1339  * Parse a block of slub_debug options. Blocks are delimited by ';'
1340  *
1341  * @str:    start of block
1342  * @flags:  returns parsed flags, or DEBUG_DEFAULT_FLAGS if none specified
1343  * @slabs:  return start of list of slabs, or NULL when there's no list
1344  * @init:   assume this is initial parsing and not per-kmem-create parsing
1345  *
1346  * returns the start of next block if there's any, or NULL
1347  */
1348 static char *
1349 parse_slub_debug_flags(char *str, slab_flags_t *flags, char **slabs, bool init)
1350 {
1351         bool higher_order_disable = false;
1352
1353         /* Skip any completely empty blocks */
1354         while (*str && *str == ';')
1355                 str++;
1356
1357         if (*str == ',') {
1358                 /*
1359                  * No options but restriction on slabs. This means full
1360                  * debugging for slabs matching a pattern.
1361                  */
1362                 *flags = DEBUG_DEFAULT_FLAGS;
1363                 goto check_slabs;
1364         }
1365         *flags = 0;
1366
1367         /* Determine which debug features should be switched on */
1368         for (; *str && *str != ',' && *str != ';'; str++) {
1369                 switch (tolower(*str)) {
1370                 case '-':
1371                         *flags = 0;
1372                         break;
1373                 case 'f':
1374                         *flags |= SLAB_CONSISTENCY_CHECKS;
1375                         break;
1376                 case 'z':
1377                         *flags |= SLAB_RED_ZONE;
1378                         break;
1379                 case 'p':
1380                         *flags |= SLAB_POISON;
1381                         break;
1382                 case 'u':
1383                         *flags |= SLAB_STORE_USER;
1384                         break;
1385                 case 't':
1386                         *flags |= SLAB_TRACE;
1387                         break;
1388                 case 'a':
1389                         *flags |= SLAB_FAILSLAB;
1390                         break;
1391                 case 'o':
1392                         /*
1393                          * Avoid enabling debugging on caches if its minimum
1394                          * order would increase as a result.
1395                          */
1396                         higher_order_disable = true;
1397                         break;
1398                 default:
1399                         if (init)
1400                                 pr_err("slub_debug option '%c' unknown. skipped\n", *str);
1401                 }
1402         }
1403 check_slabs:
1404         if (*str == ',')
1405                 *slabs = ++str;
1406         else
1407                 *slabs = NULL;
1408
1409         /* Skip over the slab list */
1410         while (*str && *str != ';')
1411                 str++;
1412
1413         /* Skip any completely empty blocks */
1414         while (*str && *str == ';')
1415                 str++;
1416
1417         if (init && higher_order_disable)
1418                 disable_higher_order_debug = 1;
1419
1420         if (*str)
1421                 return str;
1422         else
1423                 return NULL;
1424 }
1425
1426 static int __init setup_slub_debug(char *str)
1427 {
1428         slab_flags_t flags;
1429         slab_flags_t global_flags;
1430         char *saved_str;
1431         char *slab_list;
1432         bool global_slub_debug_changed = false;
1433         bool slab_list_specified = false;
1434
1435         global_flags = DEBUG_DEFAULT_FLAGS;
1436         if (*str++ != '=' || !*str)
1437                 /*
1438                  * No options specified. Switch on full debugging.
1439                  */
1440                 goto out;
1441
1442         saved_str = str;
1443         while (str) {
1444                 str = parse_slub_debug_flags(str, &flags, &slab_list, true);
1445
1446                 if (!slab_list) {
1447                         global_flags = flags;
1448                         global_slub_debug_changed = true;
1449                 } else {
1450                         slab_list_specified = true;
1451                 }
1452         }
1453
1454         /*
1455          * For backwards compatibility, a single list of flags with list of
1456          * slabs means debugging is only changed for those slabs, so the global
1457          * slub_debug should be unchanged (0 or DEBUG_DEFAULT_FLAGS, depending
1458          * on CONFIG_SLUB_DEBUG_ON). We can extended that to multiple lists as
1459          * long as there is no option specifying flags without a slab list.
1460          */
1461         if (slab_list_specified) {
1462                 if (!global_slub_debug_changed)
1463                         global_flags = slub_debug;
1464                 slub_debug_string = saved_str;
1465         }
1466 out:
1467         slub_debug = global_flags;
1468         if (slub_debug != 0 || slub_debug_string)
1469                 static_branch_enable(&slub_debug_enabled);
1470         else
1471                 static_branch_disable(&slub_debug_enabled);
1472         if ((static_branch_unlikely(&init_on_alloc) ||
1473              static_branch_unlikely(&init_on_free)) &&
1474             (slub_debug & SLAB_POISON))
1475                 pr_info("mem auto-init: SLAB_POISON will take precedence over init_on_alloc/init_on_free\n");
1476         return 1;
1477 }
1478
1479 __setup("slub_debug", setup_slub_debug);
1480
1481 /*
1482  * kmem_cache_flags - apply debugging options to the cache
1483  * @object_size:        the size of an object without meta data
1484  * @flags:              flags to set
1485  * @name:               name of the cache
1486  *
1487  * Debug option(s) are applied to @flags. In addition to the debug
1488  * option(s), if a slab name (or multiple) is specified i.e.
1489  * slub_debug=<Debug-Options>,<slab name1>,<slab name2> ...
1490  * then only the select slabs will receive the debug option(s).
1491  */
1492 slab_flags_t kmem_cache_flags(unsigned int object_size,
1493         slab_flags_t flags, const char *name)
1494 {
1495         char *iter;
1496         size_t len;
1497         char *next_block;
1498         slab_flags_t block_flags;
1499         slab_flags_t slub_debug_local = slub_debug;
1500
1501         /*
1502          * If the slab cache is for debugging (e.g. kmemleak) then
1503          * don't store user (stack trace) information by default,
1504          * but let the user enable it via the command line below.
1505          */
1506         if (flags & SLAB_NOLEAKTRACE)
1507                 slub_debug_local &= ~SLAB_STORE_USER;
1508
1509         len = strlen(name);
1510         next_block = slub_debug_string;
1511         /* Go through all blocks of debug options, see if any matches our slab's name */
1512         while (next_block) {
1513                 next_block = parse_slub_debug_flags(next_block, &block_flags, &iter, false);
1514                 if (!iter)
1515                         continue;
1516                 /* Found a block that has a slab list, search it */
1517                 while (*iter) {
1518                         char *end, *glob;
1519                         size_t cmplen;
1520
1521                         end = strchrnul(iter, ',');
1522                         if (next_block && next_block < end)
1523                                 end = next_block - 1;
1524
1525                         glob = strnchr(iter, end - iter, '*');
1526                         if (glob)
1527                                 cmplen = glob - iter;
1528                         else
1529                                 cmplen = max_t(size_t, len, (end - iter));
1530
1531                         if (!strncmp(name, iter, cmplen)) {
1532                                 flags |= block_flags;
1533                                 return flags;
1534                         }
1535
1536                         if (!*end || *end == ';')
1537                                 break;
1538                         iter = end + 1;
1539                 }
1540         }
1541
1542         return flags | slub_debug_local;
1543 }
1544 #else /* !CONFIG_SLUB_DEBUG */
1545 static inline void setup_object_debug(struct kmem_cache *s,
1546                         struct page *page, void *object) {}
1547 static inline
1548 void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr) {}
1549
1550 static inline int alloc_debug_processing(struct kmem_cache *s,
1551         struct page *page, void *object, unsigned long addr) { return 0; }
1552
1553 static inline int free_debug_processing(
1554         struct kmem_cache *s, struct page *page,
1555         void *head, void *tail, int bulk_cnt,
1556         unsigned long addr) { return 0; }
1557
1558 static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
1559                         { return 1; }
1560 static inline int check_object(struct kmem_cache *s, struct page *page,
1561                         void *object, u8 val) { return 1; }
1562 static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
1563                                         struct page *page) {}
1564 static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
1565                                         struct page *page) {}
1566 slab_flags_t kmem_cache_flags(unsigned int object_size,
1567         slab_flags_t flags, const char *name)
1568 {
1569         return flags;
1570 }
1571 #define slub_debug 0
1572
1573 #define disable_higher_order_debug 0
1574
1575 static inline unsigned long slabs_node(struct kmem_cache *s, int node)
1576                                                         { return 0; }
1577 static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
1578                                                         { return 0; }
1579 static inline void inc_slabs_node(struct kmem_cache *s, int node,
1580                                                         int objects) {}
1581 static inline void dec_slabs_node(struct kmem_cache *s, int node,
1582                                                         int objects) {}
1583
1584 static bool freelist_corrupted(struct kmem_cache *s, struct page *page,
1585                                void **freelist, void *nextfree)
1586 {
1587         return false;
1588 }
1589 #endif /* CONFIG_SLUB_DEBUG */
1590
1591 /*
1592  * Hooks for other subsystems that check memory allocations. In a typical
1593  * production configuration these hooks all should produce no code at all.
1594  */
1595 static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
1596 {
1597         ptr = kasan_kmalloc_large(ptr, size, flags);
1598         /* As ptr might get tagged, call kmemleak hook after KASAN. */
1599         kmemleak_alloc(ptr, size, 1, flags);
1600         return ptr;
1601 }
1602
1603 static __always_inline void kfree_hook(void *x)
1604 {
1605         kmemleak_free(x);
1606         kasan_kfree_large(x);
1607 }
1608
1609 static __always_inline bool slab_free_hook(struct kmem_cache *s,
1610                                                 void *x, bool init)
1611 {
1612         kmemleak_free_recursive(x, s->flags);
1613
1614         debug_check_no_locks_freed(x, s->object_size);
1615
1616         if (!(s->flags & SLAB_DEBUG_OBJECTS))
1617                 debug_check_no_obj_freed(x, s->object_size);
1618
1619         /* Use KCSAN to help debug racy use-after-free. */
1620         if (!(s->flags & SLAB_TYPESAFE_BY_RCU))
1621                 __kcsan_check_access(x, s->object_size,
1622                                      KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT);
1623
1624         /*
1625          * As memory initialization might be integrated into KASAN,
1626          * kasan_slab_free and initialization memset's must be
1627          * kept together to avoid discrepancies in behavior.
1628          *
1629          * The initialization memset's clear the object and the metadata,
1630          * but don't touch the SLAB redzone.
1631          */
1632         if (init) {
1633                 int rsize;
1634
1635                 if (!kasan_has_integrated_init())
1636                         memset(kasan_reset_tag(x), 0, s->object_size);
1637                 rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad : 0;
1638                 memset((char *)kasan_reset_tag(x) + s->inuse, 0,
1639                        s->size - s->inuse - rsize);
1640         }
1641         /* KASAN might put x into memory quarantine, delaying its reuse. */
1642         return kasan_slab_free(s, x, init);
1643 }
1644
1645 static inline bool slab_free_freelist_hook(struct kmem_cache *s,
1646                                            void **head, void **tail)
1647 {
1648
1649         void *object;
1650         void *next = *head;
1651         void *old_tail = *tail ? *tail : *head;
1652
1653         if (is_kfence_address(next)) {
1654                 slab_free_hook(s, next, false);
1655                 return true;
1656         }
1657
1658         /* Head and tail of the reconstructed freelist */
1659         *head = NULL;
1660         *tail = NULL;
1661
1662         do {
1663                 object = next;
1664                 next = get_freepointer(s, object);
1665
1666                 /* If object's reuse doesn't have to be delayed */
1667                 if (!slab_free_hook(s, object, slab_want_init_on_free(s))) {
1668                         /* Move object to the new freelist */
1669                         set_freepointer(s, object, *head);
1670                         *head = object;
1671                         if (!*tail)
1672                                 *tail = object;
1673                 }
1674         } while (object != old_tail);
1675
1676         if (*head == *tail)
1677                 *tail = NULL;
1678
1679         return *head != NULL;
1680 }
1681
1682 static void *setup_object(struct kmem_cache *s, struct page *page,
1683                                 void *object)
1684 {
1685         setup_object_debug(s, page, object);
1686         object = kasan_init_slab_obj(s, object);
1687         if (unlikely(s->ctor)) {
1688                 kasan_unpoison_object_data(s, object);
1689                 s->ctor(object);
1690                 kasan_poison_object_data(s, object);
1691         }
1692         return object;
1693 }
1694
1695 /*
1696  * Slab allocation and freeing
1697  */
1698 static inline struct page *alloc_slab_page(struct kmem_cache *s,
1699                 gfp_t flags, int node, struct kmem_cache_order_objects oo)
1700 {
1701         struct page *page;
1702         unsigned int order = oo_order(oo);
1703
1704         if (node == NUMA_NO_NODE)
1705                 page = alloc_pages(flags, order);
1706         else
1707                 page = __alloc_pages_node(node, flags, order);
1708
1709         return page;
1710 }
1711
1712 #ifdef CONFIG_SLAB_FREELIST_RANDOM
1713 /* Pre-initialize the random sequence cache */
1714 static int init_cache_random_seq(struct kmem_cache *s)
1715 {
1716         unsigned int count = oo_objects(s->oo);
1717         int err;
1718
1719         /* Bailout if already initialised */
1720         if (s->random_seq)
1721                 return 0;
1722
1723         err = cache_random_seq_create(s, count, GFP_KERNEL);
1724         if (err) {
1725                 pr_err("SLUB: Unable to initialize free list for %s\n",
1726                         s->name);
1727                 return err;
1728         }
1729
1730         /* Transform to an offset on the set of pages */
1731         if (s->random_seq) {
1732                 unsigned int i;
1733
1734                 for (i = 0; i < count; i++)
1735                         s->random_seq[i] *= s->size;
1736         }
1737         return 0;
1738 }
1739
1740 /* Initialize each random sequence freelist per cache */
1741 static void __init init_freelist_randomization(void)
1742 {
1743         struct kmem_cache *s;
1744
1745         mutex_lock(&slab_mutex);
1746
1747         list_for_each_entry(s, &slab_caches, list)
1748                 init_cache_random_seq(s);
1749
1750         mutex_unlock(&slab_mutex);
1751 }
1752
1753 /* Get the next entry on the pre-computed freelist randomized */
1754 static void *next_freelist_entry(struct kmem_cache *s, struct page *page,
1755                                 unsigned long *pos, void *start,
1756                                 unsigned long page_limit,
1757                                 unsigned long freelist_count)
1758 {
1759         unsigned int idx;
1760
1761         /*
1762          * If the target page allocation failed, the number of objects on the
1763          * page might be smaller than the usual size defined by the cache.
1764          */
1765         do {
1766                 idx = s->random_seq[*pos];
1767                 *pos += 1;
1768                 if (*pos >= freelist_count)
1769                         *pos = 0;
1770         } while (unlikely(idx >= page_limit));
1771
1772         return (char *)start + idx;
1773 }
1774
1775 /* Shuffle the single linked freelist based on a random pre-computed sequence */
1776 static bool shuffle_freelist(struct kmem_cache *s, struct page *page)
1777 {
1778         void *start;
1779         void *cur;
1780         void *next;
1781         unsigned long idx, pos, page_limit, freelist_count;
1782
1783         if (page->objects < 2 || !s->random_seq)
1784                 return false;
1785
1786         freelist_count = oo_objects(s->oo);
1787         pos = get_random_int() % freelist_count;
1788
1789         page_limit = page->objects * s->size;
1790         start = fixup_red_left(s, page_address(page));
1791
1792         /* First entry is used as the base of the freelist */
1793         cur = next_freelist_entry(s, page, &pos, start, page_limit,
1794                                 freelist_count);
1795         cur = setup_object(s, page, cur);
1796         page->freelist = cur;
1797
1798         for (idx = 1; idx < page->objects; idx++) {
1799                 next = next_freelist_entry(s, page, &pos, start, page_limit,
1800                         freelist_count);
1801                 next = setup_object(s, page, next);
1802                 set_freepointer(s, cur, next);
1803                 cur = next;
1804         }
1805         set_freepointer(s, cur, NULL);
1806
1807         return true;
1808 }
1809 #else
1810 static inline int init_cache_random_seq(struct kmem_cache *s)
1811 {
1812         return 0;
1813 }
1814 static inline void init_freelist_randomization(void) { }
1815 static inline bool shuffle_freelist(struct kmem_cache *s, struct page *page)
1816 {
1817         return false;
1818 }
1819 #endif /* CONFIG_SLAB_FREELIST_RANDOM */
1820
1821 static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1822 {
1823         struct page *page;
1824         struct kmem_cache_order_objects oo = s->oo;
1825         gfp_t alloc_gfp;
1826         void *start, *p, *next;
1827         int idx;
1828         bool shuffle;
1829
1830         flags &= gfp_allowed_mask;
1831
1832         flags |= s->allocflags;
1833
1834         /*
1835          * Let the initial higher-order allocation fail under memory pressure
1836          * so we fall-back to the minimum order allocation.
1837          */
1838         alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
1839         if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min))
1840                 alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~(__GFP_RECLAIM|__GFP_NOFAIL);
1841
1842         page = alloc_slab_page(s, alloc_gfp, node, oo);
1843         if (unlikely(!page)) {
1844                 oo = s->min;
1845                 alloc_gfp = flags;
1846                 /*
1847                  * Allocation may have failed due to fragmentation.
1848                  * Try a lower order alloc if possible
1849                  */
1850                 page = alloc_slab_page(s, alloc_gfp, node, oo);
1851                 if (unlikely(!page))
1852                         goto out;
1853                 stat(s, ORDER_FALLBACK);
1854         }
1855
1856         page->objects = oo_objects(oo);
1857
1858         account_slab_page(page, oo_order(oo), s, flags);
1859
1860         page->slab_cache = s;
1861         __SetPageSlab(page);
1862         if (page_is_pfmemalloc(page))
1863                 SetPageSlabPfmemalloc(page);
1864
1865         kasan_poison_slab(page);
1866
1867         start = page_address(page);
1868
1869         setup_page_debug(s, page, start);
1870
1871         shuffle = shuffle_freelist(s, page);
1872
1873         if (!shuffle) {
1874                 start = fixup_red_left(s, start);
1875                 start = setup_object(s, page, start);
1876                 page->freelist = start;
1877                 for (idx = 0, p = start; idx < page->objects - 1; idx++) {
1878                         next = p + s->size;
1879                         next = setup_object(s, page, next);
1880                         set_freepointer(s, p, next);
1881                         p = next;
1882                 }
1883                 set_freepointer(s, p, NULL);
1884         }
1885
1886         page->inuse = page->objects;
1887         page->frozen = 1;
1888
1889 out:
1890         if (!page)
1891                 return NULL;
1892
1893         inc_slabs_node(s, page_to_nid(page), page->objects);
1894
1895         return page;
1896 }
1897
1898 static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1899 {
1900         if (unlikely(flags & GFP_SLAB_BUG_MASK))
1901                 flags = kmalloc_fix_flags(flags);
1902
1903         WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO));
1904
1905         return allocate_slab(s,
1906                 flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
1907 }
1908
1909 static void __free_slab(struct kmem_cache *s, struct page *page)
1910 {
1911         int order = compound_order(page);
1912         int pages = 1 << order;
1913
1914         if (kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) {
1915                 void *p;
1916
1917                 slab_pad_check(s, page);
1918                 for_each_object(p, s, page_address(page),
1919                                                 page->objects)
1920                         check_object(s, page, p, SLUB_RED_INACTIVE);
1921         }
1922
1923         __ClearPageSlabPfmemalloc(page);
1924         __ClearPageSlab(page);
1925         /* In union with page->mapping where page allocator expects NULL */
1926         page->slab_cache = NULL;
1927         if (current->reclaim_state)
1928                 current->reclaim_state->reclaimed_slab += pages;
1929         unaccount_slab_page(page, order, s);
1930         __free_pages(page, order);
1931 }
1932
1933 static void rcu_free_slab(struct rcu_head *h)
1934 {
1935         struct page *page = container_of(h, struct page, rcu_head);
1936
1937         __free_slab(page->slab_cache, page);
1938 }
1939
1940 static void free_slab(struct kmem_cache *s, struct page *page)
1941 {
1942         if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) {
1943                 call_rcu(&page->rcu_head, rcu_free_slab);
1944         } else
1945                 __free_slab(s, page);
1946 }
1947
1948 static void discard_slab(struct kmem_cache *s, struct page *page)
1949 {
1950         dec_slabs_node(s, page_to_nid(page), page->objects);
1951         free_slab(s, page);
1952 }
1953
1954 /*
1955  * Management of partially allocated slabs.
1956  */
1957 static inline void
1958 __add_partial(struct kmem_cache_node *n, struct page *page, int tail)
1959 {
1960         n->nr_partial++;
1961         if (tail == DEACTIVATE_TO_TAIL)
1962                 list_add_tail(&page->slab_list, &n->partial);
1963         else
1964                 list_add(&page->slab_list, &n->partial);
1965 }
1966
1967 static inline void add_partial(struct kmem_cache_node *n,
1968                                 struct page *page, int tail)
1969 {
1970         lockdep_assert_held(&n->list_lock);
1971         __add_partial(n, page, tail);
1972 }
1973
1974 static inline void remove_partial(struct kmem_cache_node *n,
1975                                         struct page *page)
1976 {
1977         lockdep_assert_held(&n->list_lock);
1978         list_del(&page->slab_list);
1979         n->nr_partial--;
1980 }
1981
1982 /*
1983  * Remove slab from the partial list, freeze it and
1984  * return the pointer to the freelist.
1985  *
1986  * Returns a list of objects or NULL if it fails.
1987  */
1988 static inline void *acquire_slab(struct kmem_cache *s,
1989                 struct kmem_cache_node *n, struct page *page,
1990                 int mode, int *objects)
1991 {
1992         void *freelist;
1993         unsigned long counters;
1994         struct page new;
1995
1996         lockdep_assert_held(&n->list_lock);
1997
1998         /*
1999          * Zap the freelist and set the frozen bit.
2000          * The old freelist is the list of objects for the
2001          * per cpu allocation list.
2002          */
2003         freelist = page->freelist;
2004         counters = page->counters;
2005         new.counters = counters;
2006         *objects = new.objects - new.inuse;
2007         if (mode) {
2008                 new.inuse = page->objects;
2009                 new.freelist = NULL;
2010         } else {
2011                 new.freelist = freelist;
2012         }
2013
2014         VM_BUG_ON(new.frozen);
2015         new.frozen = 1;
2016
2017         if (!__cmpxchg_double_slab(s, page,
2018                         freelist, counters,
2019                         new.freelist, new.counters,
2020                         "acquire_slab"))
2021                 return NULL;
2022
2023         remove_partial(n, page);
2024         WARN_ON(!freelist);
2025         return freelist;
2026 }
2027
2028 static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain);
2029 static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags);
2030
2031 /*
2032  * Try to allocate a partial slab from a specific node.
2033  */
2034 static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
2035                               struct page **ret_page, gfp_t gfpflags)
2036 {
2037         struct page *page, *page2;
2038         void *object = NULL;
2039         unsigned int available = 0;
2040         unsigned long flags;
2041         int objects;
2042
2043         /*
2044          * Racy check. If we mistakenly see no partial slabs then we
2045          * just allocate an empty slab. If we mistakenly try to get a
2046          * partial slab and there is none available then get_partial()
2047          * will return NULL.
2048          */
2049         if (!n || !n->nr_partial)
2050                 return NULL;
2051
2052         spin_lock_irqsave(&n->list_lock, flags);
2053         list_for_each_entry_safe(page, page2, &n->partial, slab_list) {
2054                 void *t;
2055
2056                 if (!pfmemalloc_match(page, gfpflags))
2057                         continue;
2058
2059                 t = acquire_slab(s, n, page, object == NULL, &objects);
2060                 if (!t)
2061                         break;
2062
2063                 available += objects;
2064                 if (!object) {
2065                         *ret_page = page;
2066                         stat(s, ALLOC_FROM_PARTIAL);
2067                         object = t;
2068                 } else {
2069                         put_cpu_partial(s, page, 0);
2070                         stat(s, CPU_PARTIAL_NODE);
2071                 }
2072                 if (!kmem_cache_has_cpu_partial(s)
2073                         || available > slub_cpu_partial(s) / 2)
2074                         break;
2075
2076         }
2077         spin_unlock_irqrestore(&n->list_lock, flags);
2078         return object;
2079 }
2080
2081 /*
2082  * Get a page from somewhere. Search in increasing NUMA distances.
2083  */
2084 static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
2085                              struct page **ret_page)
2086 {
2087 #ifdef CONFIG_NUMA
2088         struct zonelist *zonelist;
2089         struct zoneref *z;
2090         struct zone *zone;
2091         enum zone_type highest_zoneidx = gfp_zone(flags);
2092         void *object;
2093         unsigned int cpuset_mems_cookie;
2094
2095         /*
2096          * The defrag ratio allows a configuration of the tradeoffs between
2097          * inter node defragmentation and node local allocations. A lower
2098          * defrag_ratio increases the tendency to do local allocations
2099          * instead of attempting to obtain partial slabs from other nodes.
2100          *
2101          * If the defrag_ratio is set to 0 then kmalloc() always
2102          * returns node local objects. If the ratio is higher then kmalloc()
2103          * may return off node objects because partial slabs are obtained
2104          * from other nodes and filled up.
2105          *
2106          * If /sys/kernel/slab/xx/remote_node_defrag_ratio is set to 100
2107          * (which makes defrag_ratio = 1000) then every (well almost)
2108          * allocation will first attempt to defrag slab caches on other nodes.
2109          * This means scanning over all nodes to look for partial slabs which
2110          * may be expensive if we do it every time we are trying to find a slab
2111          * with available objects.
2112          */
2113         if (!s->remote_node_defrag_ratio ||
2114                         get_cycles() % 1024 > s->remote_node_defrag_ratio)
2115                 return NULL;
2116
2117         do {
2118                 cpuset_mems_cookie = read_mems_allowed_begin();
2119                 zonelist = node_zonelist(mempolicy_slab_node(), flags);
2120                 for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) {
2121                         struct kmem_cache_node *n;
2122
2123                         n = get_node(s, zone_to_nid(zone));
2124
2125                         if (n && cpuset_zone_allowed(zone, flags) &&
2126                                         n->nr_partial > s->min_partial) {
2127                                 object = get_partial_node(s, n, ret_page, flags);
2128                                 if (object) {
2129                                         /*
2130                                          * Don't check read_mems_allowed_retry()
2131                                          * here - if mems_allowed was updated in
2132                                          * parallel, that was a harmless race
2133                                          * between allocation and the cpuset
2134                                          * update
2135                                          */
2136                                         return object;
2137                                 }
2138                         }
2139                 }
2140         } while (read_mems_allowed_retry(cpuset_mems_cookie));
2141 #endif  /* CONFIG_NUMA */
2142         return NULL;
2143 }
2144
2145 /*
2146  * Get a partial page, lock it and return it.
2147  */
2148 static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
2149                          struct page **ret_page)
2150 {
2151         void *object;
2152         int searchnode = node;
2153
2154         if (node == NUMA_NO_NODE)
2155                 searchnode = numa_mem_id();
2156
2157         object = get_partial_node(s, get_node(s, searchnode), ret_page, flags);
2158         if (object || node != NUMA_NO_NODE)
2159                 return object;
2160
2161         return get_any_partial(s, flags, ret_page);
2162 }
2163
2164 #ifdef CONFIG_PREEMPTION
2165 /*
2166  * Calculate the next globally unique transaction for disambiguation
2167  * during cmpxchg. The transactions start with the cpu number and are then
2168  * incremented by CONFIG_NR_CPUS.
2169  */
2170 #define TID_STEP  roundup_pow_of_two(CONFIG_NR_CPUS)
2171 #else
2172 /*
2173  * No preemption supported therefore also no need to check for
2174  * different cpus.
2175  */
2176 #define TID_STEP 1
2177 #endif
2178
2179 static inline unsigned long next_tid(unsigned long tid)
2180 {
2181         return tid + TID_STEP;
2182 }
2183
2184 #ifdef SLUB_DEBUG_CMPXCHG
2185 static inline unsigned int tid_to_cpu(unsigned long tid)
2186 {
2187         return tid % TID_STEP;
2188 }
2189
2190 static inline unsigned long tid_to_event(unsigned long tid)
2191 {
2192         return tid / TID_STEP;
2193 }
2194 #endif
2195
2196 static inline unsigned int init_tid(int cpu)
2197 {
2198         return cpu;
2199 }
2200
2201 static inline void note_cmpxchg_failure(const char *n,
2202                 const struct kmem_cache *s, unsigned long tid)
2203 {
2204 #ifdef SLUB_DEBUG_CMPXCHG
2205         unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);
2206
2207         pr_info("%s %s: cmpxchg redo ", n, s->name);
2208
2209 #ifdef CONFIG_PREEMPTION
2210         if (tid_to_cpu(tid) != tid_to_cpu(actual_tid))
2211                 pr_warn("due to cpu change %d -> %d\n",
2212                         tid_to_cpu(tid), tid_to_cpu(actual_tid));
2213         else
2214 #endif
2215         if (tid_to_event(tid) != tid_to_event(actual_tid))
2216                 pr_warn("due to cpu running other code. Event %ld->%ld\n",
2217                         tid_to_event(tid), tid_to_event(actual_tid));
2218         else
2219                 pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n",
2220                         actual_tid, tid, next_tid(tid));
2221 #endif
2222         stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
2223 }
2224
2225 static void init_kmem_cache_cpus(struct kmem_cache *s)
2226 {
2227         int cpu;
2228
2229         for_each_possible_cpu(cpu)
2230                 per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
2231 }
2232
2233 /*
2234  * Finishes removing the cpu slab. Merges cpu's freelist with page's freelist,
2235  * unfreezes the slabs and puts it on the proper list.
2236  * Assumes the slab has been already safely taken away from kmem_cache_cpu
2237  * by the caller.
2238  */
2239 static void deactivate_slab(struct kmem_cache *s, struct page *page,
2240                             void *freelist)
2241 {
2242         enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE };
2243         struct kmem_cache_node *n = get_node(s, page_to_nid(page));
2244         int lock = 0, free_delta = 0;
2245         enum slab_modes l = M_NONE, m = M_NONE;
2246         void *nextfree, *freelist_iter, *freelist_tail;
2247         int tail = DEACTIVATE_TO_HEAD;
2248         unsigned long flags = 0;
2249         struct page new;
2250         struct page old;
2251
2252         if (page->freelist) {
2253                 stat(s, DEACTIVATE_REMOTE_FREES);
2254                 tail = DEACTIVATE_TO_TAIL;
2255         }
2256
2257         /*
2258          * Stage one: Count the objects on cpu's freelist as free_delta and
2259          * remember the last object in freelist_tail for later splicing.
2260          */
2261         freelist_tail = NULL;
2262         freelist_iter = freelist;
2263         while (freelist_iter) {
2264                 nextfree = get_freepointer(s, freelist_iter);
2265
2266                 /*
2267                  * If 'nextfree' is invalid, it is possible that the object at
2268                  * 'freelist_iter' is already corrupted.  So isolate all objects
2269                  * starting at 'freelist_iter' by skipping them.
2270                  */
2271                 if (freelist_corrupted(s, page, &freelist_iter, nextfree))
2272                         break;
2273
2274                 freelist_tail = freelist_iter;
2275                 free_delta++;
2276
2277                 freelist_iter = nextfree;
2278         }
2279
2280         /*
2281          * Stage two: Unfreeze the page while splicing the per-cpu
2282          * freelist to the head of page's freelist.
2283          *
2284          * Ensure that the page is unfrozen while the list presence
2285          * reflects the actual number of objects during unfreeze.
2286          *
2287          * We setup the list membership and then perform a cmpxchg
2288          * with the count. If there is a mismatch then the page
2289          * is not unfrozen but the page is on the wrong list.
2290          *
2291          * Then we restart the process which may have to remove
2292          * the page from the list that we just put it on again
2293          * because the number of objects in the slab may have
2294          * changed.
2295          */
2296 redo:
2297
2298         old.freelist = READ_ONCE(page->freelist);
2299         old.counters = READ_ONCE(page->counters);
2300         VM_BUG_ON(!old.frozen);
2301
2302         /* Determine target state of the slab */
2303         new.counters = old.counters;
2304         if (freelist_tail) {
2305                 new.inuse -= free_delta;
2306                 set_freepointer(s, freelist_tail, old.freelist);
2307                 new.freelist = freelist;
2308         } else
2309                 new.freelist = old.freelist;
2310
2311         new.frozen = 0;
2312
2313         if (!new.inuse && n->nr_partial >= s->min_partial)
2314                 m = M_FREE;
2315         else if (new.freelist) {
2316                 m = M_PARTIAL;
2317                 if (!lock) {
2318                         lock = 1;
2319                         /*
2320                          * Taking the spinlock removes the possibility
2321                          * that acquire_slab() will see a slab page that
2322                          * is frozen
2323                          */
2324                         spin_lock_irqsave(&n->list_lock, flags);
2325                 }
2326         } else {
2327                 m = M_FULL;
2328                 if (kmem_cache_debug_flags(s, SLAB_STORE_USER) && !lock) {
2329                         lock = 1;
2330                         /*
2331                          * This also ensures that the scanning of full
2332                          * slabs from diagnostic functions will not see
2333                          * any frozen slabs.
2334                          */
2335                         spin_lock_irqsave(&n->list_lock, flags);
2336                 }
2337         }
2338
2339         if (l != m) {
2340                 if (l == M_PARTIAL)
2341                         remove_partial(n, page);
2342                 else if (l == M_FULL)
2343                         remove_full(s, n, page);
2344
2345                 if (m == M_PARTIAL)
2346                         add_partial(n, page, tail);
2347                 else if (m == M_FULL)
2348                         add_full(s, n, page);
2349         }
2350
2351         l = m;
2352         if (!cmpxchg_double_slab(s, page,
2353                                 old.freelist, old.counters,
2354                                 new.freelist, new.counters,
2355                                 "unfreezing slab"))
2356                 goto redo;
2357
2358         if (lock)
2359                 spin_unlock_irqrestore(&n->list_lock, flags);
2360
2361         if (m == M_PARTIAL)
2362                 stat(s, tail);
2363         else if (m == M_FULL)
2364                 stat(s, DEACTIVATE_FULL);
2365         else if (m == M_FREE) {
2366                 stat(s, DEACTIVATE_EMPTY);
2367                 discard_slab(s, page);
2368                 stat(s, FREE_SLAB);
2369         }
2370 }
2371
2372 #ifdef CONFIG_SLUB_CPU_PARTIAL
2373 static void __unfreeze_partials(struct kmem_cache *s, struct page *partial_page)
2374 {
2375         struct kmem_cache_node *n = NULL, *n2 = NULL;
2376         struct page *page, *discard_page = NULL;
2377         unsigned long flags = 0;
2378
2379         while (partial_page) {
2380                 struct page new;
2381                 struct page old;
2382
2383                 page = partial_page;
2384                 partial_page = page->next;
2385
2386                 n2 = get_node(s, page_to_nid(page));
2387                 if (n != n2) {
2388                         if (n)
2389                                 spin_unlock_irqrestore(&n->list_lock, flags);
2390
2391                         n = n2;
2392                         spin_lock_irqsave(&n->list_lock, flags);
2393                 }
2394
2395                 do {
2396
2397                         old.freelist = page->freelist;
2398                         old.counters = page->counters;
2399                         VM_BUG_ON(!old.frozen);
2400
2401                         new.counters = old.counters;
2402                         new.freelist = old.freelist;
2403
2404                         new.frozen = 0;
2405
2406                 } while (!__cmpxchg_double_slab(s, page,
2407                                 old.freelist, old.counters,
2408                                 new.freelist, new.counters,
2409                                 "unfreezing slab"));
2410
2411                 if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) {
2412                         page->next = discard_page;
2413                         discard_page = page;
2414                 } else {
2415                         add_partial(n, page, DEACTIVATE_TO_TAIL);
2416                         stat(s, FREE_ADD_PARTIAL);
2417                 }
2418         }
2419
2420         if (n)
2421                 spin_unlock_irqrestore(&n->list_lock, flags);
2422
2423         while (discard_page) {
2424                 page = discard_page;
2425                 discard_page = discard_page->next;
2426
2427                 stat(s, DEACTIVATE_EMPTY);
2428                 discard_slab(s, page);
2429                 stat(s, FREE_SLAB);
2430         }
2431 }
2432
2433 /*
2434  * Unfreeze all the cpu partial slabs.
2435  */
2436 static void unfreeze_partials(struct kmem_cache *s)
2437 {
2438         struct page *partial_page;
2439         unsigned long flags;
2440
2441         local_irq_save(flags);
2442         partial_page = this_cpu_read(s->cpu_slab->partial);
2443         this_cpu_write(s->cpu_slab->partial, NULL);
2444         local_irq_restore(flags);
2445
2446         if (partial_page)
2447                 __unfreeze_partials(s, partial_page);
2448 }
2449
2450 static void unfreeze_partials_cpu(struct kmem_cache *s,
2451                                   struct kmem_cache_cpu *c)
2452 {
2453         struct page *partial_page;
2454
2455         partial_page = slub_percpu_partial(c);
2456         c->partial = NULL;
2457
2458         if (partial_page)
2459                 __unfreeze_partials(s, partial_page);
2460 }
2461
2462 #else   /* CONFIG_SLUB_CPU_PARTIAL */
2463
2464 static inline void unfreeze_partials(struct kmem_cache *s) { }
2465 static inline void unfreeze_partials_cpu(struct kmem_cache *s,
2466                                   struct kmem_cache_cpu *c) { }
2467
2468 #endif  /* CONFIG_SLUB_CPU_PARTIAL */
2469
2470 /*
2471  * Put a page that was just frozen (in __slab_free|get_partial_node) into a
2472  * partial page slot if available.
2473  *
2474  * If we did not find a slot then simply move all the partials to the
2475  * per node partial list.
2476  */
2477 static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
2478 {
2479 #ifdef CONFIG_SLUB_CPU_PARTIAL
2480         struct page *oldpage;
2481         int pages;
2482         int pobjects;
2483
2484         preempt_disable();
2485         do {
2486                 pages = 0;
2487                 pobjects = 0;
2488                 oldpage = this_cpu_read(s->cpu_slab->partial);
2489
2490                 if (oldpage) {
2491                         pobjects = oldpage->pobjects;
2492                         pages = oldpage->pages;
2493                         if (drain && pobjects > slub_cpu_partial(s)) {
2494                                 /*
2495                                  * partial array is full. Move the existing
2496                                  * set to the per node partial list.
2497                                  */
2498                                 unfreeze_partials(s);
2499                                 oldpage = NULL;
2500                                 pobjects = 0;
2501                                 pages = 0;
2502                                 stat(s, CPU_PARTIAL_DRAIN);
2503                         }
2504                 }
2505
2506                 pages++;
2507                 pobjects += page->objects - page->inuse;
2508
2509                 page->pages = pages;
2510                 page->pobjects = pobjects;
2511                 page->next = oldpage;
2512
2513         } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page)
2514                                                                 != oldpage);
2515         preempt_enable();
2516 #endif  /* CONFIG_SLUB_CPU_PARTIAL */
2517 }
2518
2519 static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
2520 {
2521         unsigned long flags;
2522         struct page *page;
2523         void *freelist;
2524
2525         local_irq_save(flags);
2526
2527         page = c->page;
2528         freelist = c->freelist;
2529
2530         c->page = NULL;
2531         c->freelist = NULL;
2532         c->tid = next_tid(c->tid);
2533
2534         local_irq_restore(flags);
2535
2536         if (page) {
2537                 deactivate_slab(s, page, freelist);
2538                 stat(s, CPUSLAB_FLUSH);
2539         }
2540 }
2541
2542 static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
2543 {
2544         struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
2545         void *freelist = c->freelist;
2546         struct page *page = c->page;
2547
2548         c->page = NULL;
2549         c->freelist = NULL;
2550         c->tid = next_tid(c->tid);
2551
2552         if (page) {
2553                 deactivate_slab(s, page, freelist);
2554                 stat(s, CPUSLAB_FLUSH);
2555         }
2556
2557         unfreeze_partials_cpu(s, c);
2558 }
2559
2560 struct slub_flush_work {
2561         struct work_struct work;
2562         struct kmem_cache *s;
2563         bool skip;
2564 };
2565
2566 /*
2567  * Flush cpu slab.
2568  *
2569  * Called from CPU work handler with migration disabled.
2570  */
2571 static void flush_cpu_slab(struct work_struct *w)
2572 {
2573         struct kmem_cache *s;
2574         struct kmem_cache_cpu *c;
2575         struct slub_flush_work *sfw;
2576
2577         sfw = container_of(w, struct slub_flush_work, work);
2578
2579         s = sfw->s;
2580         c = this_cpu_ptr(s->cpu_slab);
2581
2582         if (c->page)
2583                 flush_slab(s, c);
2584
2585         unfreeze_partials(s);
2586 }
2587
2588 static bool has_cpu_slab(int cpu, struct kmem_cache *s)
2589 {
2590         struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
2591
2592         return c->page || slub_percpu_partial(c);
2593 }
2594
2595 static DEFINE_MUTEX(flush_lock);
2596 static DEFINE_PER_CPU(struct slub_flush_work, slub_flush);
2597
2598 static void flush_all_cpus_locked(struct kmem_cache *s)
2599 {
2600         struct slub_flush_work *sfw;
2601         unsigned int cpu;
2602
2603         lockdep_assert_cpus_held();
2604         mutex_lock(&flush_lock);
2605
2606         for_each_online_cpu(cpu) {
2607                 sfw = &per_cpu(slub_flush, cpu);
2608                 if (!has_cpu_slab(cpu, s)) {
2609                         sfw->skip = true;
2610                         continue;
2611                 }
2612                 INIT_WORK(&sfw->work, flush_cpu_slab);
2613                 sfw->skip = false;
2614                 sfw->s = s;
2615                 schedule_work_on(cpu, &sfw->work);
2616         }
2617
2618         for_each_online_cpu(cpu) {
2619                 sfw = &per_cpu(slub_flush, cpu);
2620                 if (sfw->skip)
2621                         continue;
2622                 flush_work(&sfw->work);
2623         }
2624
2625         mutex_unlock(&flush_lock);
2626 }
2627
2628 static void flush_all(struct kmem_cache *s)
2629 {
2630         cpus_read_lock();
2631         flush_all_cpus_locked(s);
2632         cpus_read_unlock();
2633 }
2634
2635 /*
2636  * Use the cpu notifier to insure that the cpu slabs are flushed when
2637  * necessary.
2638  */
2639 static int slub_cpu_dead(unsigned int cpu)
2640 {
2641         struct kmem_cache *s;
2642
2643         mutex_lock(&slab_mutex);
2644         list_for_each_entry(s, &slab_caches, list)
2645                 __flush_cpu_slab(s, cpu);
2646         mutex_unlock(&slab_mutex);
2647         return 0;
2648 }
2649
2650 /*
2651  * Check if the objects in a per cpu structure fit numa
2652  * locality expectations.
2653  */
2654 static inline int node_match(struct page *page, int node)
2655 {
2656 #ifdef CONFIG_NUMA
2657         if (node != NUMA_NO_NODE && page_to_nid(page) != node)
2658                 return 0;
2659 #endif
2660         return 1;
2661 }
2662
2663 #ifdef CONFIG_SLUB_DEBUG
2664 static int count_free(struct page *page)
2665 {
2666         return page->objects - page->inuse;
2667 }
2668
2669 static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
2670 {
2671         return atomic_long_read(&n->total_objects);
2672 }
2673 #endif /* CONFIG_SLUB_DEBUG */
2674
2675 #if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS)
2676 static unsigned long count_partial(struct kmem_cache_node *n,
2677                                         int (*get_count)(struct page *))
2678 {
2679         unsigned long flags;
2680         unsigned long x = 0;
2681         struct page *page;
2682
2683         spin_lock_irqsave(&n->list_lock, flags);
2684         list_for_each_entry(page, &n->partial, slab_list)
2685                 x += get_count(page);
2686         spin_unlock_irqrestore(&n->list_lock, flags);
2687         return x;
2688 }
2689 #endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
2690
2691 static noinline void
2692 slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
2693 {
2694 #ifdef CONFIG_SLUB_DEBUG
2695         static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
2696                                       DEFAULT_RATELIMIT_BURST);
2697         int node;
2698         struct kmem_cache_node *n;
2699
2700         if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs))
2701                 return;
2702
2703         pr_warn("SLUB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n",
2704                 nid, gfpflags, &gfpflags);
2705         pr_warn("  cache: %s, object size: %u, buffer size: %u, default order: %u, min order: %u\n",
2706                 s->name, s->object_size, s->size, oo_order(s->oo),
2707                 oo_order(s->min));
2708
2709         if (oo_order(s->min) > get_order(s->object_size))
2710                 pr_warn("  %s debugging increased min order, use slub_debug=O to disable.\n",
2711                         s->name);
2712
2713         for_each_kmem_cache_node(s, node, n) {
2714                 unsigned long nr_slabs;
2715                 unsigned long nr_objs;
2716                 unsigned long nr_free;
2717
2718                 nr_free  = count_partial(n, count_free);
2719                 nr_slabs = node_nr_slabs(n);
2720                 nr_objs  = node_nr_objs(n);
2721
2722                 pr_warn("  node %d: slabs: %ld, objs: %ld, free: %ld\n",
2723                         node, nr_slabs, nr_objs, nr_free);
2724         }
2725 #endif
2726 }
2727
2728 static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags)
2729 {
2730         if (unlikely(PageSlabPfmemalloc(page)))
2731                 return gfp_pfmemalloc_allowed(gfpflags);
2732
2733         return true;
2734 }
2735
2736 /*
2737  * A variant of pfmemalloc_match() that tests page flags without asserting
2738  * PageSlab. Intended for opportunistic checks before taking a lock and
2739  * rechecking that nobody else freed the page under us.
2740  */
2741 static inline bool pfmemalloc_match_unsafe(struct page *page, gfp_t gfpflags)
2742 {
2743         if (unlikely(__PageSlabPfmemalloc(page)))
2744                 return gfp_pfmemalloc_allowed(gfpflags);
2745
2746         return true;
2747 }
2748
2749 /*
2750  * Check the page->freelist of a page and either transfer the freelist to the
2751  * per cpu freelist or deactivate the page.
2752  *
2753  * The page is still frozen if the return value is not NULL.
2754  *
2755  * If this function returns NULL then the page has been unfrozen.
2756  *
2757  * This function must be called with interrupt disabled.
2758  */
2759 static inline void *get_freelist(struct kmem_cache *s, struct page *page)
2760 {
2761         struct page new;
2762         unsigned long counters;
2763         void *freelist;
2764
2765         do {
2766                 freelist = page->freelist;
2767                 counters = page->counters;
2768
2769                 new.counters = counters;
2770                 VM_BUG_ON(!new.frozen);
2771
2772                 new.inuse = page->objects;
2773                 new.frozen = freelist != NULL;
2774
2775         } while (!__cmpxchg_double_slab(s, page,
2776                 freelist, counters,
2777                 NULL, new.counters,
2778                 "get_freelist"));
2779
2780         return freelist;
2781 }
2782
2783 /*
2784  * Slow path. The lockless freelist is empty or we need to perform
2785  * debugging duties.
2786  *
2787  * Processing is still very fast if new objects have been freed to the
2788  * regular freelist. In that case we simply take over the regular freelist
2789  * as the lockless freelist and zap the regular freelist.
2790  *
2791  * If that is not working then we fall back to the partial lists. We take the
2792  * first element of the freelist as the object to allocate now and move the
2793  * rest of the freelist to the lockless freelist.
2794  *
2795  * And if we were unable to get a new slab from the partial slab lists then
2796  * we need to allocate a new slab. This is the slowest path since it involves
2797  * a call to the page allocator and the setup of a new slab.
2798  *
2799  * Version of __slab_alloc to use when we know that preemption is
2800  * already disabled (which is the case for bulk allocation).
2801  */
2802 static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
2803                           unsigned long addr, struct kmem_cache_cpu *c)
2804 {
2805         void *freelist;
2806         struct page *page;
2807         unsigned long flags;
2808
2809         stat(s, ALLOC_SLOWPATH);
2810
2811 reread_page:
2812
2813         page = READ_ONCE(c->page);
2814         if (!page) {
2815                 /*
2816                  * if the node is not online or has no normal memory, just
2817                  * ignore the node constraint
2818                  */
2819                 if (unlikely(node != NUMA_NO_NODE &&
2820                              !node_isset(node, slab_nodes)))
2821                         node = NUMA_NO_NODE;
2822                 goto new_slab;
2823         }
2824 redo:
2825
2826         if (unlikely(!node_match(page, node))) {
2827                 /*
2828                  * same as above but node_match() being false already
2829                  * implies node != NUMA_NO_NODE
2830                  */
2831                 if (!node_isset(node, slab_nodes)) {
2832                         node = NUMA_NO_NODE;
2833                         goto redo;
2834                 } else {
2835                         stat(s, ALLOC_NODE_MISMATCH);
2836                         goto deactivate_slab;
2837                 }
2838         }
2839
2840         /*
2841          * By rights, we should be searching for a slab page that was
2842          * PFMEMALLOC but right now, we are losing the pfmemalloc
2843          * information when the page leaves the per-cpu allocator
2844          */
2845         if (unlikely(!pfmemalloc_match_unsafe(page, gfpflags)))
2846                 goto deactivate_slab;
2847
2848         /* must check again c->page in case IRQ handler changed it */
2849         local_irq_save(flags);
2850         if (unlikely(page != c->page)) {
2851                 local_irq_restore(flags);
2852                 goto reread_page;
2853         }
2854         freelist = c->freelist;
2855         if (freelist)
2856                 goto load_freelist;
2857
2858         freelist = get_freelist(s, page);
2859
2860         if (!freelist) {
2861                 c->page = NULL;
2862                 local_irq_restore(flags);
2863                 stat(s, DEACTIVATE_BYPASS);
2864                 goto new_slab;
2865         }
2866
2867         stat(s, ALLOC_REFILL);
2868
2869 load_freelist:
2870
2871         lockdep_assert_irqs_disabled();
2872
2873         /*
2874          * freelist is pointing to the list of objects to be used.
2875          * page is pointing to the page from which the objects are obtained.
2876          * That page must be frozen for per cpu allocations to work.
2877          */
2878         VM_BUG_ON(!c->page->frozen);
2879         c->freelist = get_freepointer(s, freelist);
2880         c->tid = next_tid(c->tid);
2881         local_irq_restore(flags);
2882         return freelist;
2883
2884 deactivate_slab:
2885
2886         local_irq_save(flags);
2887         if (page != c->page) {
2888                 local_irq_restore(flags);
2889                 goto reread_page;
2890         }
2891         freelist = c->freelist;
2892         c->page = NULL;
2893         c->freelist = NULL;
2894         local_irq_restore(flags);
2895         deactivate_slab(s, page, freelist);
2896
2897 new_slab:
2898
2899         if (slub_percpu_partial(c)) {
2900                 local_irq_save(flags);
2901                 if (unlikely(c->page)) {
2902                         local_irq_restore(flags);
2903                         goto reread_page;
2904                 }
2905                 if (unlikely(!slub_percpu_partial(c))) {
2906                         local_irq_restore(flags);
2907                         goto new_objects; /* stolen by an IRQ handler */
2908                 }
2909
2910                 page = c->page = slub_percpu_partial(c);
2911                 slub_set_percpu_partial(c, page);
2912                 local_irq_restore(flags);
2913                 stat(s, CPU_PARTIAL_ALLOC);
2914                 goto redo;
2915         }
2916
2917 new_objects:
2918
2919         freelist = get_partial(s, gfpflags, node, &page);
2920         if (freelist)
2921                 goto check_new_page;
2922
2923         put_cpu_ptr(s->cpu_slab);
2924         page = new_slab(s, gfpflags, node);
2925         c = get_cpu_ptr(s->cpu_slab);
2926
2927         if (unlikely(!page)) {
2928                 slab_out_of_memory(s, gfpflags, node);
2929                 return NULL;
2930         }
2931
2932         /*
2933          * No other reference to the page yet so we can
2934          * muck around with it freely without cmpxchg
2935          */
2936         freelist = page->freelist;
2937         page->freelist = NULL;
2938
2939         stat(s, ALLOC_SLAB);
2940
2941 check_new_page:
2942
2943         if (kmem_cache_debug(s)) {
2944                 if (!alloc_debug_processing(s, page, freelist, addr)) {
2945                         /* Slab failed checks. Next slab needed */
2946                         goto new_slab;
2947                 } else {
2948                         /*
2949                          * For debug case, we don't load freelist so that all
2950                          * allocations go through alloc_debug_processing()
2951                          */
2952                         goto return_single;
2953                 }
2954         }
2955
2956         if (unlikely(!pfmemalloc_match(page, gfpflags)))
2957                 /*
2958                  * For !pfmemalloc_match() case we don't load freelist so that
2959                  * we don't make further mismatched allocations easier.
2960                  */
2961                 goto return_single;
2962
2963 retry_load_page:
2964
2965         local_irq_save(flags);
2966         if (unlikely(c->page)) {
2967                 void *flush_freelist = c->freelist;
2968                 struct page *flush_page = c->page;
2969
2970                 c->page = NULL;
2971                 c->freelist = NULL;
2972                 c->tid = next_tid(c->tid);
2973
2974                 local_irq_restore(flags);
2975
2976                 deactivate_slab(s, flush_page, flush_freelist);
2977
2978                 stat(s, CPUSLAB_FLUSH);
2979
2980                 goto retry_load_page;
2981         }
2982         c->page = page;
2983
2984         goto load_freelist;
2985
2986 return_single:
2987
2988         deactivate_slab(s, page, get_freepointer(s, freelist));
2989         return freelist;
2990 }
2991
2992 /*
2993  * A wrapper for ___slab_alloc() for contexts where preemption is not yet
2994  * disabled. Compensates for possible cpu changes by refetching the per cpu area
2995  * pointer.
2996  */
2997 static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
2998                           unsigned long addr, struct kmem_cache_cpu *c)
2999 {
3000         void *p;
3001
3002 #ifdef CONFIG_PREEMPT_COUNT
3003         /*
3004          * We may have been preempted and rescheduled on a different
3005          * cpu before disabling preemption. Need to reload cpu area
3006          * pointer.
3007          */
3008         c = get_cpu_ptr(s->cpu_slab);
3009 #endif
3010
3011         p = ___slab_alloc(s, gfpflags, node, addr, c);
3012 #ifdef CONFIG_PREEMPT_COUNT
3013         put_cpu_ptr(s->cpu_slab);
3014 #endif
3015         return p;
3016 }
3017
3018 /*
3019  * If the object has been wiped upon free, make sure it's fully initialized by
3020  * zeroing out freelist pointer.
3021  */
3022 static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
3023                                                    void *obj)
3024 {
3025         if (unlikely(slab_want_init_on_free(s)) && obj)
3026                 memset((void *)((char *)kasan_reset_tag(obj) + s->offset),
3027                         0, sizeof(void *));
3028 }
3029
3030 /*
3031  * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
3032  * have the fastpath folded into their functions. So no function call
3033  * overhead for requests that can be satisfied on the fastpath.
3034  *
3035  * The fastpath works by first checking if the lockless freelist can be used.
3036  * If not then __slab_alloc is called for slow processing.
3037  *
3038  * Otherwise we can simply pick the next object from the lockless free list.
3039  */
3040 static __always_inline void *slab_alloc_node(struct kmem_cache *s,
3041                 gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
3042 {
3043         void *object;
3044         struct kmem_cache_cpu *c;
3045         struct page *page;
3046         unsigned long tid;
3047         struct obj_cgroup *objcg = NULL;
3048         bool init = false;
3049
3050         s = slab_pre_alloc_hook(s, &objcg, 1, gfpflags);
3051         if (!s)
3052                 return NULL;
3053
3054         object = kfence_alloc(s, orig_size, gfpflags);
3055         if (unlikely(object))
3056                 goto out;
3057
3058 redo:
3059         /*
3060          * Must read kmem_cache cpu data via this cpu ptr. Preemption is
3061          * enabled. We may switch back and forth between cpus while
3062          * reading from one cpu area. That does not matter as long
3063          * as we end up on the original cpu again when doing the cmpxchg.
3064          *
3065          * We must guarantee that tid and kmem_cache_cpu are retrieved on the
3066          * same cpu. We read first the kmem_cache_cpu pointer and use it to read
3067          * the tid. If we are preempted and switched to another cpu between the
3068          * two reads, it's OK as the two are still associated with the same cpu
3069          * and cmpxchg later will validate the cpu.
3070          */
3071         c = raw_cpu_ptr(s->cpu_slab);
3072         tid = READ_ONCE(c->tid);
3073
3074         /*
3075          * Irqless object alloc/free algorithm used here depends on sequence
3076          * of fetching cpu_slab's data. tid should be fetched before anything
3077          * on c to guarantee that object and page associated with previous tid
3078          * won't be used with current tid. If we fetch tid first, object and
3079          * page could be one associated with next tid and our alloc/free
3080          * request will be failed. In this case, we will retry. So, no problem.
3081          */
3082         barrier();
3083
3084         /*
3085          * The transaction ids are globally unique per cpu and per operation on
3086          * a per cpu queue. Thus they can be guarantee that the cmpxchg_double
3087          * occurs on the right processor and that there was no operation on the
3088          * linked list in between.
3089          */
3090
3091         object = c->freelist;
3092         page = c->page;
3093         if (unlikely(!object || !page || !node_match(page, node))) {
3094                 object = __slab_alloc(s, gfpflags, node, addr, c);
3095         } else {
3096                 void *next_object = get_freepointer_safe(s, object);
3097
3098                 /*
3099                  * The cmpxchg will only match if there was no additional
3100                  * operation and if we are on the right processor.
3101                  *
3102                  * The cmpxchg does the following atomically (without lock
3103                  * semantics!)
3104                  * 1. Relocate first pointer to the current per cpu area.
3105                  * 2. Verify that tid and freelist have not been changed
3106                  * 3. If they were not changed replace tid and freelist
3107                  *
3108                  * Since this is without lock semantics the protection is only
3109                  * against code executing on this cpu *not* from access by
3110                  * other cpus.
3111                  */
3112                 if (unlikely(!this_cpu_cmpxchg_double(
3113                                 s->cpu_slab->freelist, s->cpu_slab->tid,
3114                                 object, tid,
3115                                 next_object, next_tid(tid)))) {
3116
3117                         note_cmpxchg_failure("slab_alloc", s, tid);
3118                         goto redo;
3119                 }
3120                 prefetch_freepointer(s, next_object);
3121                 stat(s, ALLOC_FASTPATH);
3122         }
3123
3124         maybe_wipe_obj_freeptr(s, object);
3125         init = slab_want_init_on_alloc(gfpflags, s);
3126
3127 out:
3128         slab_post_alloc_hook(s, objcg, gfpflags, 1, &object, init);
3129
3130         return object;
3131 }
3132
3133 static __always_inline void *slab_alloc(struct kmem_cache *s,
3134                 gfp_t gfpflags, unsigned long addr, size_t orig_size)
3135 {
3136         return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr, orig_size);
3137 }
3138
3139 void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
3140 {
3141         void *ret = slab_alloc(s, gfpflags, _RET_IP_, s->object_size);
3142
3143         trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size,
3144                                 s->size, gfpflags);
3145
3146         return ret;
3147 }
3148 EXPORT_SYMBOL(kmem_cache_alloc);
3149
3150 #ifdef CONFIG_TRACING
3151 void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
3152 {
3153         void *ret = slab_alloc(s, gfpflags, _RET_IP_, size);
3154         trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
3155         ret = kasan_kmalloc(s, ret, size, gfpflags);
3156         return ret;
3157 }
3158 EXPORT_SYMBOL(kmem_cache_alloc_trace);
3159 #endif
3160
3161 #ifdef CONFIG_NUMA
3162 void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
3163 {
3164         void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_, s->object_size);
3165
3166         trace_kmem_cache_alloc_node(_RET_IP_, ret,
3167                                     s->object_size, s->size, gfpflags, node);
3168
3169         return ret;
3170 }
3171 EXPORT_SYMBOL(kmem_cache_alloc_node);
3172
3173 #ifdef CONFIG_TRACING
3174 void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
3175                                     gfp_t gfpflags,
3176                                     int node, size_t size)
3177 {
3178         void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_, size);
3179
3180         trace_kmalloc_node(_RET_IP_, ret,
3181                            size, s->size, gfpflags, node);
3182
3183         ret = kasan_kmalloc(s, ret, size, gfpflags);
3184         return ret;
3185 }
3186 EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
3187 #endif
3188 #endif  /* CONFIG_NUMA */
3189
3190 /*
3191  * Slow path handling. This may still be called frequently since objects
3192  * have a longer lifetime than the cpu slabs in most processing loads.
3193  *
3194  * So we still attempt to reduce cache line usage. Just take the slab
3195  * lock and free the item. If there is no additional partial page
3196  * handling required then we can return immediately.
3197  */
3198 static void __slab_free(struct kmem_cache *s, struct page *page,
3199                         void *head, void *tail, int cnt,
3200                         unsigned long addr)
3201
3202 {
3203         void *prior;
3204         int was_frozen;
3205         struct page new;
3206         unsigned long counters;
3207         struct kmem_cache_node *n = NULL;
3208         unsigned long flags;
3209
3210         stat(s, FREE_SLOWPATH);
3211
3212         if (kfence_free(head))
3213                 return;
3214
3215         if (kmem_cache_debug(s) &&
3216             !free_debug_processing(s, page, head, tail, cnt, addr))
3217                 return;
3218
3219         do {
3220                 if (unlikely(n)) {
3221                         spin_unlock_irqrestore(&n->list_lock, flags);
3222                         n = NULL;
3223                 }
3224                 prior = page->freelist;
3225                 counters = page->counters;
3226                 set_freepointer(s, tail, prior);
3227                 new.counters = counters;
3228                 was_frozen = new.frozen;
3229                 new.inuse -= cnt;
3230                 if ((!new.inuse || !prior) && !was_frozen) {
3231
3232                         if (kmem_cache_has_cpu_partial(s) && !prior) {
3233
3234                                 /*
3235                                  * Slab was on no list before and will be
3236                                  * partially empty
3237                                  * We can defer the list move and instead
3238                                  * freeze it.
3239                                  */
3240                                 new.frozen = 1;
3241
3242                         } else { /* Needs to be taken off a list */
3243
3244                                 n = get_node(s, page_to_nid(page));
3245                                 /*
3246                                  * Speculatively acquire the list_lock.
3247                                  * If the cmpxchg does not succeed then we may
3248                                  * drop the list_lock without any processing.
3249                                  *
3250                                  * Otherwise the list_lock will synchronize with
3251                                  * other processors updating the list of slabs.
3252                                  */
3253                                 spin_lock_irqsave(&n->list_lock, flags);
3254
3255                         }
3256                 }
3257
3258         } while (!cmpxchg_double_slab(s, page,
3259                 prior, counters,
3260                 head, new.counters,
3261                 "__slab_free"));
3262
3263         if (likely(!n)) {
3264
3265                 if (likely(was_frozen)) {
3266                         /*
3267                          * The list lock was not taken therefore no list
3268                          * activity can be necessary.
3269                          */
3270                         stat(s, FREE_FROZEN);
3271                 } else if (new.frozen) {
3272                         /*
3273                          * If we just froze the page then put it onto the
3274                          * per cpu partial list.
3275                          */
3276                         put_cpu_partial(s, page, 1);
3277                         stat(s, CPU_PARTIAL_FREE);
3278                 }
3279
3280                 return;
3281         }
3282
3283         if (unlikely(!new.inuse && n->nr_partial >= s->min_partial))
3284                 goto slab_empty;
3285
3286         /*
3287          * Objects left in the slab. If it was not on the partial list before
3288          * then add it.
3289          */
3290         if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
3291                 remove_full(s, n, page);
3292                 add_partial(n, page, DEACTIVATE_TO_TAIL);
3293                 stat(s, FREE_ADD_PARTIAL);
3294         }
3295         spin_unlock_irqrestore(&n->list_lock, flags);
3296         return;
3297
3298 slab_empty:
3299         if (prior) {
3300                 /*
3301                  * Slab on the partial list.
3302                  */
3303                 remove_partial(n, page);
3304                 stat(s, FREE_REMOVE_PARTIAL);
3305         } else {
3306                 /* Slab must be on the full list */
3307                 remove_full(s, n, page);
3308         }
3309
3310         spin_unlock_irqrestore(&n->list_lock, flags);
3311         stat(s, FREE_SLAB);
3312         discard_slab(s, page);
3313 }
3314
3315 /*
3316  * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
3317  * can perform fastpath freeing without additional function calls.
3318  *
3319  * The fastpath is only possible if we are freeing to the current cpu slab
3320  * of this processor. This typically the case if we have just allocated
3321  * the item before.
3322  *
3323  * If fastpath is not possible then fall back to __slab_free where we deal
3324  * with all sorts of special processing.
3325  *
3326  * Bulk free of a freelist with several objects (all pointing to the
3327  * same page) possible by specifying head and tail ptr, plus objects
3328  * count (cnt). Bulk free indicated by tail pointer being set.
3329  */
3330 static __always_inline void do_slab_free(struct kmem_cache *s,
3331                                 struct page *page, void *head, void *tail,
3332                                 int cnt, unsigned long addr)
3333 {
3334         void *tail_obj = tail ? : head;
3335         struct kmem_cache_cpu *c;
3336         unsigned long tid;
3337
3338         memcg_slab_free_hook(s, &head, 1);
3339 redo:
3340         /*
3341          * Determine the currently cpus per cpu slab.
3342          * The cpu may change afterward. However that does not matter since
3343          * data is retrieved via this pointer. If we are on the same cpu
3344          * during the cmpxchg then the free will succeed.
3345          */
3346         c = raw_cpu_ptr(s->cpu_slab);
3347         tid = READ_ONCE(c->tid);
3348
3349         /* Same with comment on barrier() in slab_alloc_node() */
3350         barrier();
3351
3352         if (likely(page == c->page)) {
3353                 void **freelist = READ_ONCE(c->freelist);
3354
3355                 set_freepointer(s, tail_obj, freelist);
3356
3357                 if (unlikely(!this_cpu_cmpxchg_double(
3358                                 s->cpu_slab->freelist, s->cpu_slab->tid,
3359                                 freelist, tid,
3360                                 head, next_tid(tid)))) {
3361
3362                         note_cmpxchg_failure("slab_free", s, tid);
3363                         goto redo;
3364                 }
3365                 stat(s, FREE_FASTPATH);
3366         } else
3367                 __slab_free(s, page, head, tail_obj, cnt, addr);
3368
3369 }
3370
3371 static __always_inline void slab_free(struct kmem_cache *s, struct page *page,
3372                                       void *head, void *tail, int cnt,
3373                                       unsigned long addr)
3374 {
3375         /*
3376          * With KASAN enabled slab_free_freelist_hook modifies the freelist
3377          * to remove objects, whose reuse must be delayed.
3378          */
3379         if (slab_free_freelist_hook(s, &head, &tail))
3380                 do_slab_free(s, page, head, tail, cnt, addr);
3381 }
3382
3383 #ifdef CONFIG_KASAN_GENERIC
3384 void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr)
3385 {
3386         do_slab_free(cache, virt_to_head_page(x), x, NULL, 1, addr);
3387 }
3388 #endif
3389
3390 void kmem_cache_free(struct kmem_cache *s, void *x)
3391 {
3392         s = cache_from_obj(s, x);
3393         if (!s)
3394                 return;
3395         slab_free(s, virt_to_head_page(x), x, NULL, 1, _RET_IP_);
3396         trace_kmem_cache_free(_RET_IP_, x, s->name);
3397 }
3398 EXPORT_SYMBOL(kmem_cache_free);
3399
3400 struct detached_freelist {
3401         struct page *page;
3402         void *tail;
3403         void *freelist;
3404         int cnt;
3405         struct kmem_cache *s;
3406 };
3407
3408 static inline void free_nonslab_page(struct page *page, void *object)
3409 {
3410         unsigned int order = compound_order(page);
3411
3412         VM_BUG_ON_PAGE(!PageCompound(page), page);
3413         kfree_hook(object);
3414         mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, -(PAGE_SIZE << order));
3415         __free_pages(page, order);
3416 }
3417
3418 /*
3419  * This function progressively scans the array with free objects (with
3420  * a limited look ahead) and extract objects belonging to the same
3421  * page.  It builds a detached freelist directly within the given
3422  * page/objects.  This can happen without any need for
3423  * synchronization, because the objects are owned by running process.
3424  * The freelist is build up as a single linked list in the objects.
3425  * The idea is, that this detached freelist can then be bulk
3426  * transferred to the real freelist(s), but only requiring a single
3427  * synchronization primitive.  Look ahead in the array is limited due
3428  * to performance reasons.
3429  */
3430 static inline
3431 int build_detached_freelist(struct kmem_cache *s, size_t size,
3432                             void **p, struct detached_freelist *df)
3433 {
3434         size_t first_skipped_index = 0;
3435         int lookahead = 3;
3436         void *object;
3437         struct page *page;
3438
3439         /* Always re-init detached_freelist */
3440         df->page = NULL;
3441
3442         do {
3443                 object = p[--size];
3444                 /* Do we need !ZERO_OR_NULL_PTR(object) here? (for kfree) */
3445         } while (!object && size);
3446
3447         if (!object)
3448                 return 0;
3449
3450         page = virt_to_head_page(object);
3451         if (!s) {
3452                 /* Handle kalloc'ed objects */
3453                 if (unlikely(!PageSlab(page))) {
3454                         free_nonslab_page(page, object);
3455                         p[size] = NULL; /* mark object processed */
3456                         return size;
3457                 }
3458                 /* Derive kmem_cache from object */
3459                 df->s = page->slab_cache;
3460         } else {
3461                 df->s = cache_from_obj(s, object); /* Support for memcg */
3462         }
3463
3464         if (is_kfence_address(object)) {
3465                 slab_free_hook(df->s, object, false);
3466                 __kfence_free(object);
3467                 p[size] = NULL; /* mark object processed */
3468                 return size;
3469         }
3470
3471         /* Start new detached freelist */
3472         df->page = page;
3473         set_freepointer(df->s, object, NULL);
3474         df->tail = object;
3475         df->freelist = object;
3476         p[size] = NULL; /* mark object processed */
3477         df->cnt = 1;
3478
3479         while (size) {
3480                 object = p[--size];
3481                 if (!object)
3482                         continue; /* Skip processed objects */
3483
3484                 /* df->page is always set at this point */
3485                 if (df->page == virt_to_head_page(object)) {
3486                         /* Opportunity build freelist */
3487                         set_freepointer(df->s, object, df->freelist);
3488                         df->freelist = object;
3489                         df->cnt++;
3490                         p[size] = NULL; /* mark object processed */
3491
3492                         continue;
3493                 }
3494
3495                 /* Limit look ahead search */
3496                 if (!--lookahead)
3497                         break;
3498
3499                 if (!first_skipped_index)
3500                         first_skipped_index = size + 1;
3501         }
3502
3503         return first_skipped_index;
3504 }
3505
3506 /* Note that interrupts must be enabled when calling this function. */
3507 void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
3508 {
3509         if (WARN_ON(!size))
3510                 return;
3511
3512         memcg_slab_free_hook(s, p, size);
3513         do {
3514                 struct detached_freelist df;
3515
3516                 size = build_detached_freelist(s, size, p, &df);
3517                 if (!df.page)
3518                         continue;
3519
3520                 slab_free(df.s, df.page, df.freelist, df.tail, df.cnt, _RET_IP_);
3521         } while (likely(size));
3522 }
3523 EXPORT_SYMBOL(kmem_cache_free_bulk);
3524
3525 /* Note that interrupts must be enabled when calling this function. */
3526 int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
3527                           void **p)
3528 {
3529         struct kmem_cache_cpu *c;
3530         int i;
3531         struct obj_cgroup *objcg = NULL;
3532
3533         /* memcg and kmem_cache debug support */
3534         s = slab_pre_alloc_hook(s, &objcg, size, flags);
3535         if (unlikely(!s))
3536                 return false;
3537         /*
3538          * Drain objects in the per cpu slab, while disabling local
3539          * IRQs, which protects against PREEMPT and interrupts
3540          * handlers invoking normal fastpath.
3541          */
3542         c = get_cpu_ptr(s->cpu_slab);
3543         local_irq_disable();
3544
3545         for (i = 0; i < size; i++) {
3546                 void *object = kfence_alloc(s, s->object_size, flags);
3547
3548                 if (unlikely(object)) {
3549                         p[i] = object;
3550                         continue;
3551                 }
3552
3553                 object = c->freelist;
3554                 if (unlikely(!object)) {
3555                         /*
3556                          * We may have removed an object from c->freelist using
3557                          * the fastpath in the previous iteration; in that case,
3558                          * c->tid has not been bumped yet.
3559                          * Since ___slab_alloc() may reenable interrupts while
3560                          * allocating memory, we should bump c->tid now.
3561                          */
3562                         c->tid = next_tid(c->tid);
3563
3564                         local_irq_enable();
3565
3566                         /*
3567                          * Invoking slow path likely have side-effect
3568                          * of re-populating per CPU c->freelist
3569                          */
3570                         p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
3571                                             _RET_IP_, c);
3572                         if (unlikely(!p[i]))
3573                                 goto error;
3574
3575                         c = this_cpu_ptr(s->cpu_slab);
3576                         maybe_wipe_obj_freeptr(s, p[i]);
3577
3578                         local_irq_disable();
3579
3580                         continue; /* goto for-loop */
3581                 }
3582                 c->freelist = get_freepointer(s, object);
3583                 p[i] = object;
3584                 maybe_wipe_obj_freeptr(s, p[i]);
3585         }
3586         c->tid = next_tid(c->tid);
3587         local_irq_enable();
3588         put_cpu_ptr(s->cpu_slab);
3589
3590         /*
3591          * memcg and kmem_cache debug support and memory initialization.
3592          * Done outside of the IRQ disabled fastpath loop.
3593          */
3594         slab_post_alloc_hook(s, objcg, flags, size, p,
3595                                 slab_want_init_on_alloc(flags, s));
3596         return i;
3597 error:
3598         put_cpu_ptr(s->cpu_slab);
3599         slab_post_alloc_hook(s, objcg, flags, i, p, false);
3600         __kmem_cache_free_bulk(s, i, p);
3601         return 0;
3602 }
3603 EXPORT_SYMBOL(kmem_cache_alloc_bulk);
3604
3605
3606 /*
3607  * Object placement in a slab is made very easy because we always start at
3608  * offset 0. If we tune the size of the object to the alignment then we can
3609  * get the required alignment by putting one properly sized object after
3610  * another.
3611  *
3612  * Notice that the allocation order determines the sizes of the per cpu
3613  * caches. Each processor has always one slab available for allocations.
3614  * Increasing the allocation order reduces the number of times that slabs
3615  * must be moved on and off the partial lists and is therefore a factor in
3616  * locking overhead.
3617  */
3618
3619 /*
3620  * Minimum / Maximum order of slab pages. This influences locking overhead
3621  * and slab fragmentation. A higher order reduces the number of partial slabs
3622  * and increases the number of allocations possible without having to
3623  * take the list_lock.
3624  */
3625 static unsigned int slub_min_order;
3626 static unsigned int slub_max_order = PAGE_ALLOC_COSTLY_ORDER;
3627 static unsigned int slub_min_objects;
3628
3629 /*
3630  * Calculate the order of allocation given an slab object size.
3631  *
3632  * The order of allocation has significant impact on performance and other
3633  * system components. Generally order 0 allocations should be preferred since
3634  * order 0 does not cause fragmentation in the page allocator. Larger objects
3635  * be problematic to put into order 0 slabs because there may be too much
3636  * unused space left. We go to a higher order if more than 1/16th of the slab
3637  * would be wasted.
3638  *
3639  * In order to reach satisfactory performance we must ensure that a minimum
3640  * number of objects is in one slab. Otherwise we may generate too much
3641  * activity on the partial lists which requires taking the list_lock. This is
3642  * less a concern for large slabs though which are rarely used.
3643  *
3644  * slub_max_order specifies the order where we begin to stop considering the
3645  * number of objects in a slab as critical. If we reach slub_max_order then
3646  * we try to keep the page order as low as possible. So we accept more waste
3647  * of space in favor of a small page order.
3648  *
3649  * Higher order allocations also allow the placement of more objects in a
3650  * slab and thereby reduce object handling overhead. If the user has
3651  * requested a higher minimum order then we start with that one instead of
3652  * the smallest order which will fit the object.
3653  */
3654 static inline unsigned int slab_order(unsigned int size,
3655                 unsigned int min_objects, unsigned int max_order,
3656                 unsigned int fract_leftover)
3657 {
3658         unsigned int min_order = slub_min_order;
3659         unsigned int order;
3660
3661         if (order_objects(min_order, size) > MAX_OBJS_PER_PAGE)
3662                 return get_order(size * MAX_OBJS_PER_PAGE) - 1;
3663
3664         for (order = max(min_order, (unsigned int)get_order(min_objects * size));
3665                         order <= max_order; order++) {
3666
3667                 unsigned int slab_size = (unsigned int)PAGE_SIZE << order;
3668                 unsigned int rem;
3669
3670                 rem = slab_size % size;
3671
3672                 if (rem <= slab_size / fract_leftover)
3673                         break;
3674         }
3675
3676         return order;
3677 }
3678
3679 static inline int calculate_order(unsigned int size)
3680 {
3681         unsigned int order;
3682         unsigned int min_objects;
3683         unsigned int max_objects;
3684         unsigned int nr_cpus;
3685
3686         /*
3687          * Attempt to find best configuration for a slab. This
3688          * works by first attempting to generate a layout with
3689          * the best configuration and backing off gradually.
3690          *
3691          * First we increase the acceptable waste in a slab. Then
3692          * we reduce the minimum objects required in a slab.
3693          */
3694         min_objects = slub_min_objects;
3695         if (!min_objects) {
3696                 /*
3697                  * Some architectures will only update present cpus when
3698                  * onlining them, so don't trust the number if it's just 1. But
3699                  * we also don't want to use nr_cpu_ids always, as on some other
3700                  * architectures, there can be many possible cpus, but never
3701                  * onlined. Here we compromise between trying to avoid too high
3702                  * order on systems that appear larger than they are, and too
3703                  * low order on systems that appear smaller than they are.
3704                  */
3705                 nr_cpus = num_present_cpus();
3706                 if (nr_cpus <= 1)
3707                         nr_cpus = nr_cpu_ids;
3708                 min_objects = 4 * (fls(nr_cpus) + 1);
3709         }
3710         max_objects = order_objects(slub_max_order, size);
3711         min_objects = min(min_objects, max_objects);
3712
3713         while (min_objects > 1) {
3714                 unsigned int fraction;
3715
3716                 fraction = 16;
3717                 while (fraction >= 4) {
3718                         order = slab_order(size, min_objects,
3719                                         slub_max_order, fraction);
3720                         if (order <= slub_max_order)
3721                                 return order;
3722                         fraction /= 2;
3723                 }
3724                 min_objects--;
3725         }
3726
3727         /*
3728          * We were unable to place multiple objects in a slab. Now
3729          * lets see if we can place a single object there.
3730          */
3731         order = slab_order(size, 1, slub_max_order, 1);
3732         if (order <= slub_max_order)
3733                 return order;
3734
3735         /*
3736          * Doh this slab cannot be placed using slub_max_order.
3737          */
3738         order = slab_order(size, 1, MAX_ORDER, 1);
3739         if (order < MAX_ORDER)
3740                 return order;
3741         return -ENOSYS;
3742 }
3743
3744 static void
3745 init_kmem_cache_node(struct kmem_cache_node *n)
3746 {
3747         n->nr_partial = 0;
3748         spin_lock_init(&n->list_lock);
3749         INIT_LIST_HEAD(&n->partial);
3750 #ifdef CONFIG_SLUB_DEBUG
3751         atomic_long_set(&n->nr_slabs, 0);
3752         atomic_long_set(&n->total_objects, 0);
3753         INIT_LIST_HEAD(&n->full);
3754 #endif
3755 }
3756
3757 static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
3758 {
3759         BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
3760                         KMALLOC_SHIFT_HIGH * sizeof(struct kmem_cache_cpu));
3761
3762         /*
3763          * Must align to double word boundary for the double cmpxchg
3764          * instructions to work; see __pcpu_double_call_return_bool().
3765          */
3766         s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu),
3767                                      2 * sizeof(void *));
3768
3769         if (!s->cpu_slab)
3770                 return 0;
3771
3772         init_kmem_cache_cpus(s);
3773
3774         return 1;
3775 }
3776
3777 static struct kmem_cache *kmem_cache_node;
3778
3779 /*
3780  * No kmalloc_node yet so do it by hand. We know that this is the first
3781  * slab on the node for this slabcache. There are no concurrent accesses
3782  * possible.
3783  *
3784  * Note that this function only works on the kmem_cache_node
3785  * when allocating for the kmem_cache_node. This is used for bootstrapping
3786  * memory on a fresh node that has no slab structures yet.
3787  */
3788 static void early_kmem_cache_node_alloc(int node)
3789 {
3790         struct page *page;
3791         struct kmem_cache_node *n;
3792
3793         BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
3794
3795         page = new_slab(kmem_cache_node, GFP_NOWAIT, node);
3796
3797         BUG_ON(!page);
3798         if (page_to_nid(page) != node) {
3799                 pr_err("SLUB: Unable to allocate memory from node %d\n", node);
3800                 pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n");
3801         }
3802
3803         n = page->freelist;
3804         BUG_ON(!n);
3805 #ifdef CONFIG_SLUB_DEBUG
3806         init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
3807         init_tracking(kmem_cache_node, n);
3808 #endif
3809         n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false);
3810         page->freelist = get_freepointer(kmem_cache_node, n);
3811         page->inuse = 1;
3812         page->frozen = 0;
3813         kmem_cache_node->node[node] = n;
3814         init_kmem_cache_node(n);
3815         inc_slabs_node(kmem_cache_node, node, page->objects);
3816
3817         /*
3818          * No locks need to be taken here as it has just been
3819          * initialized and there is no concurrent access.
3820          */
3821         __add_partial(n, page, DEACTIVATE_TO_HEAD);
3822 }
3823
3824 static void free_kmem_cache_nodes(struct kmem_cache *s)
3825 {
3826         int node;
3827         struct kmem_cache_node *n;
3828
3829         for_each_kmem_cache_node(s, node, n) {
3830                 s->node[node] = NULL;
3831                 kmem_cache_free(kmem_cache_node, n);
3832         }
3833 }
3834
3835 void __kmem_cache_release(struct kmem_cache *s)
3836 {
3837         cache_random_seq_destroy(s);
3838         free_percpu(s->cpu_slab);
3839         free_kmem_cache_nodes(s);
3840 }
3841
3842 static int init_kmem_cache_nodes(struct kmem_cache *s)
3843 {
3844         int node;
3845
3846         for_each_node_mask(node, slab_nodes) {
3847                 struct kmem_cache_node *n;
3848
3849                 if (slab_state == DOWN) {
3850                         early_kmem_cache_node_alloc(node);
3851                         continue;
3852                 }
3853                 n = kmem_cache_alloc_node(kmem_cache_node,
3854                                                 GFP_KERNEL, node);
3855
3856                 if (!n) {
3857                         free_kmem_cache_nodes(s);
3858                         return 0;
3859                 }
3860
3861                 init_kmem_cache_node(n);
3862                 s->node[node] = n;
3863         }
3864         return 1;
3865 }
3866
3867 static void set_min_partial(struct kmem_cache *s, unsigned long min)
3868 {
3869         if (min < MIN_PARTIAL)
3870                 min = MIN_PARTIAL;
3871         else if (min > MAX_PARTIAL)
3872                 min = MAX_PARTIAL;
3873         s->min_partial = min;
3874 }
3875
3876 static void set_cpu_partial(struct kmem_cache *s)
3877 {
3878 #ifdef CONFIG_SLUB_CPU_PARTIAL
3879         /*
3880          * cpu_partial determined the maximum number of objects kept in the
3881          * per cpu partial lists of a processor.
3882          *
3883          * Per cpu partial lists mainly contain slabs that just have one
3884          * object freed. If they are used for allocation then they can be
3885          * filled up again with minimal effort. The slab will never hit the
3886          * per node partial lists and therefore no locking will be required.
3887          *
3888          * This setting also determines
3889          *
3890          * A) The number of objects from per cpu partial slabs dumped to the
3891          *    per node list when we reach the limit.
3892          * B) The number of objects in cpu partial slabs to extract from the
3893          *    per node list when we run out of per cpu objects. We only fetch
3894          *    50% to keep some capacity around for frees.
3895          */
3896         if (!kmem_cache_has_cpu_partial(s))
3897                 slub_set_cpu_partial(s, 0);
3898         else if (s->size >= PAGE_SIZE)
3899                 slub_set_cpu_partial(s, 2);
3900         else if (s->size >= 1024)
3901                 slub_set_cpu_partial(s, 6);
3902         else if (s->size >= 256)
3903                 slub_set_cpu_partial(s, 13);
3904         else
3905                 slub_set_cpu_partial(s, 30);
3906 #endif
3907 }
3908
3909 /*
3910  * calculate_sizes() determines the order and the distribution of data within
3911  * a slab object.
3912  */
3913 static int calculate_sizes(struct kmem_cache *s, int forced_order)
3914 {
3915         slab_flags_t flags = s->flags;
3916         unsigned int size = s->object_size;
3917         unsigned int order;
3918
3919         /*
3920          * Round up object size to the next word boundary. We can only
3921          * place the free pointer at word boundaries and this determines
3922          * the possible location of the free pointer.
3923          */
3924         size = ALIGN(size, sizeof(void *));
3925
3926 #ifdef CONFIG_SLUB_DEBUG
3927         /*
3928          * Determine if we can poison the object itself. If the user of
3929          * the slab may touch the object after free or before allocation
3930          * then we should never poison the object itself.
3931          */
3932         if ((flags & SLAB_POISON) && !(flags & SLAB_TYPESAFE_BY_RCU) &&
3933                         !s->ctor)
3934                 s->flags |= __OBJECT_POISON;
3935         else
3936                 s->flags &= ~__OBJECT_POISON;
3937
3938
3939         /*
3940          * If we are Redzoning then check if there is some space between the
3941          * end of the object and the free pointer. If not then add an
3942          * additional word to have some bytes to store Redzone information.
3943          */
3944         if ((flags & SLAB_RED_ZONE) && size == s->object_size)
3945                 size += sizeof(void *);
3946 #endif
3947
3948         /*
3949          * With that we have determined the number of bytes in actual use
3950          * by the object and redzoning.
3951          */
3952         s->inuse = size;
3953
3954         if ((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) ||
3955             ((flags & SLAB_RED_ZONE) && s->object_size < sizeof(void *)) ||
3956             s->ctor) {
3957                 /*
3958                  * Relocate free pointer after the object if it is not
3959                  * permitted to overwrite the first word of the object on
3960                  * kmem_cache_free.
3961                  *
3962                  * This is the case if we do RCU, have a constructor or
3963                  * destructor, are poisoning the objects, or are
3964                  * redzoning an object smaller than sizeof(void *).
3965                  *
3966                  * The assumption that s->offset >= s->inuse means free
3967                  * pointer is outside of the object is used in the
3968                  * freeptr_outside_object() function. If that is no
3969                  * longer true, the function needs to be modified.
3970                  */
3971                 s->offset = size;
3972                 size += sizeof(void *);
3973         } else {
3974                 /*
3975                  * Store freelist pointer near middle of object to keep
3976                  * it away from the edges of the object to avoid small
3977                  * sized over/underflows from neighboring allocations.
3978                  */
3979                 s->offset = ALIGN_DOWN(s->object_size / 2, sizeof(void *));
3980         }
3981
3982 #ifdef CONFIG_SLUB_DEBUG
3983         if (flags & SLAB_STORE_USER)
3984                 /*
3985                  * Need to store information about allocs and frees after
3986                  * the object.
3987                  */
3988                 size += 2 * sizeof(struct track);
3989 #endif
3990
3991         kasan_cache_create(s, &size, &s->flags);
3992 #ifdef CONFIG_SLUB_DEBUG
3993         if (flags & SLAB_RED_ZONE) {
3994                 /*
3995                  * Add some empty padding so that we can catch
3996                  * overwrites from earlier objects rather than let
3997                  * tracking information or the free pointer be
3998                  * corrupted if a user writes before the start
3999                  * of the object.
4000                  */
4001                 size += sizeof(void *);
4002
4003                 s->red_left_pad = sizeof(void *);
4004                 s->red_left_pad = ALIGN(s->red_left_pad, s->align);
4005                 size += s->red_left_pad;
4006         }
4007 #endif
4008
4009         /*
4010          * SLUB stores one object immediately after another beginning from
4011          * offset 0. In order to align the objects we have to simply size
4012          * each object to conform to the alignment.
4013          */
4014         size = ALIGN(size, s->align);
4015         s->size = size;
4016         s->reciprocal_size = reciprocal_value(size);
4017         if (forced_order >= 0)
4018                 order = forced_order;
4019         else
4020                 order = calculate_order(size);
4021
4022         if ((int)order < 0)
4023                 return 0;
4024
4025         s->allocflags = 0;
4026         if (order)
4027                 s->allocflags |= __GFP_COMP;
4028
4029         if (s->flags & SLAB_CACHE_DMA)
4030                 s->allocflags |= GFP_DMA;
4031
4032         if (s->flags & SLAB_CACHE_DMA32)
4033                 s->allocflags |= GFP_DMA32;
4034
4035         if (s->flags & SLAB_RECLAIM_ACCOUNT)
4036                 s->allocflags |= __GFP_RECLAIMABLE;
4037
4038         /*
4039          * Determine the number of objects per slab
4040          */
4041         s->oo = oo_make(order, size);
4042         s->min = oo_make(get_order(size), size);
4043         if (oo_objects(s->oo) > oo_objects(s->max))
4044                 s->max = s->oo;
4045
4046         return !!oo_objects(s->oo);
4047 }
4048
4049 static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
4050 {
4051         s->flags = kmem_cache_flags(s->size, flags, s->name);
4052 #ifdef CONFIG_SLAB_FREELIST_HARDENED
4053         s->random = get_random_long();
4054 #endif
4055
4056         if (!calculate_sizes(s, -1))
4057                 goto error;
4058         if (disable_higher_order_debug) {
4059                 /*
4060                  * Disable debugging flags that store metadata if the min slab
4061                  * order increased.
4062                  */
4063                 if (get_order(s->size) > get_order(s->object_size)) {
4064                         s->flags &= ~DEBUG_METADATA_FLAGS;
4065                         s->offset = 0;
4066                         if (!calculate_sizes(s, -1))
4067                                 goto error;
4068                 }
4069         }
4070
4071 #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
4072     defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
4073         if (system_has_cmpxchg_double() && (s->flags & SLAB_NO_CMPXCHG) == 0)
4074                 /* Enable fast mode */
4075                 s->flags |= __CMPXCHG_DOUBLE;
4076 #endif
4077
4078         /*
4079          * The larger the object size is, the more pages we want on the partial
4080          * list to avoid pounding the page allocator excessively.
4081          */
4082         set_min_partial(s, ilog2(s->size) / 2);
4083
4084         set_cpu_partial(s);
4085
4086 #ifdef CONFIG_NUMA
4087         s->remote_node_defrag_ratio = 1000;
4088 #endif
4089
4090         /* Initialize the pre-computed randomized freelist if slab is up */
4091         if (slab_state >= UP) {
4092                 if (init_cache_random_seq(s))
4093                         goto error;
4094         }
4095
4096         if (!init_kmem_cache_nodes(s))
4097                 goto error;
4098
4099         if (alloc_kmem_cache_cpus(s))
4100                 return 0;
4101
4102         free_kmem_cache_nodes(s);
4103 error:
4104         return -EINVAL;
4105 }
4106
4107 static void list_slab_objects(struct kmem_cache *s, struct page *page,
4108                               const char *text)
4109 {
4110 #ifdef CONFIG_SLUB_DEBUG
4111         void *addr = page_address(page);
4112         unsigned long flags;
4113         unsigned long *map;
4114         void *p;
4115
4116         slab_err(s, page, text, s->name);
4117         slab_lock(page, &flags);
4118
4119         map = get_map(s, page);
4120         for_each_object(p, s, addr, page->objects) {
4121
4122                 if (!test_bit(__obj_to_index(s, addr, p), map)) {
4123                         pr_err("Object 0x%p @offset=%tu\n", p, p - addr);
4124                         print_tracking(s, p);
4125                 }
4126         }
4127         put_map(map);
4128         slab_unlock(page, &flags);
4129 #endif
4130 }
4131
4132 /*
4133  * Attempt to free all partial slabs on a node.
4134  * This is called from __kmem_cache_shutdown(). We must take list_lock
4135  * because sysfs file might still access partial list after the shutdowning.
4136  */
4137 static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
4138 {
4139         LIST_HEAD(discard);
4140         struct page *page, *h;
4141
4142         BUG_ON(irqs_disabled());
4143         spin_lock_irq(&n->list_lock);
4144         list_for_each_entry_safe(page, h, &n->partial, slab_list) {
4145                 if (!page->inuse) {
4146                         remove_partial(n, page);
4147                         list_add(&page->slab_list, &discard);
4148                 } else {
4149                         list_slab_objects(s, page,
4150                           "Objects remaining in %s on __kmem_cache_shutdown()");
4151                 }
4152         }
4153         spin_unlock_irq(&n->list_lock);
4154
4155         list_for_each_entry_safe(page, h, &discard, slab_list)
4156                 discard_slab(s, page);
4157 }
4158
4159 bool __kmem_cache_empty(struct kmem_cache *s)
4160 {
4161         int node;
4162         struct kmem_cache_node *n;
4163
4164         for_each_kmem_cache_node(s, node, n)
4165                 if (n->nr_partial || slabs_node(s, node))
4166                         return false;
4167         return true;
4168 }
4169
4170 /*
4171  * Release all resources used by a slab cache.
4172  */
4173 int __kmem_cache_shutdown(struct kmem_cache *s)
4174 {
4175         int node;
4176         struct kmem_cache_node *n;
4177
4178         flush_all_cpus_locked(s);
4179         /* Attempt to free all objects */
4180         for_each_kmem_cache_node(s, node, n) {
4181                 free_partial(s, n);
4182                 if (n->nr_partial || slabs_node(s, node))
4183                         return 1;
4184         }
4185         return 0;
4186 }
4187
4188 #ifdef CONFIG_PRINTK
4189 void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page)
4190 {
4191         void *base;
4192         int __maybe_unused i;
4193         unsigned int objnr;
4194         void *objp;
4195         void *objp0;
4196         struct kmem_cache *s = page->slab_cache;
4197         struct track __maybe_unused *trackp;
4198
4199         kpp->kp_ptr = object;
4200         kpp->kp_page = page;
4201         kpp->kp_slab_cache = s;
4202         base = page_address(page);
4203         objp0 = kasan_reset_tag(object);
4204 #ifdef CONFIG_SLUB_DEBUG
4205         objp = restore_red_left(s, objp0);
4206 #else
4207         objp = objp0;
4208 #endif
4209         objnr = obj_to_index(s, page, objp);
4210         kpp->kp_data_offset = (unsigned long)((char *)objp0 - (char *)objp);
4211         objp = base + s->size * objnr;
4212         kpp->kp_objp = objp;
4213         if (WARN_ON_ONCE(objp < base || objp >= base + page->objects * s->size || (objp - base) % s->size) ||
4214             !(s->flags & SLAB_STORE_USER))
4215                 return;
4216 #ifdef CONFIG_SLUB_DEBUG
4217         objp = fixup_red_left(s, objp);
4218         trackp = get_track(s, objp, TRACK_ALLOC);
4219         kpp->kp_ret = (void *)trackp->addr;
4220 #ifdef CONFIG_STACKTRACE
4221         for (i = 0; i < KS_ADDRS_COUNT && i < TRACK_ADDRS_COUNT; i++) {
4222                 kpp->kp_stack[i] = (void *)trackp->addrs[i];
4223                 if (!kpp->kp_stack[i])
4224                         break;
4225         }
4226
4227         trackp = get_track(s, objp, TRACK_FREE);
4228         for (i = 0; i < KS_ADDRS_COUNT && i < TRACK_ADDRS_COUNT; i++) {
4229                 kpp->kp_free_stack[i] = (void *)trackp->addrs[i];
4230                 if (!kpp->kp_free_stack[i])
4231                         break;
4232         }
4233 #endif
4234 #endif
4235 }
4236 #endif
4237
4238 /********************************************************************
4239  *              Kmalloc subsystem
4240  *******************************************************************/
4241
4242 static int __init setup_slub_min_order(char *str)
4243 {
4244         get_option(&str, (int *)&slub_min_order);
4245
4246         return 1;
4247 }
4248
4249 __setup("slub_min_order=", setup_slub_min_order);
4250
4251 static int __init setup_slub_max_order(char *str)
4252 {
4253         get_option(&str, (int *)&slub_max_order);
4254         slub_max_order = min(slub_max_order, (unsigned int)MAX_ORDER - 1);
4255
4256         return 1;
4257 }
4258
4259 __setup("slub_max_order=", setup_slub_max_order);
4260
4261 static int __init setup_slub_min_objects(char *str)
4262 {
4263         get_option(&str, (int *)&slub_min_objects);
4264
4265         return 1;
4266 }
4267
4268 __setup("slub_min_objects=", setup_slub_min_objects);
4269
4270 void *__kmalloc(size_t size, gfp_t flags)
4271 {
4272         struct kmem_cache *s;
4273         void *ret;
4274
4275         if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
4276                 return kmalloc_large(size, flags);
4277
4278         s = kmalloc_slab(size, flags);
4279
4280         if (unlikely(ZERO_OR_NULL_PTR(s)))
4281                 return s;
4282
4283         ret = slab_alloc(s, flags, _RET_IP_, size);
4284
4285         trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
4286
4287         ret = kasan_kmalloc(s, ret, size, flags);
4288
4289         return ret;
4290 }
4291 EXPORT_SYMBOL(__kmalloc);
4292
4293 #ifdef CONFIG_NUMA
4294 static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
4295 {
4296         struct page *page;
4297         void *ptr = NULL;
4298         unsigned int order = get_order(size);
4299
4300         flags |= __GFP_COMP;
4301         page = alloc_pages_node(node, flags, order);
4302         if (page) {
4303                 ptr = page_address(page);
4304                 mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B,
4305                                       PAGE_SIZE << order);
4306         }
4307
4308         return kmalloc_large_node_hook(ptr, size, flags);
4309 }
4310
4311 void *__kmalloc_node(size_t size, gfp_t flags, int node)
4312 {
4313         struct kmem_cache *s;
4314         void *ret;
4315
4316         if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
4317                 ret = kmalloc_large_node(size, flags, node);
4318
4319                 trace_kmalloc_node(_RET_IP_, ret,
4320                                    size, PAGE_SIZE << get_order(size),
4321                                    flags, node);
4322
4323                 return ret;
4324         }
4325
4326         s = kmalloc_slab(size, flags);
4327
4328         if (unlikely(ZERO_OR_NULL_PTR(s)))
4329                 return s;
4330
4331         ret = slab_alloc_node(s, flags, node, _RET_IP_, size);
4332
4333         trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
4334
4335         ret = kasan_kmalloc(s, ret, size, flags);
4336
4337         return ret;
4338 }
4339 EXPORT_SYMBOL(__kmalloc_node);
4340 #endif  /* CONFIG_NUMA */
4341
4342 #ifdef CONFIG_HARDENED_USERCOPY
4343 /*
4344  * Rejects incorrectly sized objects and objects that are to be copied
4345  * to/from userspace but do not fall entirely within the containing slab
4346  * cache's usercopy region.
4347  *
4348  * Returns NULL if check passes, otherwise const char * to name of cache
4349  * to indicate an error.
4350  */
4351 void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
4352                          bool to_user)
4353 {
4354         struct kmem_cache *s;
4355         unsigned int offset;
4356         size_t object_size;
4357         bool is_kfence = is_kfence_address(ptr);
4358
4359         ptr = kasan_reset_tag(ptr);
4360
4361         /* Find object and usable object size. */
4362         s = page->slab_cache;
4363
4364         /* Reject impossible pointers. */
4365         if (ptr < page_address(page))
4366                 usercopy_abort("SLUB object not in SLUB page?!", NULL,
4367                                to_user, 0, n);
4368
4369         /* Find offset within object. */
4370         if (is_kfence)
4371                 offset = ptr - kfence_object_start(ptr);
4372         else
4373                 offset = (ptr - page_address(page)) % s->size;
4374
4375         /* Adjust for redzone and reject if within the redzone. */
4376         if (!is_kfence && kmem_cache_debug_flags(s, SLAB_RED_ZONE)) {
4377                 if (offset < s->red_left_pad)
4378                         usercopy_abort("SLUB object in left red zone",
4379                                        s->name, to_user, offset, n);
4380                 offset -= s->red_left_pad;
4381         }
4382
4383         /* Allow address range falling entirely within usercopy region. */
4384         if (offset >= s->useroffset &&
4385             offset - s->useroffset <= s->usersize &&
4386             n <= s->useroffset - offset + s->usersize)
4387                 return;
4388
4389         /*
4390          * If the copy is still within the allocated object, produce
4391          * a warning instead of rejecting the copy. This is intended
4392          * to be a temporary method to find any missing usercopy
4393          * whitelists.
4394          */
4395         object_size = slab_ksize(s);
4396         if (usercopy_fallback &&
4397             offset <= object_size && n <= object_size - offset) {
4398                 usercopy_warn("SLUB object", s->name, to_user, offset, n);
4399                 return;
4400         }
4401
4402         usercopy_abort("SLUB object", s->name, to_user, offset, n);
4403 }
4404 #endif /* CONFIG_HARDENED_USERCOPY */
4405
4406 size_t __ksize(const void *object)
4407 {
4408         struct page *page;
4409
4410         if (unlikely(object == ZERO_SIZE_PTR))
4411                 return 0;
4412
4413         page = virt_to_head_page(object);
4414
4415         if (unlikely(!PageSlab(page))) {
4416                 WARN_ON(!PageCompound(page));
4417                 return page_size(page);
4418         }
4419
4420         return slab_ksize(page->slab_cache);
4421 }
4422 EXPORT_SYMBOL(__ksize);
4423
4424 void kfree(const void *x)
4425 {
4426         struct page *page;
4427         void *object = (void *)x;
4428
4429         trace_kfree(_RET_IP_, x);
4430
4431         if (unlikely(ZERO_OR_NULL_PTR(x)))
4432                 return;
4433
4434         page = virt_to_head_page(x);
4435         if (unlikely(!PageSlab(page))) {
4436                 free_nonslab_page(page, object);
4437                 return;
4438         }
4439         slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_);
4440 }
4441 EXPORT_SYMBOL(kfree);
4442
4443 #define SHRINK_PROMOTE_MAX 32
4444
4445 /*
4446  * kmem_cache_shrink discards empty slabs and promotes the slabs filled
4447  * up most to the head of the partial lists. New allocations will then
4448  * fill those up and thus they can be removed from the partial lists.
4449  *
4450  * The slabs with the least items are placed last. This results in them
4451  * being allocated from last increasing the chance that the last objects
4452  * are freed in them.
4453  */
4454 static int __kmem_cache_do_shrink(struct kmem_cache *s)
4455 {
4456         int node;
4457         int i;
4458         struct kmem_cache_node *n;
4459         struct page *page;
4460         struct page *t;
4461         struct list_head discard;
4462         struct list_head promote[SHRINK_PROMOTE_MAX];
4463         unsigned long flags;
4464         int ret = 0;
4465
4466         for_each_kmem_cache_node(s, node, n) {
4467                 INIT_LIST_HEAD(&discard);
4468                 for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
4469                         INIT_LIST_HEAD(promote + i);
4470
4471                 spin_lock_irqsave(&n->list_lock, flags);
4472
4473                 /*
4474                  * Build lists of slabs to discard or promote.
4475                  *
4476                  * Note that concurrent frees may occur while we hold the
4477                  * list_lock. page->inuse here is the upper limit.
4478                  */
4479                 list_for_each_entry_safe(page, t, &n->partial, slab_list) {
4480                         int free = page->objects - page->inuse;
4481
4482                         /* Do not reread page->inuse */
4483                         barrier();
4484
4485                         /* We do not keep full slabs on the list */
4486                         BUG_ON(free <= 0);
4487
4488                         if (free == page->objects) {
4489                                 list_move(&page->slab_list, &discard);
4490                                 n->nr_partial--;
4491                         } else if (free <= SHRINK_PROMOTE_MAX)
4492                                 list_move(&page->slab_list, promote + free - 1);
4493                 }
4494
4495                 /*
4496                  * Promote the slabs filled up most to the head of the
4497                  * partial list.
4498                  */
4499                 for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
4500                         list_splice(promote + i, &n->partial);
4501
4502                 spin_unlock_irqrestore(&n->list_lock, flags);
4503
4504                 /* Release empty slabs */
4505                 list_for_each_entry_safe(page, t, &discard, slab_list)
4506                         discard_slab(s, page);
4507
4508                 if (slabs_node(s, node))
4509                         ret = 1;
4510         }
4511
4512         return ret;
4513 }
4514
4515 int __kmem_cache_shrink(struct kmem_cache *s)
4516 {
4517         flush_all(s);
4518         return __kmem_cache_do_shrink(s);
4519 }
4520
4521 static int slab_mem_going_offline_callback(void *arg)
4522 {
4523         struct kmem_cache *s;
4524
4525         mutex_lock(&slab_mutex);
4526         list_for_each_entry(s, &slab_caches, list) {
4527                 flush_all_cpus_locked(s);
4528                 __kmem_cache_do_shrink(s);
4529         }
4530         mutex_unlock(&slab_mutex);
4531
4532         return 0;
4533 }
4534
4535 static void slab_mem_offline_callback(void *arg)
4536 {
4537         struct memory_notify *marg = arg;
4538         int offline_node;
4539
4540         offline_node = marg->status_change_nid_normal;
4541
4542         /*
4543          * If the node still has available memory. we need kmem_cache_node
4544          * for it yet.
4545          */
4546         if (offline_node < 0)
4547                 return;
4548
4549         mutex_lock(&slab_mutex);
4550         node_clear(offline_node, slab_nodes);
4551         /*
4552          * We no longer free kmem_cache_node structures here, as it would be
4553          * racy with all get_node() users, and infeasible to protect them with
4554          * slab_mutex.
4555          */
4556         mutex_unlock(&slab_mutex);
4557 }
4558
4559 static int slab_mem_going_online_callback(void *arg)
4560 {
4561         struct kmem_cache_node *n;
4562         struct kmem_cache *s;
4563         struct memory_notify *marg = arg;
4564         int nid = marg->status_change_nid_normal;
4565         int ret = 0;
4566
4567         /*
4568          * If the node's memory is already available, then kmem_cache_node is
4569          * already created. Nothing to do.
4570          */
4571         if (nid < 0)
4572                 return 0;
4573
4574         /*
4575          * We are bringing a node online. No memory is available yet. We must
4576          * allocate a kmem_cache_node structure in order to bring the node
4577          * online.
4578          */
4579         mutex_lock(&slab_mutex);
4580         list_for_each_entry(s, &slab_caches, list) {
4581                 /*
4582                  * The structure may already exist if the node was previously
4583                  * onlined and offlined.
4584                  */
4585                 if (get_node(s, nid))
4586                         continue;
4587                 /*
4588                  * XXX: kmem_cache_alloc_node will fallback to other nodes
4589                  *      since memory is not yet available from the node that
4590                  *      is brought up.
4591                  */
4592                 n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL);
4593                 if (!n) {
4594                         ret = -ENOMEM;
4595                         goto out;
4596                 }
4597                 init_kmem_cache_node(n);
4598                 s->node[nid] = n;
4599         }
4600         /*
4601          * Any cache created after this point will also have kmem_cache_node
4602          * initialized for the new node.
4603          */
4604         node_set(nid, slab_nodes);
4605 out:
4606         mutex_unlock(&slab_mutex);
4607         return ret;
4608 }
4609
4610 static int slab_memory_callback(struct notifier_block *self,
4611                                 unsigned long action, void *arg)
4612 {
4613         int ret = 0;
4614
4615         switch (action) {
4616         case MEM_GOING_ONLINE:
4617                 ret = slab_mem_going_online_callback(arg);
4618                 break;
4619         case MEM_GOING_OFFLINE:
4620                 ret = slab_mem_going_offline_callback(arg);
4621                 break;
4622         case MEM_OFFLINE:
4623         case MEM_CANCEL_ONLINE:
4624                 slab_mem_offline_callback(arg);
4625                 break;
4626         case MEM_ONLINE:
4627         case MEM_CANCEL_OFFLINE:
4628                 break;
4629         }
4630         if (ret)
4631                 ret = notifier_from_errno(ret);
4632         else
4633                 ret = NOTIFY_OK;
4634         return ret;
4635 }
4636
4637 static struct notifier_block slab_memory_callback_nb = {
4638         .notifier_call = slab_memory_callback,
4639         .priority = SLAB_CALLBACK_PRI,
4640 };
4641
4642 /********************************************************************
4643  *                      Basic setup of slabs
4644  *******************************************************************/
4645
4646 /*
4647  * Used for early kmem_cache structures that were allocated using
4648  * the page allocator. Allocate them properly then fix up the pointers
4649  * that may be pointing to the wrong kmem_cache structure.
4650  */
4651
4652 static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
4653 {
4654         int node;
4655         struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
4656         struct kmem_cache_node *n;
4657
4658         memcpy(s, static_cache, kmem_cache->object_size);
4659
4660         /*
4661          * This runs very early, and only the boot processor is supposed to be
4662          * up.  Even if it weren't true, IRQs are not up so we couldn't fire
4663          * IPIs around.
4664          */
4665         __flush_cpu_slab(s, smp_processor_id());
4666         for_each_kmem_cache_node(s, node, n) {
4667                 struct page *p;
4668
4669                 list_for_each_entry(p, &n->partial, slab_list)
4670                         p->slab_cache = s;
4671
4672 #ifdef CONFIG_SLUB_DEBUG
4673                 list_for_each_entry(p, &n->full, slab_list)
4674                         p->slab_cache = s;
4675 #endif
4676         }
4677         list_add(&s->list, &slab_caches);
4678         return s;
4679 }
4680
4681 void __init kmem_cache_init(void)
4682 {
4683         static __initdata struct kmem_cache boot_kmem_cache,
4684                 boot_kmem_cache_node;
4685         int node;
4686
4687         if (debug_guardpage_minorder())
4688                 slub_max_order = 0;
4689
4690         /* Print slub debugging pointers without hashing */
4691         if (__slub_debug_enabled())
4692                 no_hash_pointers_enable(NULL);
4693
4694         kmem_cache_node = &boot_kmem_cache_node;
4695         kmem_cache = &boot_kmem_cache;
4696
4697         /*
4698          * Initialize the nodemask for which we will allocate per node
4699          * structures. Here we don't need taking slab_mutex yet.
4700          */
4701         for_each_node_state(node, N_NORMAL_MEMORY)
4702                 node_set(node, slab_nodes);
4703
4704         create_boot_cache(kmem_cache_node, "kmem_cache_node",
4705                 sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN, 0, 0);
4706
4707         register_hotmemory_notifier(&slab_memory_callback_nb);
4708
4709         /* Able to allocate the per node structures */
4710         slab_state = PARTIAL;
4711
4712         create_boot_cache(kmem_cache, "kmem_cache",
4713                         offsetof(struct kmem_cache, node) +
4714                                 nr_node_ids * sizeof(struct kmem_cache_node *),
4715                        SLAB_HWCACHE_ALIGN, 0, 0);
4716
4717         kmem_cache = bootstrap(&boot_kmem_cache);
4718         kmem_cache_node = bootstrap(&boot_kmem_cache_node);
4719
4720         /* Now we can use the kmem_cache to allocate kmalloc slabs */
4721         setup_kmalloc_cache_index_table();
4722         create_kmalloc_caches(0);
4723
4724         /* Setup random freelists for each cache */
4725         init_freelist_randomization();
4726
4727         cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL,
4728                                   slub_cpu_dead);
4729
4730         pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%u\n",
4731                 cache_line_size(),
4732                 slub_min_order, slub_max_order, slub_min_objects,
4733                 nr_cpu_ids, nr_node_ids);
4734 }
4735
4736 void __init kmem_cache_init_late(void)
4737 {
4738 }
4739
4740 struct kmem_cache *
4741 __kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
4742                    slab_flags_t flags, void (*ctor)(void *))
4743 {
4744         struct kmem_cache *s;
4745
4746         s = find_mergeable(size, align, flags, name, ctor);
4747         if (s) {
4748                 s->refcount++;
4749
4750                 /*
4751                  * Adjust the object sizes so that we clear
4752                  * the complete object on kzalloc.
4753                  */
4754                 s->object_size = max(s->object_size, size);
4755                 s->inuse = max(s->inuse, ALIGN(size, sizeof(void *)));
4756
4757                 if (sysfs_slab_alias(s, name)) {
4758                         s->refcount--;
4759                         s = NULL;
4760                 }
4761         }
4762
4763         return s;
4764 }
4765
4766 int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags)
4767 {
4768         int err;
4769
4770         err = kmem_cache_open(s, flags);
4771         if (err)
4772                 return err;
4773
4774         /* Mutex is not taken during early boot */
4775         if (slab_state <= UP)
4776                 return 0;
4777
4778         err = sysfs_slab_add(s);
4779         if (err)
4780                 __kmem_cache_release(s);
4781
4782         if (s->flags & SLAB_STORE_USER)
4783                 debugfs_slab_add(s);
4784
4785         return err;
4786 }
4787
4788 void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
4789 {
4790         struct kmem_cache *s;
4791         void *ret;
4792
4793         if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
4794                 return kmalloc_large(size, gfpflags);
4795
4796         s = kmalloc_slab(size, gfpflags);
4797
4798         if (unlikely(ZERO_OR_NULL_PTR(s)))
4799                 return s;
4800
4801         ret = slab_alloc(s, gfpflags, caller, size);
4802
4803         /* Honor the call site pointer we received. */
4804         trace_kmalloc(caller, ret, size, s->size, gfpflags);
4805
4806         return ret;
4807 }
4808 EXPORT_SYMBOL(__kmalloc_track_caller);
4809
4810 #ifdef CONFIG_NUMA
4811 void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
4812                                         int node, unsigned long caller)
4813 {
4814         struct kmem_cache *s;
4815         void *ret;
4816
4817         if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
4818                 ret = kmalloc_large_node(size, gfpflags, node);
4819
4820                 trace_kmalloc_node(caller, ret,
4821                                    size, PAGE_SIZE << get_order(size),
4822                                    gfpflags, node);
4823
4824                 return ret;
4825         }
4826
4827         s = kmalloc_slab(size, gfpflags);
4828
4829         if (unlikely(ZERO_OR_NULL_PTR(s)))
4830                 return s;
4831
4832         ret = slab_alloc_node(s, gfpflags, node, caller, size);
4833
4834         /* Honor the call site pointer we received. */
4835         trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node);
4836
4837         return ret;
4838 }
4839 EXPORT_SYMBOL(__kmalloc_node_track_caller);
4840 #endif
4841
4842 #ifdef CONFIG_SYSFS
4843 static int count_inuse(struct page *page)
4844 {
4845         return page->inuse;
4846 }
4847
4848 static int count_total(struct page *page)
4849 {
4850         return page->objects;
4851 }
4852 #endif
4853
4854 #ifdef CONFIG_SLUB_DEBUG
4855 static void validate_slab(struct kmem_cache *s, struct page *page,
4856                           unsigned long *obj_map)
4857 {
4858         void *p;
4859         void *addr = page_address(page);
4860         unsigned long flags;
4861
4862         slab_lock(page, &flags);
4863
4864         if (!check_slab(s, page) || !on_freelist(s, page, NULL))
4865                 goto unlock;
4866
4867         /* Now we know that a valid freelist exists */
4868         __fill_map(obj_map, s, page);
4869         for_each_object(p, s, addr, page->objects) {
4870                 u8 val = test_bit(__obj_to_index(s, addr, p), obj_map) ?
4871                          SLUB_RED_INACTIVE : SLUB_RED_ACTIVE;
4872
4873                 if (!check_object(s, page, p, val))
4874                         break;
4875         }
4876 unlock:
4877         slab_unlock(page, &flags);
4878 }
4879
4880 static int validate_slab_node(struct kmem_cache *s,
4881                 struct kmem_cache_node *n, unsigned long *obj_map)
4882 {
4883         unsigned long count = 0;
4884         struct page *page;
4885         unsigned long flags;
4886
4887         spin_lock_irqsave(&n->list_lock, flags);
4888
4889         list_for_each_entry(page, &n->partial, slab_list) {
4890                 validate_slab(s, page, obj_map);
4891                 count++;
4892         }
4893         if (count != n->nr_partial) {
4894                 pr_err("SLUB %s: %ld partial slabs counted but counter=%ld\n",
4895                        s->name, count, n->nr_partial);
4896                 slab_add_kunit_errors();
4897         }
4898
4899         if (!(s->flags & SLAB_STORE_USER))
4900                 goto out;
4901
4902         list_for_each_entry(page, &n->full, slab_list) {
4903                 validate_slab(s, page, obj_map);
4904                 count++;
4905         }
4906         if (count != atomic_long_read(&n->nr_slabs)) {
4907                 pr_err("SLUB: %s %ld slabs counted but counter=%ld\n",
4908                        s->name, count, atomic_long_read(&n->nr_slabs));
4909                 slab_add_kunit_errors();
4910         }
4911
4912 out:
4913         spin_unlock_irqrestore(&n->list_lock, flags);
4914         return count;
4915 }
4916
4917 long validate_slab_cache(struct kmem_cache *s)
4918 {
4919         int node;
4920         unsigned long count = 0;
4921         struct kmem_cache_node *n;
4922         unsigned long *obj_map;
4923
4924         obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL);
4925         if (!obj_map)
4926                 return -ENOMEM;
4927
4928         flush_all(s);
4929         for_each_kmem_cache_node(s, node, n)
4930                 count += validate_slab_node(s, n, obj_map);
4931
4932         bitmap_free(obj_map);
4933
4934         return count;
4935 }
4936 EXPORT_SYMBOL(validate_slab_cache);
4937
4938 #ifdef CONFIG_DEBUG_FS
4939 /*
4940  * Generate lists of code addresses where slabcache objects are allocated
4941  * and freed.
4942  */
4943
4944 struct location {
4945         unsigned long count;
4946         unsigned long addr;
4947         long long sum_time;
4948         long min_time;
4949         long max_time;
4950         long min_pid;
4951         long max_pid;
4952         DECLARE_BITMAP(cpus, NR_CPUS);
4953         nodemask_t nodes;
4954 };
4955
4956 struct loc_track {
4957         unsigned long max;
4958         unsigned long count;
4959         struct location *loc;
4960 };
4961
4962 static struct dentry *slab_debugfs_root;
4963
4964 static void free_loc_track(struct loc_track *t)
4965 {
4966         if (t->max)
4967                 free_pages((unsigned long)t->loc,
4968                         get_order(sizeof(struct location) * t->max));
4969 }
4970
4971 static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags)
4972 {
4973         struct location *l;
4974         int order;
4975
4976         order = get_order(sizeof(struct location) * max);
4977
4978         l = (void *)__get_free_pages(flags, order);
4979         if (!l)
4980                 return 0;
4981
4982         if (t->count) {
4983                 memcpy(l, t->loc, sizeof(struct location) * t->count);
4984                 free_loc_track(t);
4985         }
4986         t->max = max;
4987         t->loc = l;
4988         return 1;
4989 }
4990
4991 static int add_location(struct loc_track *t, struct kmem_cache *s,
4992                                 const struct track *track)
4993 {
4994         long start, end, pos;
4995         struct location *l;
4996         unsigned long caddr;
4997         unsigned long age = jiffies - track->when;
4998
4999         start = -1;
5000         end = t->count;
5001
5002         for ( ; ; ) {
5003                 pos = start + (end - start + 1) / 2;
5004
5005                 /*
5006                  * There is nothing at "end". If we end up there
5007                  * we need to add something to before end.
5008                  */
5009                 if (pos == end)
5010                         break;
5011
5012                 caddr = t->loc[pos].addr;
5013                 if (track->addr == caddr) {
5014
5015                         l = &t->loc[pos];
5016                         l->count++;
5017                         if (track->when) {
5018                                 l->sum_time += age;
5019                                 if (age < l->min_time)
5020                                         l->min_time = age;
5021                                 if (age > l->max_time)
5022                                         l->max_time = age;
5023
5024                                 if (track->pid < l->min_pid)
5025                                         l->min_pid = track->pid;
5026                                 if (track->pid > l->max_pid)
5027                                         l->max_pid = track->pid;
5028
5029                                 cpumask_set_cpu(track->cpu,
5030                                                 to_cpumask(l->cpus));
5031                         }
5032                         node_set(page_to_nid(virt_to_page(track)), l->nodes);
5033                         return 1;
5034                 }
5035
5036                 if (track->addr < caddr)
5037                         end = pos;
5038                 else
5039                         start = pos;
5040         }
5041
5042         /*
5043          * Not found. Insert new tracking element.
5044          */
5045         if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC))
5046                 return 0;
5047
5048         l = t->loc + pos;
5049         if (pos < t->count)
5050                 memmove(l + 1, l,
5051                         (t->count - pos) * sizeof(struct location));
5052         t->count++;
5053         l->count = 1;
5054         l->addr = track->addr;
5055         l->sum_time = age;
5056         l->min_time = age;
5057         l->max_time = age;
5058         l->min_pid = track->pid;
5059         l->max_pid = track->pid;
5060         cpumask_clear(to_cpumask(l->cpus));
5061         cpumask_set_cpu(track->cpu, to_cpumask(l->cpus));
5062         nodes_clear(l->nodes);
5063         node_set(page_to_nid(virt_to_page(track)), l->nodes);
5064         return 1;
5065 }
5066
5067 static void process_slab(struct loc_track *t, struct kmem_cache *s,
5068                 struct page *page, enum track_item alloc,
5069                 unsigned long *obj_map)
5070 {
5071         void *addr = page_address(page);
5072         void *p;
5073
5074         __fill_map(obj_map, s, page);
5075
5076         for_each_object(p, s, addr, page->objects)
5077                 if (!test_bit(__obj_to_index(s, addr, p), obj_map))
5078                         add_location(t, s, get_track(s, p, alloc));
5079 }
5080 #endif  /* CONFIG_DEBUG_FS   */
5081 #endif  /* CONFIG_SLUB_DEBUG */
5082
5083 #ifdef CONFIG_SYSFS
5084 enum slab_stat_type {
5085         SL_ALL,                 /* All slabs */
5086         SL_PARTIAL,             /* Only partially allocated slabs */
5087         SL_CPU,                 /* Only slabs used for cpu caches */
5088         SL_OBJECTS,             /* Determine allocated objects not slabs */
5089         SL_TOTAL                /* Determine object capacity not slabs */
5090 };
5091
5092 #define SO_ALL          (1 << SL_ALL)
5093 #define SO_PARTIAL      (1 << SL_PARTIAL)
5094 #define SO_CPU          (1 << SL_CPU)
5095 #define SO_OBJECTS      (1 << SL_OBJECTS)
5096 #define SO_TOTAL        (1 << SL_TOTAL)
5097
5098 static ssize_t show_slab_objects(struct kmem_cache *s,
5099                                  char *buf, unsigned long flags)
5100 {
5101         unsigned long total = 0;
5102         int node;
5103         int x;
5104         unsigned long *nodes;
5105         int len = 0;
5106
5107         nodes = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL);
5108         if (!nodes)
5109                 return -ENOMEM;
5110
5111         if (flags & SO_CPU) {
5112                 int cpu;
5113
5114                 for_each_possible_cpu(cpu) {
5115                         struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab,
5116                                                                cpu);
5117                         int node;
5118                         struct page *page;
5119
5120                         page = READ_ONCE(c->page);
5121                         if (!page)
5122                                 continue;
5123
5124                         node = page_to_nid(page);
5125                         if (flags & SO_TOTAL)
5126                                 x = page->objects;
5127                         else if (flags & SO_OBJECTS)
5128                                 x = page->inuse;
5129                         else
5130                                 x = 1;
5131
5132                         total += x;
5133                         nodes[node] += x;
5134
5135                         page = slub_percpu_partial_read_once(c);
5136                         if (page) {
5137                                 node = page_to_nid(page);
5138                                 if (flags & SO_TOTAL)
5139                                         WARN_ON_ONCE(1);
5140                                 else if (flags & SO_OBJECTS)
5141                                         WARN_ON_ONCE(1);
5142                                 else
5143                                         x = page->pages;
5144                                 total += x;
5145                                 nodes[node] += x;
5146                         }
5147                 }
5148         }
5149
5150         /*
5151          * It is impossible to take "mem_hotplug_lock" here with "kernfs_mutex"
5152          * already held which will conflict with an existing lock order:
5153          *
5154          * mem_hotplug_lock->slab_mutex->kernfs_mutex
5155          *
5156          * We don't really need mem_hotplug_lock (to hold off
5157          * slab_mem_going_offline_callback) here because slab's memory hot
5158          * unplug code doesn't destroy the kmem_cache->node[] data.
5159          */
5160
5161 #ifdef CONFIG_SLUB_DEBUG
5162         if (flags & SO_ALL) {
5163                 struct kmem_cache_node *n;
5164
5165                 for_each_kmem_cache_node(s, node, n) {
5166
5167                         if (flags & SO_TOTAL)
5168                                 x = atomic_long_read(&n->total_objects);
5169                         else if (flags & SO_OBJECTS)
5170                                 x = atomic_long_read(&n->total_objects) -
5171                                         count_partial(n, count_free);
5172                         else
5173                                 x = atomic_long_read(&n->nr_slabs);
5174                         total += x;
5175                         nodes[node] += x;
5176                 }
5177
5178         } else
5179 #endif
5180         if (flags & SO_PARTIAL) {
5181                 struct kmem_cache_node *n;
5182
5183                 for_each_kmem_cache_node(s, node, n) {
5184                         if (flags & SO_TOTAL)
5185                                 x = count_partial(n, count_total);
5186                         else if (flags & SO_OBJECTS)
5187                                 x = count_partial(n, count_inuse);
5188                         else
5189                                 x = n->nr_partial;
5190                         total += x;
5191                         nodes[node] += x;
5192                 }
5193         }
5194
5195         len += sysfs_emit_at(buf, len, "%lu", total);
5196 #ifdef CONFIG_NUMA
5197         for (node = 0; node < nr_node_ids; node++) {
5198                 if (nodes[node])
5199                         len += sysfs_emit_at(buf, len, " N%d=%lu",
5200                                              node, nodes[node]);
5201         }
5202 #endif
5203         len += sysfs_emit_at(buf, len, "\n");
5204         kfree(nodes);
5205
5206         return len;
5207 }
5208
5209 #define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
5210 #define to_slab(n) container_of(n, struct kmem_cache, kobj)
5211
5212 struct slab_attribute {
5213         struct attribute attr;
5214         ssize_t (*show)(struct kmem_cache *s, char *buf);
5215         ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count);
5216 };
5217
5218 #define SLAB_ATTR_RO(_name) \
5219         static struct slab_attribute _name##_attr = \
5220         __ATTR(_name, 0400, _name##_show, NULL)
5221
5222 #define SLAB_ATTR(_name) \
5223         static struct slab_attribute _name##_attr =  \
5224         __ATTR(_name, 0600, _name##_show, _name##_store)
5225
5226 static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
5227 {
5228         return sysfs_emit(buf, "%u\n", s->size);
5229 }
5230 SLAB_ATTR_RO(slab_size);
5231
5232 static ssize_t align_show(struct kmem_cache *s, char *buf)
5233 {
5234         return sysfs_emit(buf, "%u\n", s->align);
5235 }
5236 SLAB_ATTR_RO(align);
5237
5238 static ssize_t object_size_show(struct kmem_cache *s, char *buf)
5239 {
5240         return sysfs_emit(buf, "%u\n", s->object_size);
5241 }
5242 SLAB_ATTR_RO(object_size);
5243
5244 static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
5245 {
5246         return sysfs_emit(buf, "%u\n", oo_objects(s->oo));
5247 }
5248 SLAB_ATTR_RO(objs_per_slab);
5249
5250 static ssize_t order_show(struct kmem_cache *s, char *buf)
5251 {
5252         return sysfs_emit(buf, "%u\n", oo_order(s->oo));
5253 }
5254 SLAB_ATTR_RO(order);
5255
5256 static ssize_t min_partial_show(struct kmem_cache *s, char *buf)
5257 {
5258         return sysfs_emit(buf, "%lu\n", s->min_partial);
5259 }
5260
5261 static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
5262                                  size_t length)
5263 {
5264         unsigned long min;
5265         int err;
5266
5267         err = kstrtoul(buf, 10, &min);
5268         if (err)
5269                 return err;
5270
5271         set_min_partial(s, min);
5272         return length;
5273 }
5274 SLAB_ATTR(min_partial);
5275
5276 static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf)
5277 {
5278         return sysfs_emit(buf, "%u\n", slub_cpu_partial(s));
5279 }
5280
5281 static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
5282                                  size_t length)
5283 {
5284         unsigned int objects;
5285         int err;
5286
5287         err = kstrtouint(buf, 10, &objects);
5288         if (err)
5289                 return err;
5290         if (objects && !kmem_cache_has_cpu_partial(s))
5291                 return -EINVAL;
5292
5293         slub_set_cpu_partial(s, objects);
5294         flush_all(s);
5295         return length;
5296 }
5297 SLAB_ATTR(cpu_partial);
5298
5299 static ssize_t ctor_show(struct kmem_cache *s, char *buf)
5300 {
5301         if (!s->ctor)
5302                 return 0;
5303         return sysfs_emit(buf, "%pS\n", s->ctor);
5304 }
5305 SLAB_ATTR_RO(ctor);
5306
5307 static ssize_t aliases_show(struct kmem_cache *s, char *buf)
5308 {
5309         return sysfs_emit(buf, "%d\n", s->refcount < 0 ? 0 : s->refcount - 1);
5310 }
5311 SLAB_ATTR_RO(aliases);
5312
5313 static ssize_t partial_show(struct kmem_cache *s, char *buf)
5314 {
5315         return show_slab_objects(s, buf, SO_PARTIAL);
5316 }
5317 SLAB_ATTR_RO(partial);
5318
5319 static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf)
5320 {
5321         return show_slab_objects(s, buf, SO_CPU);
5322 }
5323 SLAB_ATTR_RO(cpu_slabs);
5324
5325 static ssize_t objects_show(struct kmem_cache *s, char *buf)
5326 {
5327         return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS);
5328 }
5329 SLAB_ATTR_RO(objects);
5330
5331 static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
5332 {
5333         return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS);
5334 }
5335 SLAB_ATTR_RO(objects_partial);
5336
5337 static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
5338 {
5339         int objects = 0;
5340         int pages = 0;
5341         int cpu;
5342         int len = 0;
5343
5344         for_each_online_cpu(cpu) {
5345                 struct page *page;
5346
5347                 page = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
5348
5349                 if (page) {
5350                         pages += page->pages;
5351                         objects += page->pobjects;
5352                 }
5353         }
5354
5355         len += sysfs_emit_at(buf, len, "%d(%d)", objects, pages);
5356
5357 #ifdef CONFIG_SMP
5358         for_each_online_cpu(cpu) {
5359                 struct page *page;
5360
5361                 page = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
5362                 if (page)
5363                         len += sysfs_emit_at(buf, len, " C%d=%d(%d)",
5364                                              cpu, page->pobjects, page->pages);
5365         }
5366 #endif
5367         len += sysfs_emit_at(buf, len, "\n");
5368
5369         return len;
5370 }
5371 SLAB_ATTR_RO(slabs_cpu_partial);
5372
5373 static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
5374 {
5375         return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
5376 }
5377 SLAB_ATTR_RO(reclaim_account);
5378
5379 static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
5380 {
5381         return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
5382 }
5383 SLAB_ATTR_RO(hwcache_align);
5384
5385 #ifdef CONFIG_ZONE_DMA
5386 static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
5387 {
5388         return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
5389 }
5390 SLAB_ATTR_RO(cache_dma);
5391 #endif
5392
5393 static ssize_t usersize_show(struct kmem_cache *s, char *buf)
5394 {
5395         return sysfs_emit(buf, "%u\n", s->usersize);
5396 }
5397 SLAB_ATTR_RO(usersize);
5398
5399 static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
5400 {
5401         return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_TYPESAFE_BY_RCU));
5402 }
5403 SLAB_ATTR_RO(destroy_by_rcu);
5404
5405 #ifdef CONFIG_SLUB_DEBUG
5406 static ssize_t slabs_show(struct kmem_cache *s, char *buf)
5407 {
5408         return show_slab_objects(s, buf, SO_ALL);
5409 }
5410 SLAB_ATTR_RO(slabs);
5411
5412 static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
5413 {
5414         return show_slab_objects(s, buf, SO_ALL|SO_TOTAL);
5415 }
5416 SLAB_ATTR_RO(total_objects);
5417
5418 static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
5419 {
5420         return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS));
5421 }
5422 SLAB_ATTR_RO(sanity_checks);
5423
5424 static ssize_t trace_show(struct kmem_cache *s, char *buf)
5425 {
5426         return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_TRACE));
5427 }
5428 SLAB_ATTR_RO(trace);
5429
5430 static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
5431 {
5432         return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
5433 }
5434
5435 SLAB_ATTR_RO(red_zone);
5436
5437 static ssize_t poison_show(struct kmem_cache *s, char *buf)
5438 {
5439         return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_POISON));
5440 }
5441
5442 SLAB_ATTR_RO(poison);
5443
5444 static ssize_t store_user_show(struct kmem_cache *s, char *buf)
5445 {
5446         return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
5447 }
5448
5449 SLAB_ATTR_RO(store_user);
5450
5451 static ssize_t validate_show(struct kmem_cache *s, char *buf)
5452 {
5453         return 0;
5454 }
5455
5456 static ssize_t validate_store(struct kmem_cache *s,
5457                         const char *buf, size_t length)
5458 {
5459         int ret = -EINVAL;
5460
5461         if (buf[0] == '1') {
5462                 ret = validate_slab_cache(s);
5463                 if (ret >= 0)
5464                         ret = length;
5465         }
5466         return ret;
5467 }
5468 SLAB_ATTR(validate);
5469
5470 #endif /* CONFIG_SLUB_DEBUG */
5471
5472 #ifdef CONFIG_FAILSLAB
5473 static ssize_t failslab_show(struct kmem_cache *s, char *buf)
5474 {
5475         return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
5476 }
5477 SLAB_ATTR_RO(failslab);
5478 #endif
5479
5480 static ssize_t shrink_show(struct kmem_cache *s, char *buf)
5481 {
5482         return 0;
5483 }
5484
5485 static ssize_t shrink_store(struct kmem_cache *s,
5486                         const char *buf, size_t length)
5487 {
5488         if (buf[0] == '1')
5489                 kmem_cache_shrink(s);
5490         else
5491                 return -EINVAL;
5492         return length;
5493 }
5494 SLAB_ATTR(shrink);
5495
5496 #ifdef CONFIG_NUMA
5497 static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
5498 {
5499         return sysfs_emit(buf, "%u\n", s->remote_node_defrag_ratio / 10);
5500 }
5501
5502 static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
5503                                 const char *buf, size_t length)
5504 {
5505         unsigned int ratio;
5506         int err;
5507
5508         err = kstrtouint(buf, 10, &ratio);
5509         if (err)
5510                 return err;
5511         if (ratio > 100)
5512                 return -ERANGE;
5513
5514         s->remote_node_defrag_ratio = ratio * 10;
5515
5516         return length;
5517 }
5518 SLAB_ATTR(remote_node_defrag_ratio);
5519 #endif
5520
5521 #ifdef CONFIG_SLUB_STATS
5522 static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
5523 {
5524         unsigned long sum  = 0;
5525         int cpu;
5526         int len = 0;
5527         int *data = kmalloc_array(nr_cpu_ids, sizeof(int), GFP_KERNEL);
5528
5529         if (!data)
5530                 return -ENOMEM;
5531
5532         for_each_online_cpu(cpu) {
5533                 unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si];
5534
5535                 data[cpu] = x;
5536                 sum += x;
5537         }
5538
5539         len += sysfs_emit_at(buf, len, "%lu", sum);
5540
5541 #ifdef CONFIG_SMP
5542         for_each_online_cpu(cpu) {
5543                 if (data[cpu])
5544                         len += sysfs_emit_at(buf, len, " C%d=%u",
5545                                              cpu, data[cpu]);
5546         }
5547 #endif
5548         kfree(data);
5549         len += sysfs_emit_at(buf, len, "\n");
5550
5551         return len;
5552 }
5553
5554 static void clear_stat(struct kmem_cache *s, enum stat_item si)
5555 {
5556         int cpu;
5557
5558         for_each_online_cpu(cpu)
5559                 per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0;
5560 }
5561
5562 #define STAT_ATTR(si, text)                                     \
5563 static ssize_t text##_show(struct kmem_cache *s, char *buf)     \
5564 {                                                               \
5565         return show_stat(s, buf, si);                           \
5566 }                                                               \
5567 static ssize_t text##_store(struct kmem_cache *s,               \
5568                                 const char *buf, size_t length) \
5569 {                                                               \
5570         if (buf[0] != '0')                                      \
5571                 return -EINVAL;                                 \
5572         clear_stat(s, si);                                      \
5573         return length;                                          \
5574 }                                                               \
5575 SLAB_ATTR(text);                                                \
5576
5577 STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
5578 STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
5579 STAT_ATTR(FREE_FASTPATH, free_fastpath);
5580 STAT_ATTR(FREE_SLOWPATH, free_slowpath);
5581 STAT_ATTR(FREE_FROZEN, free_frozen);
5582 STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial);
5583 STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial);
5584 STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial);
5585 STAT_ATTR(ALLOC_SLAB, alloc_slab);
5586 STAT_ATTR(ALLOC_REFILL, alloc_refill);
5587 STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch);
5588 STAT_ATTR(FREE_SLAB, free_slab);
5589 STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush);
5590 STAT_ATTR(DEACTIVATE_FULL, deactivate_full);
5591 STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty);
5592 STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head);
5593 STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail);
5594 STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees);
5595 STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass);
5596 STAT_ATTR(ORDER_FALLBACK, order_fallback);
5597 STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail);
5598 STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail);
5599 STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc);
5600 STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);
5601 STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node);
5602 STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain);
5603 #endif  /* CONFIG_SLUB_STATS */
5604
5605 static struct attribute *slab_attrs[] = {
5606         &slab_size_attr.attr,
5607         &object_size_attr.attr,
5608         &objs_per_slab_attr.attr,
5609         &order_attr.attr,
5610         &min_partial_attr.attr,
5611         &cpu_partial_attr.attr,
5612         &objects_attr.attr,
5613         &objects_partial_attr.attr,
5614         &partial_attr.attr,
5615         &cpu_slabs_attr.attr,
5616         &ctor_attr.attr,
5617         &aliases_attr.attr,
5618         &align_attr.attr,
5619         &hwcache_align_attr.attr,
5620         &reclaim_account_attr.attr,
5621         &destroy_by_rcu_attr.attr,
5622         &shrink_attr.attr,
5623         &slabs_cpu_partial_attr.attr,
5624 #ifdef CONFIG_SLUB_DEBUG
5625         &total_objects_attr.attr,
5626         &slabs_attr.attr,
5627         &sanity_checks_attr.attr,
5628         &trace_attr.attr,
5629         &red_zone_attr.attr,
5630         &poison_attr.attr,
5631         &store_user_attr.attr,
5632         &validate_attr.attr,
5633 #endif
5634 #ifdef CONFIG_ZONE_DMA
5635         &cache_dma_attr.attr,
5636 #endif
5637 #ifdef CONFIG_NUMA
5638         &remote_node_defrag_ratio_attr.attr,
5639 #endif
5640 #ifdef CONFIG_SLUB_STATS
5641         &alloc_fastpath_attr.attr,
5642         &alloc_slowpath_attr.attr,
5643         &free_fastpath_attr.attr,
5644         &free_slowpath_attr.attr,
5645         &free_frozen_attr.attr,
5646         &free_add_partial_attr.attr,
5647         &free_remove_partial_attr.attr,
5648         &alloc_from_partial_attr.attr,
5649         &alloc_slab_attr.attr,
5650         &alloc_refill_attr.attr,
5651         &alloc_node_mismatch_attr.attr,
5652         &free_slab_attr.attr,
5653         &cpuslab_flush_attr.attr,
5654         &deactivate_full_attr.attr,
5655         &deactivate_empty_attr.attr,
5656         &deactivate_to_head_attr.attr,
5657         &deactivate_to_tail_attr.attr,
5658         &deactivate_remote_frees_attr.attr,
5659         &deactivate_bypass_attr.attr,
5660         &order_fallback_attr.attr,
5661         &cmpxchg_double_fail_attr.attr,
5662         &cmpxchg_double_cpu_fail_attr.attr,
5663         &cpu_partial_alloc_attr.attr,
5664         &cpu_partial_free_attr.attr,
5665         &cpu_partial_node_attr.attr,
5666         &cpu_partial_drain_attr.attr,
5667 #endif
5668 #ifdef CONFIG_FAILSLAB
5669         &failslab_attr.attr,
5670 #endif
5671         &usersize_attr.attr,
5672
5673         NULL
5674 };
5675
5676 static const struct attribute_group slab_attr_group = {
5677         .attrs = slab_attrs,
5678 };
5679
5680 static ssize_t slab_attr_show(struct kobject *kobj,
5681                                 struct attribute *attr,
5682                                 char *buf)
5683 {
5684         struct slab_attribute *attribute;
5685         struct kmem_cache *s;
5686         int err;
5687
5688         attribute = to_slab_attr(attr);
5689         s = to_slab(kobj);
5690
5691         if (!attribute->show)
5692                 return -EIO;
5693
5694         err = attribute->show(s, buf);
5695
5696         return err;
5697 }
5698
5699 static ssize_t slab_attr_store(struct kobject *kobj,
5700                                 struct attribute *attr,
5701                                 const char *buf, size_t len)
5702 {
5703         struct slab_attribute *attribute;
5704         struct kmem_cache *s;
5705         int err;
5706
5707         attribute = to_slab_attr(attr);
5708         s = to_slab(kobj);
5709
5710         if (!attribute->store)
5711                 return -EIO;
5712
5713         err = attribute->store(s, buf, len);
5714         return err;
5715 }
5716
5717 static void kmem_cache_release(struct kobject *k)
5718 {
5719         slab_kmem_cache_release(to_slab(k));
5720 }
5721
5722 static const struct sysfs_ops slab_sysfs_ops = {
5723         .show = slab_attr_show,
5724         .store = slab_attr_store,
5725 };
5726
5727 static struct kobj_type slab_ktype = {
5728         .sysfs_ops = &slab_sysfs_ops,
5729         .release = kmem_cache_release,
5730 };
5731
5732 static struct kset *slab_kset;
5733
5734 static inline struct kset *cache_kset(struct kmem_cache *s)
5735 {
5736         return slab_kset;
5737 }
5738
5739 #define ID_STR_LENGTH 64
5740
5741 /* Create a unique string id for a slab cache:
5742  *
5743  * Format       :[flags-]size
5744  */
5745 static char *create_unique_id(struct kmem_cache *s)
5746 {
5747         char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
5748         char *p = name;
5749
5750         BUG_ON(!name);
5751
5752         *p++ = ':';
5753         /*
5754          * First flags affecting slabcache operations. We will only
5755          * get here for aliasable slabs so we do not need to support
5756          * too many flags. The flags here must cover all flags that
5757          * are matched during merging to guarantee that the id is
5758          * unique.
5759          */
5760         if (s->flags & SLAB_CACHE_DMA)
5761                 *p++ = 'd';
5762         if (s->flags & SLAB_CACHE_DMA32)
5763                 *p++ = 'D';
5764         if (s->flags & SLAB_RECLAIM_ACCOUNT)
5765                 *p++ = 'a';
5766         if (s->flags & SLAB_CONSISTENCY_CHECKS)
5767                 *p++ = 'F';
5768         if (s->flags & SLAB_ACCOUNT)
5769                 *p++ = 'A';
5770         if (p != name + 1)
5771                 *p++ = '-';
5772         p += sprintf(p, "%07u", s->size);
5773
5774         BUG_ON(p > name + ID_STR_LENGTH - 1);
5775         return name;
5776 }
5777
5778 static int sysfs_slab_add(struct kmem_cache *s)
5779 {
5780         int err;
5781         const char *name;
5782         struct kset *kset = cache_kset(s);
5783         int unmergeable = slab_unmergeable(s);
5784
5785         if (!kset) {
5786                 kobject_init(&s->kobj, &slab_ktype);
5787                 return 0;
5788         }
5789
5790         if (!unmergeable && disable_higher_order_debug &&
5791                         (slub_debug & DEBUG_METADATA_FLAGS))
5792                 unmergeable = 1;
5793
5794         if (unmergeable) {
5795                 /*
5796                  * Slabcache can never be merged so we can use the name proper.
5797                  * This is typically the case for debug situations. In that
5798                  * case we can catch duplicate names easily.
5799                  */
5800                 sysfs_remove_link(&slab_kset->kobj, s->name);
5801                 name = s->name;
5802         } else {
5803                 /*
5804                  * Create a unique name for the slab as a target
5805                  * for the symlinks.
5806                  */
5807                 name = create_unique_id(s);
5808         }
5809
5810         s->kobj.kset = kset;
5811         err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name);
5812         if (err)
5813                 goto out;
5814
5815         err = sysfs_create_group(&s->kobj, &slab_attr_group);
5816         if (err)
5817                 goto out_del_kobj;
5818
5819         if (!unmergeable) {
5820                 /* Setup first alias */
5821                 sysfs_slab_alias(s, s->name);
5822         }
5823 out:
5824         if (!unmergeable)
5825                 kfree(name);
5826         return err;
5827 out_del_kobj:
5828         kobject_del(&s->kobj);
5829         goto out;
5830 }
5831
5832 void sysfs_slab_unlink(struct kmem_cache *s)
5833 {
5834         if (slab_state >= FULL)
5835                 kobject_del(&s->kobj);
5836 }
5837
5838 void sysfs_slab_release(struct kmem_cache *s)
5839 {
5840         if (slab_state >= FULL)
5841                 kobject_put(&s->kobj);
5842 }
5843
5844 /*
5845  * Need to buffer aliases during bootup until sysfs becomes
5846  * available lest we lose that information.
5847  */
5848 struct saved_alias {
5849         struct kmem_cache *s;
5850         const char *name;
5851         struct saved_alias *next;
5852 };
5853
5854 static struct saved_alias *alias_list;
5855
5856 static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
5857 {
5858         struct saved_alias *al;
5859
5860         if (slab_state == FULL) {
5861                 /*
5862                  * If we have a leftover link then remove it.
5863                  */
5864                 sysfs_remove_link(&slab_kset->kobj, name);
5865                 return sysfs_create_link(&slab_kset->kobj, &s->kobj, name);
5866         }
5867
5868         al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL);
5869         if (!al)
5870                 return -ENOMEM;
5871
5872         al->s = s;
5873         al->name = name;
5874         al->next = alias_list;
5875         alias_list = al;
5876         return 0;
5877 }
5878
5879 static int __init slab_sysfs_init(void)
5880 {
5881         struct kmem_cache *s;
5882         int err;
5883
5884         mutex_lock(&slab_mutex);
5885
5886         slab_kset = kset_create_and_add("slab", NULL, kernel_kobj);
5887         if (!slab_kset) {
5888                 mutex_unlock(&slab_mutex);
5889                 pr_err("Cannot register slab subsystem.\n");
5890                 return -ENOSYS;
5891         }
5892
5893         slab_state = FULL;
5894
5895         list_for_each_entry(s, &slab_caches, list) {
5896                 err = sysfs_slab_add(s);
5897                 if (err)
5898                         pr_err("SLUB: Unable to add boot slab %s to sysfs\n",
5899                                s->name);
5900         }
5901
5902         while (alias_list) {
5903                 struct saved_alias *al = alias_list;
5904
5905                 alias_list = alias_list->next;
5906                 err = sysfs_slab_alias(al->s, al->name);
5907                 if (err)
5908                         pr_err("SLUB: Unable to add boot slab alias %s to sysfs\n",
5909                                al->name);
5910                 kfree(al);
5911         }
5912
5913         mutex_unlock(&slab_mutex);
5914         return 0;
5915 }
5916
5917 __initcall(slab_sysfs_init);
5918 #endif /* CONFIG_SYSFS */
5919
5920 #if defined(CONFIG_SLUB_DEBUG) && defined(CONFIG_DEBUG_FS)
5921 static int slab_debugfs_show(struct seq_file *seq, void *v)
5922 {
5923
5924         struct location *l;
5925         unsigned int idx = *(unsigned int *)v;
5926         struct loc_track *t = seq->private;
5927
5928         if (idx < t->count) {
5929                 l = &t->loc[idx];
5930
5931                 seq_printf(seq, "%7ld ", l->count);
5932
5933                 if (l->addr)
5934                         seq_printf(seq, "%pS", (void *)l->addr);
5935                 else
5936                         seq_puts(seq, "<not-available>");
5937
5938                 if (l->sum_time != l->min_time) {
5939                         seq_printf(seq, " age=%ld/%llu/%ld",
5940                                 l->min_time, div_u64(l->sum_time, l->count),
5941                                 l->max_time);
5942                 } else
5943                         seq_printf(seq, " age=%ld", l->min_time);
5944
5945                 if (l->min_pid != l->max_pid)
5946                         seq_printf(seq, " pid=%ld-%ld", l->min_pid, l->max_pid);
5947                 else
5948                         seq_printf(seq, " pid=%ld",
5949                                 l->min_pid);
5950
5951                 if (num_online_cpus() > 1 && !cpumask_empty(to_cpumask(l->cpus)))
5952                         seq_printf(seq, " cpus=%*pbl",
5953                                  cpumask_pr_args(to_cpumask(l->cpus)));
5954
5955                 if (nr_online_nodes > 1 && !nodes_empty(l->nodes))
5956                         seq_printf(seq, " nodes=%*pbl",
5957                                  nodemask_pr_args(&l->nodes));
5958
5959                 seq_puts(seq, "\n");
5960         }
5961
5962         if (!idx && !t->count)
5963                 seq_puts(seq, "No data\n");
5964
5965         return 0;
5966 }
5967
5968 static void slab_debugfs_stop(struct seq_file *seq, void *v)
5969 {
5970 }
5971
5972 static void *slab_debugfs_next(struct seq_file *seq, void *v, loff_t *ppos)
5973 {
5974         struct loc_track *t = seq->private;
5975
5976         v = ppos;
5977         ++*ppos;
5978         if (*ppos <= t->count)
5979                 return v;
5980
5981         return NULL;
5982 }
5983
5984 static void *slab_debugfs_start(struct seq_file *seq, loff_t *ppos)
5985 {
5986         return ppos;
5987 }
5988
5989 static const struct seq_operations slab_debugfs_sops = {
5990         .start  = slab_debugfs_start,
5991         .next   = slab_debugfs_next,
5992         .stop   = slab_debugfs_stop,
5993         .show   = slab_debugfs_show,
5994 };
5995
5996 static int slab_debug_trace_open(struct inode *inode, struct file *filep)
5997 {
5998
5999         struct kmem_cache_node *n;
6000         enum track_item alloc;
6001         int node;
6002         struct loc_track *t = __seq_open_private(filep, &slab_debugfs_sops,
6003                                                 sizeof(struct loc_track));
6004         struct kmem_cache *s = file_inode(filep)->i_private;
6005         unsigned long *obj_map;
6006
6007         obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL);
6008         if (!obj_map)
6009                 return -ENOMEM;
6010
6011         if (strcmp(filep->f_path.dentry->d_name.name, "alloc_traces") == 0)
6012                 alloc = TRACK_ALLOC;
6013         else
6014                 alloc = TRACK_FREE;
6015
6016         if (!alloc_loc_track(t, PAGE_SIZE / sizeof(struct location), GFP_KERNEL)) {
6017                 bitmap_free(obj_map);
6018                 return -ENOMEM;
6019         }
6020
6021         for_each_kmem_cache_node(s, node, n) {
6022                 unsigned long flags;
6023                 struct page *page;
6024
6025                 if (!atomic_long_read(&n->nr_slabs))
6026                         continue;
6027
6028                 spin_lock_irqsave(&n->list_lock, flags);
6029                 list_for_each_entry(page, &n->partial, slab_list)
6030                         process_slab(t, s, page, alloc, obj_map);
6031                 list_for_each_entry(page, &n->full, slab_list)
6032                         process_slab(t, s, page, alloc, obj_map);
6033                 spin_unlock_irqrestore(&n->list_lock, flags);
6034         }
6035
6036         bitmap_free(obj_map);
6037         return 0;
6038 }
6039
6040 static int slab_debug_trace_release(struct inode *inode, struct file *file)
6041 {
6042         struct seq_file *seq = file->private_data;
6043         struct loc_track *t = seq->private;
6044
6045         free_loc_track(t);
6046         return seq_release_private(inode, file);
6047 }
6048
6049 static const struct file_operations slab_debugfs_fops = {
6050         .open    = slab_debug_trace_open,
6051         .read    = seq_read,
6052         .llseek  = seq_lseek,
6053         .release = slab_debug_trace_release,
6054 };
6055
6056 static void debugfs_slab_add(struct kmem_cache *s)
6057 {
6058         struct dentry *slab_cache_dir;
6059
6060         if (unlikely(!slab_debugfs_root))
6061                 return;
6062
6063         slab_cache_dir = debugfs_create_dir(s->name, slab_debugfs_root);
6064
6065         debugfs_create_file("alloc_traces", 0400,
6066                 slab_cache_dir, s, &slab_debugfs_fops);
6067
6068         debugfs_create_file("free_traces", 0400,
6069                 slab_cache_dir, s, &slab_debugfs_fops);
6070 }
6071
6072 void debugfs_slab_release(struct kmem_cache *s)
6073 {
6074         debugfs_remove_recursive(debugfs_lookup(s->name, slab_debugfs_root));
6075 }
6076
6077 static int __init slab_debugfs_init(void)
6078 {
6079         struct kmem_cache *s;
6080
6081         slab_debugfs_root = debugfs_create_dir("slab", NULL);
6082
6083         list_for_each_entry(s, &slab_caches, list)
6084                 if (s->flags & SLAB_STORE_USER)
6085                         debugfs_slab_add(s);
6086
6087         return 0;
6088
6089 }
6090 __initcall(slab_debugfs_init);
6091 #endif
6092 /*
6093  * The /proc/slabinfo ABI
6094  */
6095 #ifdef CONFIG_SLUB_DEBUG
6096 void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
6097 {
6098         unsigned long nr_slabs = 0;
6099         unsigned long nr_objs = 0;
6100         unsigned long nr_free = 0;
6101         int node;
6102         struct kmem_cache_node *n;
6103
6104         for_each_kmem_cache_node(s, node, n) {
6105                 nr_slabs += node_nr_slabs(n);
6106                 nr_objs += node_nr_objs(n);
6107                 nr_free += count_partial(n, count_free);
6108         }
6109
6110         sinfo->active_objs = nr_objs - nr_free;
6111         sinfo->num_objs = nr_objs;
6112         sinfo->active_slabs = nr_slabs;
6113         sinfo->num_slabs = nr_slabs;
6114         sinfo->objects_per_slab = oo_objects(s->oo);
6115         sinfo->cache_order = oo_order(s->oo);
6116 }
6117
6118 void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s)
6119 {
6120 }
6121
6122 ssize_t slabinfo_write(struct file *file, const char __user *buffer,
6123                        size_t count, loff_t *ppos)
6124 {
6125         return -EIO;
6126 }
6127 #endif /* CONFIG_SLUB_DEBUG */