mm/mmu_gather.c

   1 #include <linux/gfp.h>
   2 #include <linux/highmem.h>
   3 #include <linux/kernel.h>
   4 #include <linux/kmsan-checks.h>
   5 #include <linux/mmdebug.h>
   6 #include <linux/mm_types.h>
   7 #include <linux/mm_inline.h>
   8 #include <linux/pagemap.h>
   9 #include <linux/rcupdate.h>
  10 #include <linux/smp.h>
  11 #include <linux/swap.h>
  12
  13 #include <asm/pgalloc.h>
  14 #include <asm/tlb.h>
  15
  16 #ifndef CONFIG_MMU_GATHER_NO_GATHER
  17
  18 static bool tlb_next_batch(struct mmu_gather *tlb)
  19 {
  20         struct mmu_gather_batch *batch;
  21
  22         batch = tlb->active;
  23         if (batch->next) {
  24                 tlb->active = batch->next;
  25                 return true;
  26         }
  27
  28         if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
  29                 return false;
  30
  31         batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
  32         if (!batch)
  33                 return false;
  34
  35         tlb->batch_count++;
  36         batch->next = NULL;
  37         batch->nr   = 0;
  38         batch->max  = MAX_GATHER_BATCH;
  39
  40         tlb->active->next = batch;
  41         tlb->active = batch;
  42
  43         return true;
  44 }
  45
  46 static void tlb_batch_pages_flush(struct mmu_gather *tlb)
  47 {
  48         struct mmu_gather_batch *batch;
  49
  50         for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
  51                 struct page **pages = batch->pages;
  52
  53                 do {
  54                         /*
  55                          * limit free batch count when PAGE_SIZE > 4K
  56                          */
  57                         unsigned int nr = min(512U, batch->nr);
  58
  59                         free_pages_and_swap_cache(pages, nr);
  60                         pages += nr;
  61                         batch->nr -= nr;
  62
  63                         cond_resched();
  64                 } while (batch->nr);
  65         }
  66         tlb->active = &tlb->local;
  67 }
  68
  69 static void tlb_batch_list_free(struct mmu_gather *tlb)
  70 {
  71         struct mmu_gather_batch *batch, *next;
  72
  73         for (batch = tlb->local.next; batch; batch = next) {
  74                 next = batch->next;
  75                 free_pages((unsigned long)batch, 0);
  76         }
  77         tlb->local.next = NULL;
  78 }
  79
  80 bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size)
  81 {
  82         struct mmu_gather_batch *batch;
  83
  84         VM_BUG_ON(!tlb->end);
  85
  86 #ifdef CONFIG_MMU_GATHER_PAGE_SIZE
  87         VM_WARN_ON(tlb->page_size != page_size);
  88 #endif
  89
  90         batch = tlb->active;
  91         /*
  92          * Add the page and check if we are full. If so
  93          * force a flush.
  94          */
  95         batch->pages[batch->nr++] = page;
  96         if (batch->nr == batch->max) {
  97                 if (!tlb_next_batch(tlb))
  98                         return true;
  99                 batch = tlb->active;
 100         }
 101         VM_BUG_ON_PAGE(batch->nr > batch->max, page);
 102
 103         return false;
 104 }
 105
 106 #endif /* MMU_GATHER_NO_GATHER */
 107
 108 #ifdef CONFIG_MMU_GATHER_TABLE_FREE
 109
 110 static void __tlb_remove_table_free(struct mmu_table_batch *batch)
 111 {
 112         int i;
 113
 114         for (i = 0; i < batch->nr; i++)
 115                 __tlb_remove_table(batch->tables[i]);
 116
 117         free_page((unsigned long)batch);
 118 }
 119
 120 #ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE
 121
 122 /*
 123  * Semi RCU freeing of the page directories.
 124  *
 125  * This is needed by some architectures to implement software pagetable walkers.
 126  *
 * gup_fast() and other software pagetable walkers do a lockless page-table
 * walk and therefore need some synchronization with the freeing of the page
 129  * directories. The chosen means to accomplish that is by disabling IRQs over
 130  * the walk.
 131  *
 132  * Architectures that use IPIs to flush TLBs will then automagically DTRT,
 133  * since we unlink the page, flush TLBs, free the page. Since the disabling of
 134  * IRQs delays the completion of the TLB flush we can never observe an already
 135  * freed page.
 136  *
 137  * Architectures that do not have this (PPC) need to delay the freeing by some
 138  * other means, this is that means.
 139  *
 140  * What we do is batch the freed directory pages (tables) and RCU free them.
 141  * We use the sched RCU variant, as that guarantees that IRQ/preempt disabling
 142  * holds off grace periods.
 143  *
 144  * However, in order to batch these pages we need to allocate storage, this
 145  * allocation is deep inside the MM code and can thus easily fail on memory
 146  * pressure. To guarantee progress we fall back to single table freeing, see
 147  * the implementation of tlb_remove_table_one().
 148  *
 149  */
 150
/*
 * Callback handed to smp_call_function() by tlb_remove_table_sync_one().
 * The body is intentionally empty and @arg is unused: delivering the
 * interrupt is all that is wanted.
 */
static void tlb_remove_table_smp_sync(void *arg)
{
        /* Simply deliver the interrupt */
}
 155
/*
 * Run the empty tlb_remove_table_smp_sync() callback through
 * smp_call_function(), passing 1 as the last (wait) argument.
 */
static void tlb_remove_table_sync_one(void)
{
        /*
         * This isn't an RCU grace period and hence the page-tables cannot be
         * assumed to be actually RCU-freed.
         *
         * It is however sufficient for software page-table walkers that rely on
         * IRQ disabling.
         */
        smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
}
 167
 168 static void tlb_remove_table_rcu(struct rcu_head *head)
 169 {
 170         __tlb_remove_table_free(container_of(head, struct mmu_table_batch, rcu));
 171 }
 172
/*
 * CONFIG_MMU_GATHER_RCU_TABLE_FREE variant: do not free @batch right away,
 * queue it with call_rcu() so that tlb_remove_table_rcu() releases it later.
 */
static void tlb_remove_table_free(struct mmu_table_batch *batch)
{
        call_rcu(&batch->rcu, tlb_remove_table_rcu);
}
 177
 178 #else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */
 179
/* Without CONFIG_MMU_GATHER_RCU_TABLE_FREE this is a no-op. */
static void tlb_remove_table_sync_one(void) { }
 181
/*
 * Variant used without CONFIG_MMU_GATHER_RCU_TABLE_FREE: release the tables
 * in @batch, and the batch page itself, immediately.
 */
static void tlb_remove_table_free(struct mmu_table_batch *batch)
{
        __tlb_remove_table_free(batch);
}
 186
 187 #endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */
 188
 189 /*
 190  * If we want tlb_remove_table() to imply TLB invalidates.
 191  */
 192 static inline void tlb_table_invalidate(struct mmu_gather *tlb)
 193 {
 194         if (tlb_needs_table_invalidate()) {
 195                 /*
 196                  * Invalidate page-table caches used by hardware walkers. Then
 197                  * we still need to RCU-sched wait while freeing the pages
 198                  * because software walkers can still be in-flight.
 199                  */
 200                 tlb_flush_mmu_tlbonly(tlb);
 201         }
 202 }
 203
/*
 * Free a single page-table page without batching. This is the fallback used
 * by tlb_remove_table() when no batch page could be allocated.
 */
static void tlb_remove_table_one(void *table)
{
        /* Synchronize first, and only then release the table. */
        tlb_remove_table_sync_one();
        __tlb_remove_table(table);
}
 209
 210 static void tlb_table_flush(struct mmu_gather *tlb)
 211 {
 212         struct mmu_table_batch **batch = &tlb->batch;
 213
 214         if (*batch) {
 215                 tlb_table_invalidate(tlb);
 216                 tlb_remove_table_free(*batch);
 217                 *batch = NULL;
 218         }
 219 }
 220
 221 void tlb_remove_table(struct mmu_gather *tlb, void *table)
 222 {
 223         struct mmu_table_batch **batch = &tlb->batch;
 224
 225         if (*batch == NULL) {
 226                 *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
 227                 if (*batch == NULL) {
 228                         tlb_table_invalidate(tlb);
 229                         tlb_remove_table_one(table);
 230                         return;
 231                 }
 232                 (*batch)->nr = 0;
 233         }
 234
 235         (*batch)->tables[(*batch)->nr++] = table;
 236         if ((*batch)->nr == MAX_TABLE_BATCH)
 237                 tlb_table_flush(tlb);
 238 }
 239
/* Start @tlb out with no table batch; tlb_remove_table() allocates one on demand. */
static inline void tlb_table_init(struct mmu_gather *tlb)
{
        tlb->batch = NULL;
}
 244
 245 #else /* !CONFIG_MMU_GATHER_TABLE_FREE */
 246
/* No table batching without CONFIG_MMU_GATHER_TABLE_FREE: nothing to flush. */
static inline void tlb_table_flush(struct mmu_gather *tlb) { }
/* No table batching without CONFIG_MMU_GATHER_TABLE_FREE: nothing to initialize. */
static inline void tlb_table_init(struct mmu_gather *tlb) { }
 249
 250 #endif /* CONFIG_MMU_GATHER_TABLE_FREE */
 251
/*
 * Release what @tlb has gathered: first the pending page-table batch, then
 * (unless CONFIG_MMU_GATHER_NO_GATHER is set) the batched pages.
 * No TLB flush is issued here; tlb_flush_mmu() does that before calling in.
 */
static void tlb_flush_mmu_free(struct mmu_gather *tlb)
{
        tlb_table_flush(tlb);
#ifndef CONFIG_MMU_GATHER_NO_GATHER
        tlb_batch_pages_flush(tlb);
#endif
}
 259
/*
 * Flush the TLB for @tlb and then free everything that was gathered. The
 * order matters: the flush is issued before any page or table is released.
 */
void tlb_flush_mmu(struct mmu_gather *tlb)
{
        tlb_flush_mmu_tlbonly(tlb);
        tlb_flush_mmu_free(tlb);
}
 265
/*
 * Common initialization behind tlb_gather_mmu() and tlb_gather_mmu_fullmm().
 *
 * Records @mm and @fullmm in @tlb, resets the page and table batching state
 * and the flush range, and finally marks a TLB flush as pending on @mm.
 */
static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
                             bool fullmm)
{
        /*
         * struct mmu_gather contains 7 1-bit fields packed into a 32-bit
         * unsigned int value. The remaining 25 bits remain uninitialized
         * and are never used, but KMSAN updates the origin for them in
         * zap_pXX_range() in mm/memory.c, thus creating very long origin
         * chains. This is technically correct, but consumes too much memory.
         * Unpoisoning the whole structure will prevent creating such chains.
         */
        kmsan_unpoison_memory(tlb, sizeof(*tlb));
        tlb->mm = mm;
        tlb->fullmm = fullmm;

#ifndef CONFIG_MMU_GATHER_NO_GATHER
        /* Begin with only the embedded 'local' batch, empty and unchained. */
        tlb->need_flush_all = 0;
        tlb->local.next = NULL;
        tlb->local.nr   = 0;
        tlb->local.max  = ARRAY_SIZE(tlb->__pages);
        tlb->active     = &tlb->local;
        tlb->batch_count = 0;
#endif

        tlb_table_init(tlb);
#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
        tlb->page_size = 0;
#endif

        __tlb_reset_range(tlb);
        /* Balanced by dec_tlb_flush_pending() in tlb_finish_mmu(). */
        inc_tlb_flush_pending(tlb->mm);
}
 298
/**
 * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down
 * @tlb: the mmu_gather structure to initialize
 * @mm: the mm_struct of the target address space
 *
 * Called to initialize an (on-stack) mmu_gather structure for page-table
 * tear-down from @mm.
 *
 * The structure is set up with fullmm cleared; use tlb_gather_mmu_fullmm()
 * when the whole address space is being destroyed.
 */
void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm)
{
        __tlb_gather_mmu(tlb, mm, false);
}
 311
/**
 * tlb_gather_mmu_fullmm - initialize an mmu_gather structure for page-table tear-down
 * @tlb: the mmu_gather structure to initialize
 * @mm: the mm_struct of the target address space
 *
 * In this case, @mm is without users and we're going to destroy the
 * full address space (exit/execve).
 *
 * Called to initialize an (on-stack) mmu_gather structure for page-table
 * tear-down from @mm. Identical to tlb_gather_mmu() except that the
 * structure is set up with fullmm set.
 */
void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm)
{
        __tlb_gather_mmu(tlb, mm, true);
}
 327
/**
 * tlb_finish_mmu - finish an mmu_gather structure
 * @tlb: the mmu_gather structure to finish
 *
 * Called at the end of the shootdown operation to free up any resources that
 * were required.
 *
 * Performs the final flush and free of everything gathered, releases the
 * extra gather batches, and drops the pending-flush mark taken when @tlb was
 * initialized.
 */
void tlb_finish_mmu(struct mmu_gather *tlb)
{
        /*
         * If parallel threads are doing PTE changes on the same range
         * under a non-exclusive lock (e.g., mmap_lock read-side) but defer
         * the TLB flush by batching, one thread may end up seeing
         * inconsistent PTEs, resulting in stale TLB entries.  So flush the
         * TLB forcefully if we detect parallel PTE batching threads.
         *
         * However, some syscalls, e.g. munmap(), may free page tables; this
         * requires a forced flush of everything in the given range.
         * Otherwise stale TLB entries may remain on architectures, e.g.
         * aarch64, that can specify which TLB level to flush.
         */
        if (mm_tlb_flush_nested(tlb->mm)) {
                /*
                 * The aarch64 yields better performance with fullmm by
                 * avoiding multiple CPUs spamming TLBI messages at the
                 * same time.
                 *
                 * On x86 non-fullmm doesn't yield significant difference
                 * against fullmm.
                 */
                tlb->fullmm = 1;
                __tlb_reset_range(tlb);
                tlb->freed_tables = 1;
        }

        /* Final TLB flush, then free whatever is still gathered. */
        tlb_flush_mmu(tlb);

#ifndef CONFIG_MMU_GATHER_NO_GATHER
        /* The pages are gone; now release the extra batch pages as well. */
        tlb_batch_list_free(tlb);
#endif
        /* Balances the inc_tlb_flush_pending() done in __tlb_gather_mmu(). */
        dec_tlb_flush_pending(tlb->mm);
}