mm/mmu_gather.c

   1 #include <linux/gfp.h>
   2 #include <linux/highmem.h>
   3 #include <linux/kernel.h>
   4 #include <linux/kmsan-checks.h>
   5 #include <linux/mmdebug.h>
   6 #include <linux/mm_types.h>
   7 #include <linux/mm_inline.h>
   8 #include <linux/pagemap.h>
   9 #include <linux/rcupdate.h>
  10 #include <linux/smp.h>
  11 #include <linux/swap.h>
  12
  13 #include <asm/pgalloc.h>
  14 #include <asm/tlb.h>
  15
  16 #ifndef CONFIG_MMU_GATHER_NO_GATHER
  17
  18 static bool tlb_next_batch(struct mmu_gather *tlb)
  19 {
  20         struct mmu_gather_batch *batch;
  21
  22         batch = tlb->active;
  23         if (batch->next) {
  24                 tlb->active = batch->next;
  25                 return true;
  26         }
  27
  28         if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
  29                 return false;
  30
  31         batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
  32         if (!batch)
  33                 return false;
  34
  35         tlb->batch_count++;
  36         batch->next = NULL;
  37         batch->nr   = 0;
  38         batch->max  = MAX_GATHER_BATCH;
  39
  40         tlb->active->next = batch;
  41         tlb->active = batch;
  42
  43         return true;
  44 }
  45
  46 static void tlb_batch_pages_flush(struct mmu_gather *tlb)
  47 {
  48         struct mmu_gather_batch *batch;
  49
  50         for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
  51                 struct page **pages = batch->pages;
  52
  53                 do {
  54                         /*
  55                          * limit free batch count when PAGE_SIZE > 4K
  56                          */
  57                         unsigned int nr = min(512U, batch->nr);
  58
  59                         free_pages_and_swap_cache(pages, nr);
  60                         pages += nr;
  61                         batch->nr -= nr;
  62
  63                         cond_resched();
  64                 } while (batch->nr);
  65         }
  66         tlb->active = &tlb->local;
  67 }
  68
  69 static void tlb_batch_list_free(struct mmu_gather *tlb)
  70 {
  71         struct mmu_gather_batch *batch, *next;
  72
  73         for (batch = tlb->local.next; batch; batch = next) {
  74                 next = batch->next;
  75                 free_pages((unsigned long)batch, 0);
  76         }
  77         tlb->local.next = NULL;
  78 }
  79
  80 bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size)
  81 {
  82         struct mmu_gather_batch *batch;
  83
  84         VM_BUG_ON(!tlb->end);
  85
  86 #ifdef CONFIG_MMU_GATHER_PAGE_SIZE
  87         VM_WARN_ON(tlb->page_size != page_size);
  88 #endif
  89
  90         batch = tlb->active;
  91         /*
  92          * Add the page and check if we are full. If so
  93          * force a flush.
  94          */
  95         batch->pages[batch->nr++] = page;
  96         if (batch->nr == batch->max) {
  97                 if (!tlb_next_batch(tlb))
  98                         return true;
  99                 batch = tlb->active;
 100         }
 101         VM_BUG_ON_PAGE(batch->nr > batch->max, page);
 102
 103         return false;
 104 }
 105
 106 #endif /* MMU_GATHER_NO_GATHER */
 107
 108 #ifdef CONFIG_MMU_GATHER_TABLE_FREE
 109
 110 static void __tlb_remove_table_free(struct mmu_table_batch *batch)
 111 {
 112         int i;
 113
 114         for (i = 0; i < batch->nr; i++)
 115                 __tlb_remove_table(batch->tables[i]);
 116
 117         free_page((unsigned long)batch);
 118 }
 119
 120 #ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE
 121
 122 /*
 123  * Semi RCU freeing of the page directories.
 124  *
 125  * This is needed by some architectures to implement software pagetable walkers.
 126  *
  127  * gup_fast() and other software pagetable walkers do a lockless page-table
  128  * walk and therefore need some synchronization with the freeing of the page
  129  * directories. The chosen means to accomplish that is by disabling IRQs over
  130  * the walk.
 131  *
 132  * Architectures that use IPIs to flush TLBs will then automagically DTRT,
 133  * since we unlink the page, flush TLBs, free the page. Since the disabling of
 134  * IRQs delays the completion of the TLB flush we can never observe an already
 135  * freed page.
 136  *
 137  * Architectures that do not have this (PPC) need to delay the freeing by some
 138  * other means, this is that means.
 139  *
 140  * What we do is batch the freed directory pages (tables) and RCU free them.
 141  * We use the sched RCU variant, as that guarantees that IRQ/preempt disabling
 142  * holds off grace periods.
 143  *
 144  * However, in order to batch these pages we need to allocate storage, this
 145  * allocation is deep inside the MM code and can thus easily fail on memory
 146  * pressure. To guarantee progress we fall back to single table freeing, see
 147  * the implementation of tlb_remove_table_one().
 148  *
 149  */
 150
 151 static void tlb_remove_table_smp_sync(void *arg)
 152 {
 153         /* Simply deliver the interrupt */
 154 }
 155
 156 void tlb_remove_table_sync_one(void)
 157 {
 158         /*
 159          * This isn't an RCU grace period and hence the page-tables cannot be
 160          * assumed to be actually RCU-freed.
 161          *
 162          * It is however sufficient for software page-table walkers that rely on
 163          * IRQ disabling.
 164          */
 165         smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
 166 }
 167
 168 static void tlb_remove_table_rcu(struct rcu_head *head)
 169 {
 170         __tlb_remove_table_free(container_of(head, struct mmu_table_batch, rcu));
 171 }
 172
 173 static void tlb_remove_table_free(struct mmu_table_batch *batch)
 174 {
 175         call_rcu(&batch->rcu, tlb_remove_table_rcu);
 176 }
 177
 178 #else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */
 179
 180 static void tlb_remove_table_free(struct mmu_table_batch *batch)
 181 {
 182         __tlb_remove_table_free(batch);
 183 }
 184
 185 #endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */
 186
 187 /*
 188  * If we want tlb_remove_table() to imply TLB invalidates.
 189  */
 190 static inline void tlb_table_invalidate(struct mmu_gather *tlb)
 191 {
 192         if (tlb_needs_table_invalidate()) {
 193                 /*
 194                  * Invalidate page-table caches used by hardware walkers. Then
 195                  * we still need to RCU-sched wait while freeing the pages
 196                  * because software walkers can still be in-flight.
 197                  */
 198                 tlb_flush_mmu_tlbonly(tlb);
 199         }
 200 }
 201
 202 static void tlb_remove_table_one(void *table)
 203 {
 204         tlb_remove_table_sync_one();
 205         __tlb_remove_table(table);
 206 }
 207
 208 static void tlb_table_flush(struct mmu_gather *tlb)
 209 {
 210         struct mmu_table_batch **batch = &tlb->batch;
 211
 212         if (*batch) {
 213                 tlb_table_invalidate(tlb);
 214                 tlb_remove_table_free(*batch);
 215                 *batch = NULL;
 216         }
 217 }
 218
 219 void tlb_remove_table(struct mmu_gather *tlb, void *table)
 220 {
 221         struct mmu_table_batch **batch = &tlb->batch;
 222
 223         if (*batch == NULL) {
 224                 *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
 225                 if (*batch == NULL) {
 226                         tlb_table_invalidate(tlb);
 227                         tlb_remove_table_one(table);
 228                         return;
 229                 }
 230                 (*batch)->nr = 0;
 231         }
 232
 233         (*batch)->tables[(*batch)->nr++] = table;
 234         if ((*batch)->nr == MAX_TABLE_BATCH)
 235                 tlb_table_flush(tlb);
 236 }
 237
 238 static inline void tlb_table_init(struct mmu_gather *tlb)
 239 {
 240         tlb->batch = NULL;
 241 }
 242
 243 #else /* !CONFIG_MMU_GATHER_TABLE_FREE */
 244
 245 static inline void tlb_table_flush(struct mmu_gather *tlb) { }
 246 static inline void tlb_table_init(struct mmu_gather *tlb) { }
 247
 248 #endif /* CONFIG_MMU_GATHER_TABLE_FREE */
 249
 250 static void tlb_flush_mmu_free(struct mmu_gather *tlb)
 251 {
 252         tlb_table_flush(tlb);
 253 #ifndef CONFIG_MMU_GATHER_NO_GATHER
 254         tlb_batch_pages_flush(tlb);
 255 #endif
 256 }
 257
 258 void tlb_flush_mmu(struct mmu_gather *tlb)
 259 {
 260         tlb_flush_mmu_tlbonly(tlb);
 261         tlb_flush_mmu_free(tlb);
 262 }
 263
 264 static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
 265                              bool fullmm)
 266 {
 267         /*
 268          * struct mmu_gather contains 7 1-bit fields packed into a 32-bit
 269          * unsigned int value. The remaining 25 bits remain uninitialized
 270          * and are never used, but KMSAN updates the origin for them in
 271          * zap_pXX_range() in mm/memory.c, thus creating very long origin
 272          * chains. This is technically correct, but consumes too much memory.
 273          * Unpoisoning the whole structure will prevent creating such chains.
 274          */
 275         kmsan_unpoison_memory(tlb, sizeof(*tlb));
 276         tlb->mm = mm;
 277         tlb->fullmm = fullmm;
 278
 279 #ifndef CONFIG_MMU_GATHER_NO_GATHER
 280         tlb->need_flush_all = 0;
 281         tlb->local.next = NULL;
 282         tlb->local.nr   = 0;
 283         tlb->local.max  = ARRAY_SIZE(tlb->__pages);
 284         tlb->active     = &tlb->local;
 285         tlb->batch_count = 0;
 286 #endif
 287
 288         tlb_table_init(tlb);
 289 #ifdef CONFIG_MMU_GATHER_PAGE_SIZE
 290         tlb->page_size = 0;
 291 #endif
 292
 293         __tlb_reset_range(tlb);
 294         inc_tlb_flush_pending(tlb->mm);
 295 }
 296
 297 /**
 298  * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down
 299  * @tlb: the mmu_gather structure to initialize
 300  * @mm: the mm_struct of the target address space
 301  *
 302  * Called to initialize an (on-stack) mmu_gather structure for page-table
 303  * tear-down from @mm.
 304  */
 305 void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm)
 306 {
 307         __tlb_gather_mmu(tlb, mm, false);
 308 }
 309
 310 /**
 311  * tlb_gather_mmu_fullmm - initialize an mmu_gather structure for page-table tear-down
 312  * @tlb: the mmu_gather structure to initialize
 313  * @mm: the mm_struct of the target address space
 314  *
 315  * In this case, @mm is without users and we're going to destroy the
 316  * full address space (exit/execve).
 317  *
 318  * Called to initialize an (on-stack) mmu_gather structure for page-table
 319  * tear-down from @mm.
 320  */
 321 void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm)
 322 {
 323         __tlb_gather_mmu(tlb, mm, true);
 324 }
 325
 326 /**
 327  * tlb_finish_mmu - finish an mmu_gather structure
 328  * @tlb: the mmu_gather structure to finish
 329  *
 330  * Called at the end of the shootdown operation to free up any resources that
 331  * were required.
 332  */
 333 void tlb_finish_mmu(struct mmu_gather *tlb)
 334 {
 335         /*
 336          * If there are parallel threads are doing PTE changes on same range
 337          * under non-exclusive lock (e.g., mmap_lock read-side) but defer TLB
 338          * flush by batching, one thread may end up seeing inconsistent PTEs
 339          * and result in having stale TLB entries.  So flush TLB forcefully
 340          * if we detect parallel PTE batching threads.
 341          *
 342          * However, some syscalls, e.g. munmap(), may free page tables, this
 343          * needs force flush everything in the given range. Otherwise this
 344          * may result in having stale TLB entries for some architectures,
 345          * e.g. aarch64, that could specify flush what level TLB.
 346          */
 347         if (mm_tlb_flush_nested(tlb->mm)) {
 348                 /*
 349                  * The aarch64 yields better performance with fullmm by
 350                  * avoiding multiple CPUs spamming TLBI messages at the
 351                  * same time.
 352                  *
 353                  * On x86 non-fullmm doesn't yield significant difference
 354                  * against fullmm.
 355                  */
 356                 tlb->fullmm = 1;
 357                 __tlb_reset_range(tlb);
 358                 tlb->freed_tables = 1;
 359         }
 360
 361         tlb_flush_mmu(tlb);
 362
 363 #ifndef CONFIG_MMU_GATHER_NO_GATHER
 364         tlb_batch_list_free(tlb);
 365 #endif
 366         dec_tlb_flush_pending(tlb->mm);
 367 }