mm/mmu_gather.c

#include <linux/gfp.h>
#include <linux/highmem.h>
#include <linux/kernel.h>
#include <linux/mmdebug.h>
#include <linux/mm_types.h>
#include <linux/mm_inline.h>
#include <linux/pagemap.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/smp.h>
#include <linux/swap.h>

#include <asm/pgalloc.h>
#include <asm/tlb.h>
  14
  15 #ifndef CONFIG_MMU_GATHER_NO_GATHER
  16
  17 static bool tlb_next_batch(struct mmu_gather *tlb)
  18 {
  19         struct mmu_gather_batch *batch;
  20
  21         batch = tlb->active;
  22         if (batch->next) {
  23                 tlb->active = batch->next;
  24                 return true;
  25         }
  26
  27         if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
  28                 return false;
  29
  30         batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
  31         if (!batch)
  32                 return false;
  33
  34         tlb->batch_count++;
  35         batch->next = NULL;
  36         batch->nr   = 0;
  37         batch->max  = MAX_GATHER_BATCH;
  38
  39         tlb->active->next = batch;
  40         tlb->active = batch;
  41
  42         return true;
  43 }
  44
  45 static void tlb_batch_pages_flush(struct mmu_gather *tlb)
  46 {
  47         struct mmu_gather_batch *batch;
  48
  49         for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
  50                 free_pages_and_swap_cache(batch->pages, batch->nr);
  51                 batch->nr = 0;
  52         }
  53         tlb->active = &tlb->local;
  54 }
  55
  56 static void tlb_batch_list_free(struct mmu_gather *tlb)
  57 {
  58         struct mmu_gather_batch *batch, *next;
  59
  60         for (batch = tlb->local.next; batch; batch = next) {
  61                 next = batch->next;
  62                 free_pages((unsigned long)batch, 0);
  63         }
  64         tlb->local.next = NULL;
  65 }
  66
  67 bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size)
  68 {
  69         struct mmu_gather_batch *batch;
  70
  71         VM_BUG_ON(!tlb->end);
  72
  73 #ifdef CONFIG_MMU_GATHER_PAGE_SIZE
  74         VM_WARN_ON(tlb->page_size != page_size);
  75 #endif
  76
  77         batch = tlb->active;
  78         /*
  79          * Add the page and check if we are full. If so
  80          * force a flush.
  81          */
  82         batch->pages[batch->nr++] = page;
  83         if (batch->nr == batch->max) {
  84                 if (!tlb_next_batch(tlb))
  85                         return true;
  86                 batch = tlb->active;
  87         }
  88         VM_BUG_ON_PAGE(batch->nr > batch->max, page);
  89
  90         return false;
  91 }
  92
  93 #endif /* MMU_GATHER_NO_GATHER */
  94
  95 #ifdef CONFIG_MMU_GATHER_TABLE_FREE
  96
  97 static void __tlb_remove_table_free(struct mmu_table_batch *batch)
  98 {
  99         int i;
 100
 101         for (i = 0; i < batch->nr; i++)
 102                 __tlb_remove_table(batch->tables[i]);
 103
 104         free_page((unsigned long)batch);
 105 }
 106
 107 #ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE
 108
 109 /*
 110  * Semi RCU freeing of the page directories.
 111  *
 112  * This is needed by some architectures to implement software pagetable walkers.
 113  *
 114  * gup_fast() and other software pagetable walkers do a lockless page-table
  115  * walk and therefore need some synchronization with the freeing of the page
 116  * directories. The chosen means to accomplish that is by disabling IRQs over
 117  * the walk.
 118  *
 119  * Architectures that use IPIs to flush TLBs will then automagically DTRT,
 120  * since we unlink the page, flush TLBs, free the page. Since the disabling of
 121  * IRQs delays the completion of the TLB flush we can never observe an already
 122  * freed page.
 123  *
 124  * Architectures that do not have this (PPC) need to delay the freeing by some
 125  * other means, this is that means.
 126  *
 127  * What we do is batch the freed directory pages (tables) and RCU free them.
 128  * We use the sched RCU variant, as that guarantees that IRQ/preempt disabling
 129  * holds off grace periods.
 130  *
 131  * However, in order to batch these pages we need to allocate storage, this
 132  * allocation is deep inside the MM code and can thus easily fail on memory
 133  * pressure. To guarantee progress we fall back to single table freeing, see
 134  * the implementation of tlb_remove_table_one().
 135  *
 136  */
 137
 138 static void tlb_remove_table_smp_sync(void *arg)
 139 {
 140         /* Simply deliver the interrupt */
 141 }
 142
 143 static void tlb_remove_table_sync_one(void)
 144 {
 145         /*
 146          * This isn't an RCU grace period and hence the page-tables cannot be
 147          * assumed to be actually RCU-freed.
 148          *
 149          * It is however sufficient for software page-table walkers that rely on
 150          * IRQ disabling.
 151          */
 152         smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
 153 }
 154
 155 static void tlb_remove_table_rcu(struct rcu_head *head)
 156 {
 157         __tlb_remove_table_free(container_of(head, struct mmu_table_batch, rcu));
 158 }
 159
 160 static void tlb_remove_table_free(struct mmu_table_batch *batch)
 161 {
 162         call_rcu(&batch->rcu, tlb_remove_table_rcu);
 163 }
 164
 165 #else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */
 166
 167 static void tlb_remove_table_sync_one(void) { }
 168
 169 static void tlb_remove_table_free(struct mmu_table_batch *batch)
 170 {
 171         __tlb_remove_table_free(batch);
 172 }
 173
 174 #endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */
 175
 176 /*
 177  * If we want tlb_remove_table() to imply TLB invalidates.
 178  */
 179 static inline void tlb_table_invalidate(struct mmu_gather *tlb)
 180 {
 181         if (tlb_needs_table_invalidate()) {
 182                 /*
 183                  * Invalidate page-table caches used by hardware walkers. Then
 184                  * we still need to RCU-sched wait while freeing the pages
 185                  * because software walkers can still be in-flight.
 186                  */
 187                 tlb_flush_mmu_tlbonly(tlb);
 188         }
 189 }
 190
 191 static void tlb_remove_table_one(void *table)
 192 {
 193         tlb_remove_table_sync_one();
 194         __tlb_remove_table(table);
 195 }
 196
 197 static void tlb_table_flush(struct mmu_gather *tlb)
 198 {
 199         struct mmu_table_batch **batch = &tlb->batch;
 200
 201         if (*batch) {
 202                 tlb_table_invalidate(tlb);
 203                 tlb_remove_table_free(*batch);
 204                 *batch = NULL;
 205         }
 206 }
 207
 208 void tlb_remove_table(struct mmu_gather *tlb, void *table)
 209 {
 210         struct mmu_table_batch **batch = &tlb->batch;
 211
 212         if (*batch == NULL) {
 213                 *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
 214                 if (*batch == NULL) {
 215                         tlb_table_invalidate(tlb);
 216                         tlb_remove_table_one(table);
 217                         return;
 218                 }
 219                 (*batch)->nr = 0;
 220         }
 221
 222         (*batch)->tables[(*batch)->nr++] = table;
 223         if ((*batch)->nr == MAX_TABLE_BATCH)
 224                 tlb_table_flush(tlb);
 225 }
 226
 227 static inline void tlb_table_init(struct mmu_gather *tlb)
 228 {
 229         tlb->batch = NULL;
 230 }
 231
 232 #else /* !CONFIG_MMU_GATHER_TABLE_FREE */
 233
 234 static inline void tlb_table_flush(struct mmu_gather *tlb) { }
 235 static inline void tlb_table_init(struct mmu_gather *tlb) { }
 236
 237 #endif /* CONFIG_MMU_GATHER_TABLE_FREE */
 238
 239 static void tlb_flush_mmu_free(struct mmu_gather *tlb)
 240 {
 241         tlb_table_flush(tlb);
 242 #ifndef CONFIG_MMU_GATHER_NO_GATHER
 243         tlb_batch_pages_flush(tlb);
 244 #endif
 245 }
 246
 247 void tlb_flush_mmu(struct mmu_gather *tlb)
 248 {
 249         tlb_flush_mmu_tlbonly(tlb);
 250         tlb_flush_mmu_free(tlb);
 251 }
 252
 253 static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
 254                              bool fullmm)
 255 {
 256         tlb->mm = mm;
 257         tlb->fullmm = fullmm;
 258
 259 #ifndef CONFIG_MMU_GATHER_NO_GATHER
 260         tlb->need_flush_all = 0;
 261         tlb->local.next = NULL;
 262         tlb->local.nr   = 0;
 263         tlb->local.max  = ARRAY_SIZE(tlb->__pages);
 264         tlb->active     = &tlb->local;
 265         tlb->batch_count = 0;
 266 #endif
 267
 268         tlb_table_init(tlb);
 269 #ifdef CONFIG_MMU_GATHER_PAGE_SIZE
 270         tlb->page_size = 0;
 271 #endif
 272
 273         __tlb_reset_range(tlb);
 274         inc_tlb_flush_pending(tlb->mm);
 275 }
 276
 277 /**
 278  * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down
 279  * @tlb: the mmu_gather structure to initialize
 280  * @mm: the mm_struct of the target address space
 281  *
 282  * Called to initialize an (on-stack) mmu_gather structure for page-table
 283  * tear-down from @mm.
 284  */
 285 void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm)
 286 {
 287         __tlb_gather_mmu(tlb, mm, false);
 288 }
 289
 290 /**
 291  * tlb_gather_mmu_fullmm - initialize an mmu_gather structure for page-table tear-down
 292  * @tlb: the mmu_gather structure to initialize
 293  * @mm: the mm_struct of the target address space
 294  *
 295  * In this case, @mm is without users and we're going to destroy the
 296  * full address space (exit/execve).
 297  *
 298  * Called to initialize an (on-stack) mmu_gather structure for page-table
 299  * tear-down from @mm.
 300  */
 301 void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm)
 302 {
 303         __tlb_gather_mmu(tlb, mm, true);
 304 }
 305
 306 /**
 307  * tlb_finish_mmu - finish an mmu_gather structure
 308  * @tlb: the mmu_gather structure to finish
 309  *
 310  * Called at the end of the shootdown operation to free up any resources that
 311  * were required.
 312  */
 313 void tlb_finish_mmu(struct mmu_gather *tlb)
 314 {
 315         /*
 316          * If there are parallel threads are doing PTE changes on same range
 317          * under non-exclusive lock (e.g., mmap_lock read-side) but defer TLB
 318          * flush by batching, one thread may end up seeing inconsistent PTEs
 319          * and result in having stale TLB entries.  So flush TLB forcefully
 320          * if we detect parallel PTE batching threads.
 321          *
 322          * However, some syscalls, e.g. munmap(), may free page tables, this
 323          * needs force flush everything in the given range. Otherwise this
 324          * may result in having stale TLB entries for some architectures,
 325          * e.g. aarch64, that could specify flush what level TLB.
 326          */
 327         if (mm_tlb_flush_nested(tlb->mm)) {
 328                 /*
 329                  * The aarch64 yields better performance with fullmm by
 330                  * avoiding multiple CPUs spamming TLBI messages at the
 331                  * same time.
 332                  *
 333                  * On x86 non-fullmm doesn't yield significant difference
 334                  * against fullmm.
 335                  */
 336                 tlb->fullmm = 1;
 337                 __tlb_reset_range(tlb);
 338                 tlb->freed_tables = 1;
 339         }
 340
 341         tlb_flush_mmu(tlb);
 342
 343 #ifndef CONFIG_MMU_GATHER_NO_GATHER
 344         tlb_batch_list_free(tlb);
 345 #endif
 346         dec_tlb_flush_pending(tlb->mm);
 347 }