mm/mlock.c

   1 /*
   2  *      linux/mm/mlock.c
   3  *
   4  *  (C) Copyright 1995 Linus Torvalds
   5  *  (C) Copyright 2002 Christoph Hellwig
   6  */
   7
   8 #include <linux/capability.h>
   9 #include <linux/mman.h>
  10 #include <linux/mm.h>
  11 #include <linux/swap.h>
  12 #include <linux/swapops.h>
  13 #include <linux/pagemap.h>
  14 #include <linux/mempolicy.h>
  15 #include <linux/syscalls.h>
  16 #include <linux/sched.h>
  17 #include <linux/module.h>
  18 #include <linux/rmap.h>
  19 #include <linux/mmzone.h>
  20 #include <linux/hugetlb.h>
  21
  22 #include "internal.h"
  23
  24 int can_do_mlock(void)
  25 {
  26         if (capable(CAP_IPC_LOCK))
  27                 return 1;
  28         if (current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur != 0)
  29                 return 1;
  30         return 0;
  31 }
  32 EXPORT_SYMBOL(can_do_mlock);
  33
  34 #ifdef CONFIG_UNEVICTABLE_LRU
  35 /*
  36  * Mlocked pages are marked with PageMlocked() flag for efficient testing
  37  * in vmscan and, possibly, the fault path; and to support semi-accurate
  38  * statistics.
  39  *
  40  * An mlocked page [PageMlocked(page)] is unevictable.  As such, it will
  41  * be placed on the LRU "unevictable" list, rather than the [in]active lists.
  42  * The unevictable list is an LRU sibling list to the [in]active lists.
  43  * PageUnevictable is set to indicate the unevictable state.
  44  *
  45  * When lazy mlocking via vmscan, it is important to ensure that the
  46  * vma's VM_LOCKED status is not concurrently being modified, otherwise we
  47  * may have mlocked a page that is being munlocked. So lazy mlock must take
  48  * the mmap_sem for read, and verify that the vma really is locked
  49  * (see mm/rmap.c).
  50  */
  51
  52 /*
  53  *  LRU accounting for clear_page_mlock()
  54  */
  55 void __clear_page_mlock(struct page *page)
  56 {
  57         VM_BUG_ON(!PageLocked(page));
  58
  59         if (!page->mapping) {   /* truncated ? */
  60                 return;
  61         }
  62
  63         if (!isolate_lru_page(page)) {
  64                 putback_lru_page(page);
  65         } else {
  66                 /*
  67                  * Page not on the LRU yet.  Flush all pagevecs and retry.
  68                  */
  69                 lru_add_drain_all();
  70                 if (!isolate_lru_page(page))
  71                         putback_lru_page(page);
  72         }
  73 }
  74
  75 /*
  76  * Mark page as mlocked if not already.
  77  * If page on LRU, isolate and putback to move to unevictable list.
  78  */
  79 void mlock_vma_page(struct page *page)
  80 {
  81         BUG_ON(!PageLocked(page));
  82
  83         if (!TestSetPageMlocked(page) && !isolate_lru_page(page))
  84                 putback_lru_page(page);
  85 }
  86
  87 /*
  88  * called from munlock()/munmap() path with page supposedly on the LRU.
  89  *
  90  * Note:  unlike mlock_vma_page(), we can't just clear the PageMlocked
  91  * [in try_to_munlock()] and then attempt to isolate the page.  We must
  92  * isolate the page to keep others from messing with its unevictable
  93  * and mlocked state while trying to munlock.  However, we pre-clear the
  94  * mlocked state anyway as we might lose the isolation race and we might
  95  * not get another chance to clear PageMlocked.  If we successfully
  96  * isolate the page and try_to_munlock() detects other VM_LOCKED vmas
  97  * mapping the page, it will restore the PageMlocked state, unless the page
  98  * is mapped in a non-linear vma.  So, we go ahead and SetPageMlocked(),
  99  * perhaps redundantly.
 100  * If we lose the isolation race, and the page is mapped by other VM_LOCKED
 101  * vmas, we'll detect this in vmscan--via try_to_munlock() or try_to_unmap()
 102  * either of which will restore the PageMlocked state by calling
 103  * mlock_vma_page() above, if it can grab the vma's mmap sem.
 104  */
 105 static void munlock_vma_page(struct page *page)
 106 {
 107         BUG_ON(!PageLocked(page));
 108
 109         if (TestClearPageMlocked(page) && !isolate_lru_page(page)) {
 110                 try_to_munlock(page);
 111                 putback_lru_page(page);
 112         }
 113 }
 114
 115 /**
 116  * __mlock_vma_pages_range() -  mlock/munlock a range of pages in the vma.
 117  * @vma:   target vma
 118  * @start: start address
 119  * @end:   end address
 120  * @mlock: 0 indicate munlock, otherwise mlock.
 121  *
 122  * If @mlock == 0, unlock an mlocked range;
 123  * else mlock the range of pages.  This takes care of making the pages present ,
 124  * too.
 125  *
 126  * return 0 on success, negative error code on error.
 127  *
 128  * vma->vm_mm->mmap_sem must be held for at least read.
 129  */
 130 static long __mlock_vma_pages_range(struct vm_area_struct *vma,
 131                                    unsigned long start, unsigned long end,
 132                                    int mlock)
 133 {
 134         struct mm_struct *mm = vma->vm_mm;
 135         unsigned long addr = start;
 136         struct page *pages[16]; /* 16 gives a reasonable batch */
 137         int nr_pages = (end - start) / PAGE_SIZE;
 138         int ret;
 139         int gup_flags = 0;
 140
 141         VM_BUG_ON(start & ~PAGE_MASK);
 142         VM_BUG_ON(end   & ~PAGE_MASK);
 143         VM_BUG_ON(start < vma->vm_start);
 144         VM_BUG_ON(end   > vma->vm_end);
 145         VM_BUG_ON((!rwsem_is_locked(&mm->mmap_sem)) &&
 146                   (atomic_read(&mm->mm_users) != 0));
 147
 148         /*
 149          * mlock:   don't page populate if page has PROT_NONE permission.
 150          * munlock: the pages always do munlock althrough
 151          *          its has PROT_NONE permission.
 152          */
 153         if (!mlock)
 154                 gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS;
 155
 156         if (vma->vm_flags & VM_WRITE)
 157                 gup_flags |= GUP_FLAGS_WRITE;
 158
 159         lru_add_drain_all();    /* push cached pages to LRU */
 160
 161         while (nr_pages > 0) {
 162                 int i;
 163
 164                 cond_resched();
 165
 166                 /*
 167                  * get_user_pages makes pages present if we are
 168                  * setting mlock. and this extra reference count will
 169                  * disable migration of this page.  However, page may
 170                  * still be truncated out from under us.
 171                  */
 172                 ret = __get_user_pages(current, mm, addr,
 173                                 min_t(int, nr_pages, ARRAY_SIZE(pages)),
 174                                 gup_flags, pages, NULL);
 175                 /*
 176                  * This can happen for, e.g., VM_NONLINEAR regions before
 177                  * a page has been allocated and mapped at a given offset,
 178                  * or for addresses that map beyond end of a file.
 179                  * We'll mlock the the pages if/when they get faulted in.
 180                  */
 181                 if (ret < 0)
 182                         break;
 183                 if (ret == 0) {
 184                         /*
 185                          * We know the vma is there, so the only time
 186                          * we cannot get a single page should be an
 187                          * error (ret < 0) case.
 188                          */
 189                         WARN_ON(1);
 190                         break;
 191                 }
 192
 193                 lru_add_drain();        /* push cached pages to LRU */
 194
 195                 for (i = 0; i < ret; i++) {
 196                         struct page *page = pages[i];
 197
 198                         lock_page(page);
 199                         /*
 200                          * Because we lock page here and migration is blocked
 201                          * by the elevated reference, we need only check for
 202                          * page truncation (file-cache only).
 203                          */
 204                         if (page->mapping) {
 205                                 if (mlock)
 206                                         mlock_vma_page(page);
 207                                 else
 208                                         munlock_vma_page(page);
 209                         }
 210                         unlock_page(page);
 211                         put_page(page);         /* ref from get_user_pages() */
 212
 213                         /*
 214                          * here we assume that get_user_pages() has given us
 215                          * a list of virtually contiguous pages.
 216                          */
 217                         addr += PAGE_SIZE;      /* for next get_user_pages() */
 218                         nr_pages--;
 219                 }
 220         }
 221
 222         lru_add_drain_all();    /* to update stats */
 223
 224         return 0;       /* count entire vma as locked_vm */
 225 }
 226
 227 #else /* CONFIG_UNEVICTABLE_LRU */
 228
 229 /*
 230  * Just make pages present if VM_LOCKED.  No-op if unlocking.
 231  */
 232 static long __mlock_vma_pages_range(struct vm_area_struct *vma,
 233                                    unsigned long start, unsigned long end,
 234                                    int mlock)
 235 {
 236         if (mlock && (vma->vm_flags & VM_LOCKED))
 237                 make_pages_present(start, end);
 238         return 0;
 239 }
 240 #endif /* CONFIG_UNEVICTABLE_LRU */
 241
 242 /**
 243  * mlock_vma_pages_range() - mlock pages in specified vma range.
 244  * @vma - the vma containing the specfied address range
 245  * @start - starting address in @vma to mlock
 246  * @end   - end address [+1] in @vma to mlock
 247  *
 248  * For mmap()/mremap()/expansion of mlocked vma.
 249  *
 250  * return 0 on success for "normal" vmas.
 251  *
 252  * return number of pages [> 0] to be removed from locked_vm on success
 253  * of "special" vmas.
 254  *
 255  * return negative error if vma spanning @start-@range disappears while
 256  * mmap semaphore is dropped.  Unlikely?
 257  */
 258 long mlock_vma_pages_range(struct vm_area_struct *vma,
 259                         unsigned long start, unsigned long end)
 260 {
 261         struct mm_struct *mm = vma->vm_mm;
 262         int nr_pages = (end - start) / PAGE_SIZE;
 263         BUG_ON(!(vma->vm_flags & VM_LOCKED));
 264
 265         /*
 266          * filter unlockable vmas
 267          */
 268         if (vma->vm_flags & (VM_IO | VM_PFNMAP))
 269                 goto no_mlock;
 270
 271         if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
 272                         is_vm_hugetlb_page(vma) ||
 273                         vma == get_gate_vma(current))) {
 274                 long error;
 275                 downgrade_write(&mm->mmap_sem);
 276
 277                 error = __mlock_vma_pages_range(vma, start, end, 1);
 278
 279                 up_read(&mm->mmap_sem);
 280                 /* vma can change or disappear */
 281                 down_write(&mm->mmap_sem);
 282                 vma = find_vma(mm, start);
 283                 /* non-NULL vma must contain @start, but need to check @end */
 284                 if (!vma ||  end > vma->vm_end)
 285                         return -ENOMEM;
 286
 287                 return 0;       /* hide other errors from mmap(), et al */
 288         }
 289
 290         /*
 291          * User mapped kernel pages or huge pages:
 292          * make these pages present to populate the ptes, but
 293          * fall thru' to reset VM_LOCKED--no need to unlock, and
 294          * return nr_pages so these don't get counted against task's
 295          * locked limit.  huge pages are already counted against
 296          * locked vm limit.
 297          */
 298         make_pages_present(start, end);
 299
 300 no_mlock:
 301         vma->vm_flags &= ~VM_LOCKED;    /* and don't come back! */
 302         return nr_pages;                /* error or pages NOT mlocked */
 303 }
 304
 305
 306 /*
 307  * munlock_vma_pages_range() - munlock all pages in the vma range.'
 308  * @vma - vma containing range to be munlock()ed.
 309  * @start - start address in @vma of the range
 310  * @end - end of range in @vma.
 311  *
 312  *  For mremap(), munmap() and exit().
 313  *
 314  * Called with @vma VM_LOCKED.
 315  *
 316  * Returns with VM_LOCKED cleared.  Callers must be prepared to
 317  * deal with this.
 318  *
 319  * We don't save and restore VM_LOCKED here because pages are
 320  * still on lru.  In unmap path, pages might be scanned by reclaim
 321  * and re-mlocked by try_to_{munlock|unmap} before we unmap and
 322  * free them.  This will result in freeing mlocked pages.
 323  */
 324 void munlock_vma_pages_range(struct vm_area_struct *vma,
 325                            unsigned long start, unsigned long end)
 326 {
 327         vma->vm_flags &= ~VM_LOCKED;
 328         __mlock_vma_pages_range(vma, start, end, 0);
 329 }
 330
 331 /*
 332  * mlock_fixup  - handle mlock[all]/munlock[all] requests.
 333  *
 334  * Filters out "special" vmas -- VM_LOCKED never gets set for these, and
 335  * munlock is a no-op.  However, for some special vmas, we go ahead and
 336  * populate the ptes via make_pages_present().
 337  *
 338  * For vmas that pass the filters, merge/split as appropriate.
 339  */
 340 static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
 341         unsigned long start, unsigned long end, unsigned int newflags)
 342 {
 343         struct mm_struct *mm = vma->vm_mm;
 344         pgoff_t pgoff;
 345         int nr_pages;
 346         int ret = 0;
 347         int lock = newflags & VM_LOCKED;
 348
 349         if (newflags == vma->vm_flags ||
 350                         (vma->vm_flags & (VM_IO | VM_PFNMAP)))
 351                 goto out;       /* don't set VM_LOCKED,  don't count */
 352
 353         if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
 354                         is_vm_hugetlb_page(vma) ||
 355                         vma == get_gate_vma(current)) {
 356                 if (lock)
 357                         make_pages_present(start, end);
 358                 goto out;       /* don't set VM_LOCKED,  don't count */
 359         }
 360
 361         pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
 362         *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
 363                           vma->vm_file, pgoff, vma_policy(vma));
 364         if (*prev) {
 365                 vma = *prev;
 366                 goto success;
 367         }
 368
 369         if (start != vma->vm_start) {
 370                 ret = split_vma(mm, vma, start, 1);
 371                 if (ret)
 372                         goto out;
 373         }
 374
 375         if (end != vma->vm_end) {
 376                 ret = split_vma(mm, vma, end, 0);
 377                 if (ret)
 378                         goto out;
 379         }
 380
 381 success:
 382         /*
 383          * Keep track of amount of locked VM.
 384          */
 385         nr_pages = (end - start) >> PAGE_SHIFT;
 386         if (!lock)
 387                 nr_pages = -nr_pages;
 388         mm->locked_vm += nr_pages;
 389
 390         /*
 391          * vm_flags is protected by the mmap_sem held in write mode.
 392          * It's okay if try_to_unmap_one unmaps a page just after we
 393          * set VM_LOCKED, __mlock_vma_pages_range will bring it back.
 394          */
 395         vma->vm_flags = newflags;
 396
 397         if (lock) {
 398                 /*
 399                  * mmap_sem is currently held for write.  Downgrade the write
 400                  * lock to a read lock so that other faults, mmap scans, ...
 401                  * while we fault in all pages.
 402                  */
 403                 downgrade_write(&mm->mmap_sem);
 404
 405                 ret = __mlock_vma_pages_range(vma, start, end, 1);
 406                 if (ret > 0) {
 407                         mm->locked_vm -= ret;
 408                         ret = 0;
 409                 }
 410                 /*
 411                  * Need to reacquire mmap sem in write mode, as our callers
 412                  * expect this.  We have no support for atomically upgrading
 413                  * a sem to write, so we need to check for ranges while sem
 414                  * is unlocked.
 415                  */
 416                 up_read(&mm->mmap_sem);
 417                 /* vma can change or disappear */
 418                 down_write(&mm->mmap_sem);
 419                 *prev = find_vma(mm, start);
 420                 /* non-NULL *prev must contain @start, but need to check @end */
 421                 if (!(*prev) || end > (*prev)->vm_end)
 422                         ret = -ENOMEM;
 423         } else {
 424                 /*
 425                  * TODO:  for unlocking, pages will already be resident, so
 426                  * we don't need to wait for allocations/reclaim/pagein, ...
 427                  * However, unlocking a very large region can still take a
 428                  * while.  Should we downgrade the semaphore for both lock
 429                  * AND unlock ?
 430                  */
 431                 __mlock_vma_pages_range(vma, start, end, 0);
 432         }
 433
 434 out:
 435         *prev = vma;
 436         return ret;
 437 }
 438
 439 static int do_mlock(unsigned long start, size_t len, int on)
 440 {
 441         unsigned long nstart, end, tmp;
 442         struct vm_area_struct * vma, * prev;
 443         int error;
 444
 445         len = PAGE_ALIGN(len);
 446         end = start + len;
 447         if (end < start)
 448                 return -EINVAL;
 449         if (end == start)
 450                 return 0;
 451         vma = find_vma_prev(current->mm, start, &prev);
 452         if (!vma || vma->vm_start > start)
 453                 return -ENOMEM;
 454
 455         if (start > vma->vm_start)
 456                 prev = vma;
 457
 458         for (nstart = start ; ; ) {
 459                 unsigned int newflags;
 460
 461                 /* Here we know that  vma->vm_start <= nstart < vma->vm_end. */
 462
 463                 newflags = vma->vm_flags | VM_LOCKED;
 464                 if (!on)
 465                         newflags &= ~VM_LOCKED;
 466
 467                 tmp = vma->vm_end;
 468                 if (tmp > end)
 469                         tmp = end;
 470                 error = mlock_fixup(vma, &prev, nstart, tmp, newflags);
 471                 if (error)
 472                         break;
 473                 nstart = tmp;
 474                 if (nstart < prev->vm_end)
 475                         nstart = prev->vm_end;
 476                 if (nstart >= end)
 477                         break;
 478
 479                 vma = prev->vm_next;
 480                 if (!vma || vma->vm_start != nstart) {
 481                         error = -ENOMEM;
 482                         break;
 483                 }
 484         }
 485         return error;
 486 }
 487
 488 asmlinkage long sys_mlock(unsigned long start, size_t len)
 489 {
 490         unsigned long locked;
 491         unsigned long lock_limit;
 492         int error = -ENOMEM;
 493
 494         if (!can_do_mlock())
 495                 return -EPERM;
 496
 497         down_write(&current->mm->mmap_sem);
 498         len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
 499         start &= PAGE_MASK;
 500
 501         locked = len >> PAGE_SHIFT;
 502         locked += current->mm->locked_vm;
 503
 504         lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
 505         lock_limit >>= PAGE_SHIFT;
 506
 507         /* check against resource limits */
 508         if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
 509                 error = do_mlock(start, len, 1);
 510         up_write(&current->mm->mmap_sem);
 511         return error;
 512 }
 513
 514 asmlinkage long sys_munlock(unsigned long start, size_t len)
 515 {
 516         int ret;
 517
 518         down_write(&current->mm->mmap_sem);
 519         len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
 520         start &= PAGE_MASK;
 521         ret = do_mlock(start, len, 0);
 522         up_write(&current->mm->mmap_sem);
 523         return ret;
 524 }
 525
 526 static int do_mlockall(int flags)
 527 {
 528         struct vm_area_struct * vma, * prev = NULL;
 529         unsigned int def_flags = 0;
 530
 531         if (flags & MCL_FUTURE)
 532                 def_flags = VM_LOCKED;
 533         current->mm->def_flags = def_flags;
 534         if (flags == MCL_FUTURE)
 535                 goto out;
 536
 537         for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
 538                 unsigned int newflags;
 539
 540                 newflags = vma->vm_flags | VM_LOCKED;
 541                 if (!(flags & MCL_CURRENT))
 542                         newflags &= ~VM_LOCKED;
 543
 544                 /* Ignore errors */
 545                 mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
 546         }
 547 out:
 548         return 0;
 549 }
 550
 551 asmlinkage long sys_mlockall(int flags)
 552 {
 553         unsigned long lock_limit;
 554         int ret = -EINVAL;
 555
 556         if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE)))
 557                 goto out;
 558
 559         ret = -EPERM;
 560         if (!can_do_mlock())
 561                 goto out;
 562
 563         down_write(&current->mm->mmap_sem);
 564
 565         lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
 566         lock_limit >>= PAGE_SHIFT;
 567
 568         ret = -ENOMEM;
 569         if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
 570             capable(CAP_IPC_LOCK))
 571                 ret = do_mlockall(flags);
 572         up_write(&current->mm->mmap_sem);
 573 out:
 574         return ret;
 575 }
 576
 577 asmlinkage long sys_munlockall(void)
 578 {
 579         int ret;
 580
 581         down_write(&current->mm->mmap_sem);
 582         ret = do_mlockall(0);
 583         up_write(&current->mm->mmap_sem);
 584         return ret;
 585 }
 586
 587 /*
 588  * Objects with different lifetime than processes (SHM_LOCK and SHM_HUGETLB
 589  * shm segments) get accounted against the user_struct instead.
 590  */
 591 static DEFINE_SPINLOCK(shmlock_user_lock);
 592
 593 int user_shm_lock(size_t size, struct user_struct *user)
 594 {
 595         unsigned long lock_limit, locked;
 596         int allowed = 0;
 597
 598         locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
 599         lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
 600         if (lock_limit == RLIM_INFINITY)
 601                 allowed = 1;
 602         lock_limit >>= PAGE_SHIFT;
 603         spin_lock(&shmlock_user_lock);
 604         if (!allowed &&
 605             locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK))
 606                 goto out;
 607         get_uid(user);
 608         user->locked_shm += locked;
 609         allowed = 1;
 610 out:
 611         spin_unlock(&shmlock_user_lock);
 612         return allowed;
 613 }
 614
 615 void user_shm_unlock(size_t size, struct user_struct *user)
 616 {
 617         spin_lock(&shmlock_user_lock);
 618         user->locked_shm -= (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
 619         spin_unlock(&shmlock_user_lock);
 620         free_uid(user);
 621 }