mm/madvise.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  *      linux/mm/madvise.c
   4  *
   5  * Copyright (C) 1999  Linus Torvalds
   6  * Copyright (C) 2002  Christoph Hellwig
   7  */
   8
   9 #include <linux/mman.h>
  10 #include <linux/pagemap.h>
  11 #include <linux/syscalls.h>
  12 #include <linux/mempolicy.h>
  13 #include <linux/page-isolation.h>
  14 #include <linux/page_idle.h>
  15 #include <linux/userfaultfd_k.h>
  16 #include <linux/hugetlb.h>
  17 #include <linux/falloc.h>
  18 #include <linux/fadvise.h>
  19 #include <linux/sched.h>
  20 #include <linux/sched/mm.h>
  21 #include <linux/mm_inline.h>
  22 #include <linux/string.h>
  23 #include <linux/uio.h>
  24 #include <linux/ksm.h>
  25 #include <linux/fs.h>
  26 #include <linux/file.h>
  27 #include <linux/blkdev.h>
  28 #include <linux/backing-dev.h>
  29 #include <linux/pagewalk.h>
  30 #include <linux/swap.h>
  31 #include <linux/swapops.h>
  32 #include <linux/shmem_fs.h>
  33 #include <linux/mmu_notifier.h>
  34
  35 #include <asm/tlb.h>
  36
  37 #include "internal.h"
  38 #include "swap.h"
  39
/*
 * Private state passed through mm_walk->private to the MADV_COLD /
 * MADV_PAGEOUT page-table walker (madvise_cold_or_pageout_pte_range()).
 */
struct madvise_walk_private {
        struct mmu_gather *tlb;         /* TLB gather batch used by the walk */
        bool pageout;                   /* true: isolate and reclaim; false: only deactivate */
};
  44
  45 /*
  46  * Any behaviour which results in changes to the vma->vm_flags needs to
  47  * take mmap_lock for writing. Others, which simply traverse vmas, need
  48  * to only take it for reading.
  49  */
  50 static int madvise_need_mmap_write(int behavior)
  51 {
  52         switch (behavior) {
  53         case MADV_REMOVE:
  54         case MADV_WILLNEED:
  55         case MADV_DONTNEED:
  56         case MADV_DONTNEED_LOCKED:
  57         case MADV_COLD:
  58         case MADV_PAGEOUT:
  59         case MADV_FREE:
  60         case MADV_POPULATE_READ:
  61         case MADV_POPULATE_WRITE:
  62                 return 0;
  63         default:
  64                 /* be safe, default to 1. list exceptions explicitly */
  65                 return 1;
  66         }
  67 }
  68
  69 #ifdef CONFIG_ANON_VMA_NAME
  70 struct anon_vma_name *anon_vma_name_alloc(const char *name)
  71 {
  72         struct anon_vma_name *anon_name;
  73         size_t count;
  74
  75         /* Add 1 for NUL terminator at the end of the anon_name->name */
  76         count = strlen(name) + 1;
  77         anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL);
  78         if (anon_name) {
  79                 kref_init(&anon_name->kref);
  80                 memcpy(anon_name->name, name, count);
  81         }
  82
  83         return anon_name;
  84 }
  85
  86 void anon_vma_name_free(struct kref *kref)
  87 {
  88         struct anon_vma_name *anon_name =
  89                         container_of(kref, struct anon_vma_name, kref);
  90         kfree(anon_name);
  91 }
  92
  93 struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
  94 {
  95         mmap_assert_locked(vma->vm_mm);
  96
  97         if (vma->vm_file)
  98                 return NULL;
  99
 100         return vma->anon_name;
 101 }
 102
 103 /* mmap_lock should be write-locked */
 104 static int replace_anon_vma_name(struct vm_area_struct *vma,
 105                                  struct anon_vma_name *anon_name)
 106 {
 107         struct anon_vma_name *orig_name = anon_vma_name(vma);
 108
 109         if (!anon_name) {
 110                 vma->anon_name = NULL;
 111                 anon_vma_name_put(orig_name);
 112                 return 0;
 113         }
 114
 115         if (anon_vma_name_eq(orig_name, anon_name))
 116                 return 0;
 117
 118         vma->anon_name = anon_vma_name_reuse(anon_name);
 119         anon_vma_name_put(orig_name);
 120
 121         return 0;
 122 }
 123 #else /* CONFIG_ANON_VMA_NAME */
 124 static int replace_anon_vma_name(struct vm_area_struct *vma,
 125                                  struct anon_vma_name *anon_name)
 126 {
 127         if (anon_name)
 128                 return -EINVAL;
 129
 130         return 0;
 131 }
 132 #endif /* CONFIG_ANON_VMA_NAME */
 133 /*
 134  * Update the vm_flags on region of a vma, splitting it or merging it as
 135  * necessary.  Must be called with mmap_sem held for writing;
 136  * Caller should ensure anon_name stability by raising its refcount even when
 137  * anon_name belongs to a valid vma because this function might free that vma.
 138  */
 139 static int madvise_update_vma(struct vm_area_struct *vma,
 140                               struct vm_area_struct **prev, unsigned long start,
 141                               unsigned long end, unsigned long new_flags,
 142                               struct anon_vma_name *anon_name)
 143 {
 144         struct mm_struct *mm = vma->vm_mm;
 145         int error;
 146         pgoff_t pgoff;
 147
 148         if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) {
 149                 *prev = vma;
 150                 return 0;
 151         }
 152
 153         pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
 154         *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
 155                           vma->vm_file, pgoff, vma_policy(vma),
 156                           vma->vm_userfaultfd_ctx, anon_name);
 157         if (*prev) {
 158                 vma = *prev;
 159                 goto success;
 160         }
 161
 162         *prev = vma;
 163
 164         if (start != vma->vm_start) {
 165                 if (unlikely(mm->map_count >= sysctl_max_map_count))
 166                         return -ENOMEM;
 167                 error = __split_vma(mm, vma, start, 1);
 168                 if (error)
 169                         return error;
 170         }
 171
 172         if (end != vma->vm_end) {
 173                 if (unlikely(mm->map_count >= sysctl_max_map_count))
 174                         return -ENOMEM;
 175                 error = __split_vma(mm, vma, end, 0);
 176                 if (error)
 177                         return error;
 178         }
 179
 180 success:
 181         /*
 182          * vm_flags is protected by the mmap_lock held in write mode.
 183          */
 184         vma->vm_flags = new_flags;
 185         if (!vma->vm_file) {
 186                 error = replace_anon_vma_name(vma, anon_name);
 187                 if (error)
 188                         return error;
 189         }
 190
 191         return 0;
 192 }
 193
 194 #ifdef CONFIG_SWAP
 195 static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
 196         unsigned long end, struct mm_walk *walk)
 197 {
 198         struct vm_area_struct *vma = walk->private;
 199         unsigned long index;
 200         struct swap_iocb *splug = NULL;
 201
 202         if (pmd_none_or_trans_huge_or_clear_bad(pmd))
 203                 return 0;
 204
 205         for (index = start; index != end; index += PAGE_SIZE) {
 206                 pte_t pte;
 207                 swp_entry_t entry;
 208                 struct page *page;
 209                 spinlock_t *ptl;
 210                 pte_t *ptep;
 211
 212                 ptep = pte_offset_map_lock(vma->vm_mm, pmd, index, &ptl);
 213                 pte = *ptep;
 214                 pte_unmap_unlock(ptep, ptl);
 215
 216                 if (!is_swap_pte(pte))
 217                         continue;
 218                 entry = pte_to_swp_entry(pte);
 219                 if (unlikely(non_swap_entry(entry)))
 220                         continue;
 221
 222                 page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
 223                                              vma, index, false, &splug);
 224                 if (page)
 225                         put_page(page);
 226         }
 227         swap_read_unplug(splug);
 228
 229         return 0;
 230 }
 231
/*
 * Page-walk callbacks used by madvise_willneed() for vmas without a
 * backing file: every pmd is handed to swapin_walk_pmd_entry().
 */
static const struct mm_walk_ops swapin_walk_ops = {
        .pmd_entry              = swapin_walk_pmd_entry,
};
 235
 236 static void force_shm_swapin_readahead(struct vm_area_struct *vma,
 237                 unsigned long start, unsigned long end,
 238                 struct address_space *mapping)
 239 {
 240         XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
 241         pgoff_t end_index = linear_page_index(vma, end + PAGE_SIZE - 1);
 242         struct page *page;
 243         struct swap_iocb *splug = NULL;
 244
 245         rcu_read_lock();
 246         xas_for_each(&xas, page, end_index) {
 247                 swp_entry_t swap;
 248
 249                 if (!xa_is_value(page))
 250                         continue;
 251                 swap = radix_to_swp_entry(page);
 252                 /* There might be swapin error entries in shmem mapping. */
 253                 if (non_swap_entry(swap))
 254                         continue;
 255                 xas_pause(&xas);
 256                 rcu_read_unlock();
 257
 258                 page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
 259                                              NULL, 0, false, &splug);
 260                 if (page)
 261                         put_page(page);
 262
 263                 rcu_read_lock();
 264         }
 265         rcu_read_unlock();
 266         swap_read_unplug(splug);
 267
 268         lru_add_drain();        /* Push any new pages onto the LRU now */
 269 }
 270 #endif          /* CONFIG_SWAP */
 271
 272 /*
 273  * Schedule all required I/O operations.  Do not wait for completion.
 274  */
 275 static long madvise_willneed(struct vm_area_struct *vma,
 276                              struct vm_area_struct **prev,
 277                              unsigned long start, unsigned long end)
 278 {
 279         struct mm_struct *mm = vma->vm_mm;
 280         struct file *file = vma->vm_file;
 281         loff_t offset;
 282
 283         *prev = vma;
 284 #ifdef CONFIG_SWAP
 285         if (!file) {
 286                 walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
 287                 lru_add_drain(); /* Push any new pages onto the LRU now */
 288                 return 0;
 289         }
 290
 291         if (shmem_mapping(file->f_mapping)) {
 292                 force_shm_swapin_readahead(vma, start, end,
 293                                         file->f_mapping);
 294                 return 0;
 295         }
 296 #else
 297         if (!file)
 298                 return -EBADF;
 299 #endif
 300
 301         if (IS_DAX(file_inode(file))) {
 302                 /* no bad return value, but ignore advice */
 303                 return 0;
 304         }
 305
 306         /*
 307          * Filesystem's fadvise may need to take various locks.  We need to
 308          * explicitly grab a reference because the vma (and hence the
 309          * vma's reference to the file) can go away as soon as we drop
 310          * mmap_lock.
 311          */
 312         *prev = NULL;   /* tell sys_madvise we drop mmap_lock */
 313         get_file(file);
 314         offset = (loff_t)(start - vma->vm_start)
 315                         + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
 316         mmap_read_unlock(mm);
 317         vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
 318         fput(file);
 319         mmap_read_lock(mm);
 320         return 0;
 321 }
 322
 323 static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 324                                 unsigned long addr, unsigned long end,
 325                                 struct mm_walk *walk)
 326 {
 327         struct madvise_walk_private *private = walk->private;
 328         struct mmu_gather *tlb = private->tlb;
 329         bool pageout = private->pageout;
 330         struct mm_struct *mm = tlb->mm;
 331         struct vm_area_struct *vma = walk->vma;
 332         pte_t *orig_pte, *pte, ptent;
 333         spinlock_t *ptl;
 334         struct page *page = NULL;
 335         LIST_HEAD(page_list);
 336
 337         if (fatal_signal_pending(current))
 338                 return -EINTR;
 339
 340 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 341         if (pmd_trans_huge(*pmd)) {
 342                 pmd_t orig_pmd;
 343                 unsigned long next = pmd_addr_end(addr, end);
 344
 345                 tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
 346                 ptl = pmd_trans_huge_lock(pmd, vma);
 347                 if (!ptl)
 348                         return 0;
 349
 350                 orig_pmd = *pmd;
 351                 if (is_huge_zero_pmd(orig_pmd))
 352                         goto huge_unlock;
 353
 354                 if (unlikely(!pmd_present(orig_pmd))) {
 355                         VM_BUG_ON(thp_migration_supported() &&
 356                                         !is_pmd_migration_entry(orig_pmd));
 357                         goto huge_unlock;
 358                 }
 359
 360                 page = pmd_page(orig_pmd);
 361
 362                 /* Do not interfere with other mappings of this page */
 363                 if (page_mapcount(page) != 1)
 364                         goto huge_unlock;
 365
 366                 if (next - addr != HPAGE_PMD_SIZE) {
 367                         int err;
 368
 369                         get_page(page);
 370                         spin_unlock(ptl);
 371                         lock_page(page);
 372                         err = split_huge_page(page);
 373                         unlock_page(page);
 374                         put_page(page);
 375                         if (!err)
 376                                 goto regular_page;
 377                         return 0;
 378                 }
 379
 380                 if (pmd_young(orig_pmd)) {
 381                         pmdp_invalidate(vma, addr, pmd);
 382                         orig_pmd = pmd_mkold(orig_pmd);
 383
 384                         set_pmd_at(mm, addr, pmd, orig_pmd);
 385                         tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
 386                 }
 387
 388                 ClearPageReferenced(page);
 389                 test_and_clear_page_young(page);
 390                 if (pageout) {
 391                         if (!isolate_lru_page(page)) {
 392                                 if (PageUnevictable(page))
 393                                         putback_lru_page(page);
 394                                 else
 395                                         list_add(&page->lru, &page_list);
 396                         }
 397                 } else
 398                         deactivate_page(page);
 399 huge_unlock:
 400                 spin_unlock(ptl);
 401                 if (pageout)
 402                         reclaim_pages(&page_list);
 403                 return 0;
 404         }
 405
 406 regular_page:
 407         if (pmd_trans_unstable(pmd))
 408                 return 0;
 409 #endif
 410         tlb_change_page_size(tlb, PAGE_SIZE);
 411         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 412         flush_tlb_batched_pending(mm);
 413         arch_enter_lazy_mmu_mode();
 414         for (; addr < end; pte++, addr += PAGE_SIZE) {
 415                 ptent = *pte;
 416
 417                 if (pte_none(ptent))
 418                         continue;
 419
 420                 if (!pte_present(ptent))
 421                         continue;
 422
 423                 page = vm_normal_page(vma, addr, ptent);
 424                 if (!page || is_zone_device_page(page))
 425                         continue;
 426
 427                 /*
 428                  * Creating a THP page is expensive so split it only if we
 429                  * are sure it's worth. Split it if we are only owner.
 430                  */
 431                 if (PageTransCompound(page)) {
 432                         if (page_mapcount(page) != 1)
 433                                 break;
 434                         get_page(page);
 435                         if (!trylock_page(page)) {
 436                                 put_page(page);
 437                                 break;
 438                         }
 439                         pte_unmap_unlock(orig_pte, ptl);
 440                         if (split_huge_page(page)) {
 441                                 unlock_page(page);
 442                                 put_page(page);
 443                                 orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 444                                 break;
 445                         }
 446                         unlock_page(page);
 447                         put_page(page);
 448                         orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 449                         pte--;
 450                         addr -= PAGE_SIZE;
 451                         continue;
 452                 }
 453
 454                 /*
 455                  * Do not interfere with other mappings of this page and
 456                  * non-LRU page.
 457                  */
 458                 if (!PageLRU(page) || page_mapcount(page) != 1)
 459                         continue;
 460
 461                 VM_BUG_ON_PAGE(PageTransCompound(page), page);
 462
 463                 if (pte_young(ptent)) {
 464                         ptent = ptep_get_and_clear_full(mm, addr, pte,
 465                                                         tlb->fullmm);
 466                         ptent = pte_mkold(ptent);
 467                         set_pte_at(mm, addr, pte, ptent);
 468                         tlb_remove_tlb_entry(tlb, pte, addr);
 469                 }
 470
 471                 /*
 472                  * We are deactivating a page for accelerating reclaiming.
 473                  * VM couldn't reclaim the page unless we clear PG_young.
 474                  * As a side effect, it makes confuse idle-page tracking
 475                  * because they will miss recent referenced history.
 476                  */
 477                 ClearPageReferenced(page);
 478                 test_and_clear_page_young(page);
 479                 if (pageout) {
 480                         if (!isolate_lru_page(page)) {
 481                                 if (PageUnevictable(page))
 482                                         putback_lru_page(page);
 483                                 else
 484                                         list_add(&page->lru, &page_list);
 485                         }
 486                 } else
 487                         deactivate_page(page);
 488         }
 489
 490         arch_leave_lazy_mmu_mode();
 491         pte_unmap_unlock(orig_pte, ptl);
 492         if (pageout)
 493                 reclaim_pages(&page_list);
 494         cond_resched();
 495
 496         return 0;
 497 }
 498
/*
 * Page-walk callbacks shared by madvise_cold_page_range() and
 * madvise_pageout_page_range(); the two differ only in the
 * madvise_walk_private they pass along.
 */
static const struct mm_walk_ops cold_walk_ops = {
        .pmd_entry = madvise_cold_or_pageout_pte_range,
};
 502
 503 static void madvise_cold_page_range(struct mmu_gather *tlb,
 504                              struct vm_area_struct *vma,
 505                              unsigned long addr, unsigned long end)
 506 {
 507         struct madvise_walk_private walk_private = {
 508                 .pageout = false,
 509                 .tlb = tlb,
 510         };
 511
 512         tlb_start_vma(tlb, vma);
 513         walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
 514         tlb_end_vma(tlb, vma);
 515 }
 516
 517 static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
 518 {
 519         return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB));
 520 }
 521
 522 static long madvise_cold(struct vm_area_struct *vma,
 523                         struct vm_area_struct **prev,
 524                         unsigned long start_addr, unsigned long end_addr)
 525 {
 526         struct mm_struct *mm = vma->vm_mm;
 527         struct mmu_gather tlb;
 528
 529         *prev = vma;
 530         if (!can_madv_lru_vma(vma))
 531                 return -EINVAL;
 532
 533         lru_add_drain();
 534         tlb_gather_mmu(&tlb, mm);
 535         madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
 536         tlb_finish_mmu(&tlb);
 537
 538         return 0;
 539 }
 540
 541 static void madvise_pageout_page_range(struct mmu_gather *tlb,
 542                              struct vm_area_struct *vma,
 543                              unsigned long addr, unsigned long end)
 544 {
 545         struct madvise_walk_private walk_private = {
 546                 .pageout = true,
 547                 .tlb = tlb,
 548         };
 549
 550         tlb_start_vma(tlb, vma);
 551         walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
 552         tlb_end_vma(tlb, vma);
 553 }
 554
 555 static inline bool can_do_pageout(struct vm_area_struct *vma)
 556 {
 557         if (vma_is_anonymous(vma))
 558                 return true;
 559         if (!vma->vm_file)
 560                 return false;
 561         /*
 562          * paging out pagecache only for non-anonymous mappings that correspond
 563          * to the files the calling process could (if tried) open for writing;
 564          * otherwise we'd be including shared non-exclusive mappings, which
 565          * opens a side channel.
 566          */
 567         return inode_owner_or_capable(&init_user_ns,
 568                                       file_inode(vma->vm_file)) ||
 569                file_permission(vma->vm_file, MAY_WRITE) == 0;
 570 }
 571
 572 static long madvise_pageout(struct vm_area_struct *vma,
 573                         struct vm_area_struct **prev,
 574                         unsigned long start_addr, unsigned long end_addr)
 575 {
 576         struct mm_struct *mm = vma->vm_mm;
 577         struct mmu_gather tlb;
 578
 579         *prev = vma;
 580         if (!can_madv_lru_vma(vma))
 581                 return -EINVAL;
 582
 583         if (!can_do_pageout(vma))
 584                 return 0;
 585
 586         lru_add_drain();
 587         tlb_gather_mmu(&tlb, mm);
 588         madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
 589         tlb_finish_mmu(&tlb);
 590
 591         return 0;
 592 }
 593
/*
 * Page-walk pmd_entry callback for MADV_FREE.
 *
 * walk->private is the mmu_gather of the operation.  For every pte in
 * [addr, end): swap entries are freed and their ptes cleared, and present
 * pages that qualify are made old and clean and handed to
 * mark_page_lazyfree().  Always returns 0.
 */
static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
                                unsigned long end, struct mm_walk *walk)

{
        struct mmu_gather *tlb = walk->private;
        struct mm_struct *mm = tlb->mm;
        struct vm_area_struct *vma = walk->vma;
        spinlock_t *ptl;
        pte_t *orig_pte, *pte, ptent;
        struct page *page;
        /* Net change to MM_SWAPENTS; only ever decremented below. */
        int nr_swap = 0;
        unsigned long next;

        next = pmd_addr_end(addr, end);
        /* Huge pmd fully handled by the THP helper: nothing left to do. */
        if (pmd_trans_huge(*pmd))
                if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
                        goto next;

        if (pmd_trans_unstable(pmd))
                return 0;

        tlb_change_page_size(tlb, PAGE_SIZE);
        orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
        flush_tlb_batched_pending(mm);
        arch_enter_lazy_mmu_mode();
        for (; addr != end; pte++, addr += PAGE_SIZE) {
                ptent = *pte;

                if (pte_none(ptent))
                        continue;
                /*
                 * If the pte has swp_entry, just clear page table to
                 * prevent swap-in which is more expensive rather than
                 * (page allocation + zeroing).
                 */
                if (!pte_present(ptent)) {
                        swp_entry_t entry;

                        entry = pte_to_swp_entry(ptent);
                        if (!non_swap_entry(entry)) {
                                nr_swap--;
                                free_swap_and_cache(entry);
                                pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
                        } else if (is_hwpoison_entry(entry) ||
                                   is_swapin_error_entry(entry)) {
                                pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
                        }
                        continue;
                }

                page = vm_normal_page(vma, addr, ptent);
                if (!page || is_zone_device_page(page))
                        continue;

                /*
                 * If pmd isn't transhuge but the page is THP and
                 * is owned by only this process, split it and
                 * deactivate all pages.
                 */
                if (PageTransCompound(page)) {
                        if (page_mapcount(page) != 1)
                                goto out;
                        get_page(page);
                        if (!trylock_page(page)) {
                                put_page(page);
                                goto out;
                        }
                        /* The split is done with the pte lock dropped. */
                        pte_unmap_unlock(orig_pte, ptl);
                        if (split_huge_page(page)) {
                                unlock_page(page);
                                put_page(page);
                                /*
                                 * Retake the lock and refresh orig_pte so
                                 * the unlock at "out" is balanced.
                                 */
                                orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
                                goto out;
                        }
                        unlock_page(page);
                        put_page(page);
                        orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
                        /*
                         * Step back one slot so that the loop increment
                         * revisits this address, now a regular page.
                         */
                        pte--;
                        addr -= PAGE_SIZE;
                        continue;
                }

                VM_BUG_ON_PAGE(PageTransCompound(page), page);

                if (PageSwapCache(page) || PageDirty(page)) {
                        /* Never wait for the page lock here; just skip. */
                        if (!trylock_page(page))
                                continue;
                        /*
                         * If page is shared with others, we couldn't clear
                         * PG_dirty of the page.
                         */
                        if (page_mapcount(page) != 1) {
                                unlock_page(page);
                                continue;
                        }

                        if (PageSwapCache(page) && !try_to_free_swap(page)) {
                                unlock_page(page);
                                continue;
                        }

                        ClearPageDirty(page);
                        unlock_page(page);
                }

                if (pte_young(ptent) || pte_dirty(ptent)) {
                        /*
                         * Some of architecture(ex, PPC) don't update TLB
                         * with set_pte_at and tlb_remove_tlb_entry so for
                         * the portability, remap the pte with old|clean
                         * after pte clearing.
                         */
                        ptent = ptep_get_and_clear_full(mm, addr, pte,
                                                        tlb->fullmm);

                        ptent = pte_mkold(ptent);
                        ptent = pte_mkclean(ptent);
                        set_pte_at(mm, addr, pte, ptent);
                        tlb_remove_tlb_entry(tlb, pte, addr);
                }
                mark_page_lazyfree(page);
        }
out:
        /* Account for the swap entries freed above. */
        if (nr_swap) {
                if (current->mm == mm)
                        sync_mm_rss(mm);

                add_mm_counter(mm, MM_SWAPENTS, nr_swap);
        }
        arch_leave_lazy_mmu_mode();
        pte_unmap_unlock(orig_pte, ptl);
        cond_resched();
next:
        return 0;
}
 729
/*
 * Page-walk callbacks used by madvise_free_single_vma(): every pmd is
 * handed to madvise_free_pte_range().
 */
static const struct mm_walk_ops madvise_free_walk_ops = {
        .pmd_entry              = madvise_free_pte_range,
};
 733
 734 static int madvise_free_single_vma(struct vm_area_struct *vma,
 735                         unsigned long start_addr, unsigned long end_addr)
 736 {
 737         struct mm_struct *mm = vma->vm_mm;
 738         struct mmu_notifier_range range;
 739         struct mmu_gather tlb;
 740
 741         /* MADV_FREE works for only anon vma at the moment */
 742         if (!vma_is_anonymous(vma))
 743                 return -EINVAL;
 744
 745         range.start = max(vma->vm_start, start_addr);
 746         if (range.start >= vma->vm_end)
 747                 return -EINVAL;
 748         range.end = min(vma->vm_end, end_addr);
 749         if (range.end <= vma->vm_start)
 750                 return -EINVAL;
 751         mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
 752                                 range.start, range.end);
 753
 754         lru_add_drain();
 755         tlb_gather_mmu(&tlb, mm);
 756         update_hiwater_rss(mm);
 757
 758         mmu_notifier_invalidate_range_start(&range);
 759         tlb_start_vma(&tlb, vma);
 760         walk_page_range(vma->vm_mm, range.start, range.end,
 761                         &madvise_free_walk_ops, &tlb);
 762         tlb_end_vma(&tlb, vma);
 763         mmu_notifier_invalidate_range_end(&range);
 764         tlb_finish_mmu(&tlb);
 765
 766         return 0;
 767 }
 768
 769 /*
 770  * Application no longer needs these pages.  If the pages are dirty,
 771  * it's OK to just throw them away.  The app will be more careful about
 772  * data it wants to keep.  Be sure to free swap resources too.  The
 773  * zap_page_range call sets things up for shrink_active_list to actually free
 774  * these pages later if no one else has touched them in the meantime,
 775  * although we could add these pages to a global reuse list for
 776  * shrink_active_list to pick up before reclaiming other pages.
 777  *
 778  * NB: This interface discards data rather than pushes it out to swap,
 779  * as some implementations do.  This has performance implications for
 780  * applications like large transactional databases which want to discard
 781  * pages in anonymous maps after committing to backing store the data
 782  * that was kept in them.  There is no reason to write this data out to
 783  * the swap area if the application is discarding it.
 784  *
 785  * An interface that causes the system to free clean pages and flush
 786  * dirty pages is already available as msync(MS_INVALIDATE).
 787  */
 788 static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
 789                                         unsigned long start, unsigned long end)
 790 {
 791         zap_page_range(vma, start, end - start);
 792         return 0;
 793 }
 794
 795 static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
 796                                             unsigned long start,
 797                                             unsigned long *end,
 798                                             int behavior)
 799 {
 800         if (!is_vm_hugetlb_page(vma)) {
 801                 unsigned int forbidden = VM_PFNMAP;
 802
 803                 if (behavior != MADV_DONTNEED_LOCKED)
 804                         forbidden |= VM_LOCKED;
 805
 806                 return !(vma->vm_flags & forbidden);
 807         }
 808
 809         if (behavior != MADV_DONTNEED && behavior != MADV_DONTNEED_LOCKED)
 810                 return false;
 811         if (start & ~huge_page_mask(hstate_vma(vma)))
 812                 return false;
 813
 814         *end = ALIGN(*end, huge_page_size(hstate_vma(vma)));
 815         return true;
 816 }
 817
 818 static long madvise_dontneed_free(struct vm_area_struct *vma,
 819                                   struct vm_area_struct **prev,
 820                                   unsigned long start, unsigned long end,
 821                                   int behavior)
 822 {
 823         struct mm_struct *mm = vma->vm_mm;
 824
 825         *prev = vma;
 826         if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior))
 827                 return -EINVAL;
 828
 829         if (!userfaultfd_remove(vma, start, end)) {
 830                 *prev = NULL; /* mmap_lock has been dropped, prev is stale */
 831
 832                 mmap_read_lock(mm);
 833                 vma = find_vma(mm, start);
 834                 if (!vma)
 835                         return -ENOMEM;
 836                 if (start < vma->vm_start) {
 837                         /*
 838                          * This "vma" under revalidation is the one
 839                          * with the lowest vma->vm_start where start
 840                          * is also < vma->vm_end. If start <
 841                          * vma->vm_start it means an hole materialized
 842                          * in the user address space within the
 843                          * virtual range passed to MADV_DONTNEED
 844                          * or MADV_FREE.
 845                          */
 846                         return -ENOMEM;
 847                 }
 848                 /*
 849                  * Potential end adjustment for hugetlb vma is OK as
 850                  * the check below keeps end within vma.
 851                  */
 852                 if (!madvise_dontneed_free_valid_vma(vma, start, &end,
 853                                                      behavior))
 854                         return -EINVAL;
 855                 if (end > vma->vm_end) {
 856                         /*
 857                          * Don't fail if end > vma->vm_end. If the old
 858                          * vma was split while the mmap_lock was
 859                          * released the effect of the concurrent
 860                          * operation may not cause madvise() to
 861                          * have an undefined result. There may be an
 862                          * adjacent next vma that we'll walk
 863                          * next. userfaultfd_remove() will generate an
 864                          * UFFD_EVENT_REMOVE repetition on the
 865                          * end-vma->vm_end range, but the manager can
 866                          * handle a repetition fine.
 867                          */
 868                         end = vma->vm_end;
 869                 }
 870                 VM_WARN_ON(start >= end);
 871         }
 872
 873         if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED)
 874                 return madvise_dontneed_single_vma(vma, start, end);
 875         else if (behavior == MADV_FREE)
 876                 return madvise_free_single_vma(vma, start, end);
 877         else
 878                 return -EINVAL;
 879 }
 880
 881 static long madvise_populate(struct vm_area_struct *vma,
 882                              struct vm_area_struct **prev,
 883                              unsigned long start, unsigned long end,
 884                              int behavior)
 885 {
 886         const bool write = behavior == MADV_POPULATE_WRITE;
 887         struct mm_struct *mm = vma->vm_mm;
 888         unsigned long tmp_end;
 889         int locked = 1;
 890         long pages;
 891
 892         *prev = vma;
 893
 894         while (start < end) {
 895                 /*
 896                  * We might have temporarily dropped the lock. For example,
 897                  * our VMA might have been split.
 898                  */
 899                 if (!vma || start >= vma->vm_end) {
 900                         vma = vma_lookup(mm, start);
 901                         if (!vma)
 902                                 return -ENOMEM;
 903                 }
 904
 905                 tmp_end = min_t(unsigned long, end, vma->vm_end);
 906                 /* Populate (prefault) page tables readable/writable. */
 907                 pages = faultin_vma_page_range(vma, start, tmp_end, write,
 908                                                &locked);
 909                 if (!locked) {
 910                         mmap_read_lock(mm);
 911                         locked = 1;
 912                         *prev = NULL;
 913                         vma = NULL;
 914                 }
 915                 if (pages < 0) {
 916                         switch (pages) {
 917                         case -EINTR:
 918                                 return -EINTR;
 919                         case -EINVAL: /* Incompatible mappings / permissions. */
 920                                 return -EINVAL;
 921                         case -EHWPOISON:
 922                                 return -EHWPOISON;
 923                         case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */
 924                                 return -EFAULT;
 925                         default:
 926                                 pr_warn_once("%s: unhandled return value: %ld\n",
 927                                              __func__, pages);
 928                                 fallthrough;
 929                         case -ENOMEM:
 930                                 return -ENOMEM;
 931                         }
 932                 }
 933                 start += pages * PAGE_SIZE;
 934         }
 935         return 0;
 936 }
 937
 938 /*
 939  * Application wants to free up the pages and associated backing store.
 940  * This is effectively punching a hole into the middle of a file.
 941  */
 942 static long madvise_remove(struct vm_area_struct *vma,
 943                                 struct vm_area_struct **prev,
 944                                 unsigned long start, unsigned long end)
 945 {
 946         loff_t offset;
 947         int error;
 948         struct file *f;
 949         struct mm_struct *mm = vma->vm_mm;
 950
 951         *prev = NULL;   /* tell sys_madvise we drop mmap_lock */
 952
 953         if (vma->vm_flags & VM_LOCKED)
 954                 return -EINVAL;
 955
 956         f = vma->vm_file;
 957
 958         if (!f || !f->f_mapping || !f->f_mapping->host) {
 959                         return -EINVAL;
 960         }
 961
 962         if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
 963                 return -EACCES;
 964
 965         offset = (loff_t)(start - vma->vm_start)
 966                         + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
 967
 968         /*
 969          * Filesystem's fallocate may need to take i_rwsem.  We need to
 970          * explicitly grab a reference because the vma (and hence the
 971          * vma's reference to the file) can go away as soon as we drop
 972          * mmap_lock.
 973          */
 974         get_file(f);
 975         if (userfaultfd_remove(vma, start, end)) {
 976                 /* mmap_lock was not released by userfaultfd_remove() */
 977                 mmap_read_unlock(mm);
 978         }
 979         error = vfs_fallocate(f,
 980                                 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
 981                                 offset, end - start);
 982         fput(f);
 983         mmap_read_lock(mm);
 984         return error;
 985 }
 986
 987 /*
 988  * Apply an madvise behavior to a region of a vma.  madvise_update_vma
 989  * will handle splitting a vm area into separate areas, each area with its own
 990  * behavior.
 991  */
 992 static int madvise_vma_behavior(struct vm_area_struct *vma,
 993                                 struct vm_area_struct **prev,
 994                                 unsigned long start, unsigned long end,
 995                                 unsigned long behavior)
 996 {
 997         int error;
 998         struct anon_vma_name *anon_name;
 999         unsigned long new_flags = vma->vm_flags;
1000
1001         switch (behavior) {
1002         case MADV_REMOVE:
1003                 return madvise_remove(vma, prev, start, end);
1004         case MADV_WILLNEED:
1005                 return madvise_willneed(vma, prev, start, end);
1006         case MADV_COLD:
1007                 return madvise_cold(vma, prev, start, end);
1008         case MADV_PAGEOUT:
1009                 return madvise_pageout(vma, prev, start, end);
1010         case MADV_FREE:
1011         case MADV_DONTNEED:
1012         case MADV_DONTNEED_LOCKED:
1013                 return madvise_dontneed_free(vma, prev, start, end, behavior);
1014         case MADV_POPULATE_READ:
1015         case MADV_POPULATE_WRITE:
1016                 return madvise_populate(vma, prev, start, end, behavior);
1017         case MADV_NORMAL:
1018                 new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
1019                 break;
1020         case MADV_SEQUENTIAL:
1021                 new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
1022                 break;
1023         case MADV_RANDOM:
1024                 new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
1025                 break;
1026         case MADV_DONTFORK:
1027                 new_flags |= VM_DONTCOPY;
1028                 break;
1029         case MADV_DOFORK:
1030                 if (vma->vm_flags & VM_IO)
1031                         return -EINVAL;
1032                 new_flags &= ~VM_DONTCOPY;
1033                 break;
1034         case MADV_WIPEONFORK:
1035                 /* MADV_WIPEONFORK is only supported on anonymous memory. */
1036                 if (vma->vm_file || vma->vm_flags & VM_SHARED)
1037                         return -EINVAL;
1038                 new_flags |= VM_WIPEONFORK;
1039                 break;
1040         case MADV_KEEPONFORK:
1041                 new_flags &= ~VM_WIPEONFORK;
1042                 break;
1043         case MADV_DONTDUMP:
1044                 new_flags |= VM_DONTDUMP;
1045                 break;
1046         case MADV_DODUMP:
1047                 if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL)
1048                         return -EINVAL;
1049                 new_flags &= ~VM_DONTDUMP;
1050                 break;
1051         case MADV_MERGEABLE:
1052         case MADV_UNMERGEABLE:
1053                 error = ksm_madvise(vma, start, end, behavior, &new_flags);
1054                 if (error)
1055                         goto out;
1056                 break;
1057         case MADV_HUGEPAGE:
1058         case MADV_NOHUGEPAGE:
1059                 error = hugepage_madvise(vma, &new_flags, behavior);
1060                 if (error)
1061                         goto out;
1062                 break;
1063         }
1064
1065         anon_name = anon_vma_name(vma);
1066         anon_vma_name_get(anon_name);
1067         error = madvise_update_vma(vma, prev, start, end, new_flags,
1068                                    anon_name);
1069         anon_vma_name_put(anon_name);
1070
1071 out:
1072         /*
1073          * madvise() returns EAGAIN if kernel resources, such as
1074          * slab, are temporarily unavailable.
1075          */
1076         if (error == -ENOMEM)
1077                 error = -EAGAIN;
1078         return error;
1079 }
1080
1081 #ifdef CONFIG_MEMORY_FAILURE
1082 /*
1083  * Error injection support for memory error handling.
1084  */
1085 static int madvise_inject_error(int behavior,
1086                 unsigned long start, unsigned long end)
1087 {
1088         unsigned long size;
1089
1090         if (!capable(CAP_SYS_ADMIN))
1091                 return -EPERM;
1092
1093
1094         for (; start < end; start += size) {
1095                 unsigned long pfn;
1096                 struct page *page;
1097                 int ret;
1098
1099                 ret = get_user_pages_fast(start, 1, 0, &page);
1100                 if (ret != 1)
1101                         return ret;
1102                 pfn = page_to_pfn(page);
1103
1104                 /*
1105                  * When soft offlining hugepages, after migrating the page
1106                  * we dissolve it, therefore in the second loop "page" will
1107                  * no longer be a compound page.
1108                  */
1109                 size = page_size(compound_head(page));
1110
1111                 if (behavior == MADV_SOFT_OFFLINE) {
1112                         pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
1113                                  pfn, start);
1114                         ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
1115                 } else {
1116                         pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
1117                                  pfn, start);
1118                         ret = memory_failure(pfn, MF_COUNT_INCREASED | MF_SW_SIMULATED);
1119                         if (ret == -EOPNOTSUPP)
1120                                 ret = 0;
1121                 }
1122
1123                 if (ret)
1124                         return ret;
1125         }
1126
1127         return 0;
1128 }
1129 #endif
1130
1131 static bool
1132 madvise_behavior_valid(int behavior)
1133 {
1134         switch (behavior) {
1135         case MADV_DOFORK:
1136         case MADV_DONTFORK:
1137         case MADV_NORMAL:
1138         case MADV_SEQUENTIAL:
1139         case MADV_RANDOM:
1140         case MADV_REMOVE:
1141         case MADV_WILLNEED:
1142         case MADV_DONTNEED:
1143         case MADV_DONTNEED_LOCKED:
1144         case MADV_FREE:
1145         case MADV_COLD:
1146         case MADV_PAGEOUT:
1147         case MADV_POPULATE_READ:
1148         case MADV_POPULATE_WRITE:
1149 #ifdef CONFIG_KSM
1150         case MADV_MERGEABLE:
1151         case MADV_UNMERGEABLE:
1152 #endif
1153 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1154         case MADV_HUGEPAGE:
1155         case MADV_NOHUGEPAGE:
1156 #endif
1157         case MADV_DONTDUMP:
1158         case MADV_DODUMP:
1159         case MADV_WIPEONFORK:
1160         case MADV_KEEPONFORK:
1161 #ifdef CONFIG_MEMORY_FAILURE
1162         case MADV_SOFT_OFFLINE:
1163         case MADV_HWPOISON:
1164 #endif
1165                 return true;
1166
1167         default:
1168                 return false;
1169         }
1170 }
1171
1172 static bool
1173 process_madvise_behavior_valid(int behavior)
1174 {
1175         switch (behavior) {
1176         case MADV_COLD:
1177         case MADV_PAGEOUT:
1178         case MADV_WILLNEED:
1179                 return true;
1180         default:
1181                 return false;
1182         }
1183 }
1184
1185 /*
1186  * Walk the vmas in range [start,end), and call the visit function on each one.
1187  * The visit function will get start and end parameters that cover the overlap
1188  * between the current vma and the original range.  Any unmapped regions in the
1189  * original range will result in this function returning -ENOMEM while still
1190  * calling the visit function on all of the existing vmas in the range.
1191  * Must be called with the mmap_lock held for reading or writing.
1192  */
1193 static
1194 int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
1195                       unsigned long end, unsigned long arg,
1196                       int (*visit)(struct vm_area_struct *vma,
1197                                    struct vm_area_struct **prev, unsigned long start,
1198                                    unsigned long end, unsigned long arg))
1199 {
1200         struct vm_area_struct *vma;
1201         struct vm_area_struct *prev;
1202         unsigned long tmp;
1203         int unmapped_error = 0;
1204
1205         /*
1206          * If the interval [start,end) covers some unmapped address
1207          * ranges, just ignore them, but return -ENOMEM at the end.
1208          * - different from the way of handling in mlock etc.
1209          */
1210         vma = find_vma_prev(mm, start, &prev);
1211         if (vma && start > vma->vm_start)
1212                 prev = vma;
1213
1214         for (;;) {
1215                 int error;
1216
1217                 /* Still start < end. */
1218                 if (!vma)
1219                         return -ENOMEM;
1220
1221                 /* Here start < (end|vma->vm_end). */
1222                 if (start < vma->vm_start) {
1223                         unmapped_error = -ENOMEM;
1224                         start = vma->vm_start;
1225                         if (start >= end)
1226                                 break;
1227                 }
1228
1229                 /* Here vma->vm_start <= start < (end|vma->vm_end) */
1230                 tmp = vma->vm_end;
1231                 if (end < tmp)
1232                         tmp = end;
1233
1234                 /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
1235                 error = visit(vma, &prev, start, tmp, arg);
1236                 if (error)
1237                         return error;
1238                 start = tmp;
1239                 if (prev && start < prev->vm_end)
1240                         start = prev->vm_end;
1241                 if (start >= end)
1242                         break;
1243                 if (prev)
1244                         vma = prev->vm_next;
1245                 else    /* madvise_remove dropped mmap_lock */
1246                         vma = find_vma(mm, start);
1247         }
1248
1249         return unmapped_error;
1250 }
1251
1252 #ifdef CONFIG_ANON_VMA_NAME
1253 static int madvise_vma_anon_name(struct vm_area_struct *vma,
1254                                  struct vm_area_struct **prev,
1255                                  unsigned long start, unsigned long end,
1256                                  unsigned long anon_name)
1257 {
1258         int error;
1259
1260         /* Only anonymous mappings can be named */
1261         if (vma->vm_file)
1262                 return -EBADF;
1263
1264         error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
1265                                    (struct anon_vma_name *)anon_name);
1266
1267         /*
1268          * madvise() returns EAGAIN if kernel resources, such as
1269          * slab, are temporarily unavailable.
1270          */
1271         if (error == -ENOMEM)
1272                 error = -EAGAIN;
1273         return error;
1274 }
1275
1276 int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
1277                           unsigned long len_in, struct anon_vma_name *anon_name)
1278 {
1279         unsigned long end;
1280         unsigned long len;
1281
1282         if (start & ~PAGE_MASK)
1283                 return -EINVAL;
1284         len = (len_in + ~PAGE_MASK) & PAGE_MASK;
1285
1286         /* Check to see whether len was rounded up from small -ve to zero */
1287         if (len_in && !len)
1288                 return -EINVAL;
1289
1290         end = start + len;
1291         if (end < start)
1292                 return -EINVAL;
1293
1294         if (end == start)
1295                 return 0;
1296
1297         return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name,
1298                                  madvise_vma_anon_name);
1299 }
1300 #endif /* CONFIG_ANON_VMA_NAME */
1301 /*
1302  * The madvise(2) system call.
1303  *
1304  * Applications can use madvise() to advise the kernel how it should
1305  * handle paging I/O in this VM area.  The idea is to help the kernel
1306  * use appropriate read-ahead and caching techniques.  The information
1307  * provided is advisory only, and can be safely disregarded by the
1308  * kernel without affecting the correct operation of the application.
1309  *
1310  * behavior values:
1311  *  MADV_NORMAL - the default behavior is to read clusters.  This
1312  *              results in some read-ahead and read-behind.
1313  *  MADV_RANDOM - the system should read the minimum amount of data
1314  *              on any access, since it is unlikely that the appli-
1315  *              cation will need more than what it asks for.
1316  *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
1317  *              once, so they can be aggressively read ahead, and
1318  *              can be freed soon after they are accessed.
1319  *  MADV_WILLNEED - the application is notifying the system to read
1320  *              some pages ahead.
1321  *  MADV_DONTNEED - the application is finished with the given range,
1322  *              so the kernel can free resources associated with it.
1323  *  MADV_FREE - the application marks pages in the given range as lazy free,
1324  *              where actual purges are postponed until memory pressure happens.
1325  *  MADV_REMOVE - the application wants to free up the given range of
1326  *              pages and associated backing store.
1327  *  MADV_DONTFORK - omit this area from child's address space when forking:
1328  *              typically, to avoid COWing pages pinned by get_user_pages().
1329  *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
1330  *  MADV_WIPEONFORK - present the child process with zero-filled memory in this
1331  *              range after a fork.
1332  *  MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK
1333  *  MADV_HWPOISON - trigger memory error handler as if the given memory range
1334  *              were corrupted by unrecoverable hardware memory failure.
1335  *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
1336  *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
1337  *              this area with pages of identical content from other such areas.
1338  *  MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
1339  *  MADV_HUGEPAGE - the application wants to back the given range by transparent
1340  *              huge pages in the future. Existing pages might be coalesced and
1341  *              new pages might be allocated as THP.
1342  *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
1343  *              transparent huge pages so the existing pages will not be
1344  *              coalesced into THP and new pages will not be allocated as THP.
1345  *  MADV_DONTDUMP - the application wants to prevent pages in the given range
1346  *              from being included in its core dump.
1347  *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
1348  *  MADV_COLD - the application is not expected to use this memory soon,
1349  *              deactivate pages in this range so that they can be reclaimed
1350  *              easily if memory pressure happens.
1351  *  MADV_PAGEOUT - the application is not expected to use this memory soon,
1352  *              page out the pages in this range immediately.
1353  *  MADV_POPULATE_READ - populate (prefault) page tables readable by
1354  *              triggering read faults if required
1355  *  MADV_POPULATE_WRITE - populate (prefault) page tables writable by
1356  *              triggering write faults if required
1357  *
1358  * return values:
1359  *  zero    - success
1360  *  -EINVAL - start + len < 0, start is not page-aligned,
1361  *              "behavior" is not a valid value, or application
1362  *              is attempting to release locked or shared pages,
1363  *              or the specified address range includes file, Huge TLB,
1364  *              MAP_SHARED or VMPFNMAP range.
1365  *  -ENOMEM - addresses in the specified range are not currently
1366  *              mapped, or are outside the AS of the process.
1367  *  -EIO    - an I/O error occurred while paging in data.
1368  *  -EBADF  - map exists, but area maps something that isn't a file.
1369  *  -EAGAIN - a kernel resource was temporarily unavailable.
1370  */
1371 int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
1372 {
1373         unsigned long end;
1374         int error;
1375         int write;
1376         size_t len;
1377         struct blk_plug plug;
1378
1379         start = untagged_addr(start);
1380
1381         if (!madvise_behavior_valid(behavior))
1382                 return -EINVAL;
1383
1384         if (!PAGE_ALIGNED(start))
1385                 return -EINVAL;
1386         len = PAGE_ALIGN(len_in);
1387
1388         /* Check to see whether len was rounded up from small -ve to zero */
1389         if (len_in && !len)
1390                 return -EINVAL;
1391
1392         end = start + len;
1393         if (end < start)
1394                 return -EINVAL;
1395
1396         if (end == start)
1397                 return 0;
1398
1399 #ifdef CONFIG_MEMORY_FAILURE
1400         if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
1401                 return madvise_inject_error(behavior, start, start + len_in);
1402 #endif
1403
1404         write = madvise_need_mmap_write(behavior);
1405         if (write) {
1406                 if (mmap_write_lock_killable(mm))
1407                         return -EINTR;
1408         } else {
1409                 mmap_read_lock(mm);
1410         }
1411
1412         blk_start_plug(&plug);
1413         error = madvise_walk_vmas(mm, start, end, behavior,
1414                         madvise_vma_behavior);
1415         blk_finish_plug(&plug);
1416         if (write)
1417                 mmap_write_unlock(mm);
1418         else
1419                 mmap_read_unlock(mm);
1420
1421         return error;
1422 }
1423
1424 SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
1425 {
1426         return do_madvise(current->mm, start, len_in, behavior);
1427 }
1428
1429 SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
1430                 size_t, vlen, int, behavior, unsigned int, flags)
1431 {
1432         ssize_t ret;
1433         struct iovec iovstack[UIO_FASTIOV], iovec;
1434         struct iovec *iov = iovstack;
1435         struct iov_iter iter;
1436         struct task_struct *task;
1437         struct mm_struct *mm;
1438         size_t total_len;
1439         unsigned int f_flags;
1440
1441         if (flags != 0) {
1442                 ret = -EINVAL;
1443                 goto out;
1444         }
1445
1446         ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
1447         if (ret < 0)
1448                 goto out;
1449
1450         task = pidfd_get_task(pidfd, &f_flags);
1451         if (IS_ERR(task)) {
1452                 ret = PTR_ERR(task);
1453                 goto free_iov;
1454         }
1455
1456         if (!process_madvise_behavior_valid(behavior)) {
1457                 ret = -EINVAL;
1458                 goto release_task;
1459         }
1460
1461         /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
1462         mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
1463         if (IS_ERR_OR_NULL(mm)) {
1464                 ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
1465                 goto release_task;
1466         }
1467
1468         /*
1469          * Require CAP_SYS_NICE for influencing process performance. Note that
1470          * only non-destructive hints are currently supported.
1471          */
1472         if (!capable(CAP_SYS_NICE)) {
1473                 ret = -EPERM;
1474                 goto release_mm;
1475         }
1476
1477         total_len = iov_iter_count(&iter);
1478
1479         while (iov_iter_count(&iter)) {
1480                 iovec = iov_iter_iovec(&iter);
1481                 ret = do_madvise(mm, (unsigned long)iovec.iov_base,
1482                                         iovec.iov_len, behavior);
1483                 if (ret < 0)
1484                         break;
1485                 iov_iter_advance(&iter, iovec.iov_len);
1486         }
1487
1488         ret = (total_len - iov_iter_count(&iter)) ? : ret;
1489
1490 release_mm:
1491         mmput(mm);
1492 release_task:
1493         put_task_struct(task);
1494 free_iov:
1495         kfree(iov);
1496 out:
1497         return ret;
1498 }