mm/pagewalk.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 #include <linux/pagewalk.h>
   3 #include <linux/highmem.h>
   4 #include <linux/sched.h>
   5 #include <linux/hugetlb.h>
   6
   7 /*
   8  * We want to know the real level where a entry is located ignoring any
   9  * folding of levels which may be happening. For example if p4d is folded then
  10  * a missing entry found at level 1 (p4d) is actually at level 0 (pgd).
  11  */
  12 static int real_depth(int depth)
  13 {
  14         if (depth == 3 && PTRS_PER_PMD == 1)
  15                 depth = 2;
  16         if (depth == 2 && PTRS_PER_PUD == 1)
  17                 depth = 1;
  18         if (depth == 1 && PTRS_PER_P4D == 1)
  19                 depth = 0;
  20         return depth;
  21 }
  22
  23 static int walk_pte_range_inner(pte_t *pte, unsigned long addr,
  24                                 unsigned long end, struct mm_walk *walk)
  25 {
  26         const struct mm_walk_ops *ops = walk->ops;
  27         int err = 0;
  28
  29         for (;;) {
  30                 err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
  31                 if (err)
  32                        break;
  33                 if (addr >= end - PAGE_SIZE)
  34                         break;
  35                 addr += PAGE_SIZE;
  36                 pte++;
  37         }
  38         return err;
  39 }
  40
  41 static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
  42                           struct mm_walk *walk)
  43 {
  44         pte_t *pte;
  45         int err = 0;
  46         spinlock_t *ptl;
  47
  48         if (walk->no_vma) {
  49                 /*
  50                  * pte_offset_map() might apply user-specific validation.
  51                  */
  52                 if (walk->mm == &init_mm)
  53                         pte = pte_offset_kernel(pmd, addr);
  54                 else
  55                         pte = pte_offset_map(pmd, addr);
  56                 if (pte) {
  57                         err = walk_pte_range_inner(pte, addr, end, walk);
  58                         if (walk->mm != &init_mm)
  59                                 pte_unmap(pte);
  60                 }
  61         } else {
  62                 pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
  63                 if (pte) {
  64                         err = walk_pte_range_inner(pte, addr, end, walk);
  65                         pte_unmap_unlock(pte, ptl);
  66                 }
  67         }
  68         if (!pte)
  69                 walk->action = ACTION_AGAIN;
  70         return err;
  71 }
  72
  73 #ifdef CONFIG_ARCH_HAS_HUGEPD
  74 static int walk_hugepd_range(hugepd_t *phpd, unsigned long addr,
  75                              unsigned long end, struct mm_walk *walk, int pdshift)
  76 {
  77         int err = 0;
  78         const struct mm_walk_ops *ops = walk->ops;
  79         int shift = hugepd_shift(*phpd);
  80         int page_size = 1 << shift;
  81
  82         if (!ops->pte_entry)
  83                 return 0;
  84
  85         if (addr & (page_size - 1))
  86                 return 0;
  87
  88         for (;;) {
  89                 pte_t *pte;
  90
  91                 spin_lock(&walk->mm->page_table_lock);
  92                 pte = hugepte_offset(*phpd, addr, pdshift);
  93                 err = ops->pte_entry(pte, addr, addr + page_size, walk);
  94                 spin_unlock(&walk->mm->page_table_lock);
  95
  96                 if (err)
  97                         break;
  98                 if (addr >= end - page_size)
  99                         break;
 100                 addr += page_size;
 101         }
 102         return err;
 103 }
 104 #else
/*
 * Stub used when CONFIG_ARCH_HAS_HUGEPD is not set: there is nothing to
 * walk, so report success without invoking any callback.
 */
static int walk_hugepd_range(hugepd_t *phpd, unsigned long addr,
                             unsigned long end, struct mm_walk *walk, int pdshift)
{
        return 0;
}
 110 #endif
 111
/*
 * Walk the PMD entries below @pud that cover [@addr, @end).
 *
 * For every entry:
 *  - an empty entry (pmd_none()) is reported through ->pte_hole(), if set,
 *    with the folding-corrected depth of the pmd level;
 *  - otherwise ->pmd_entry(), if set, is called with walk->action preset to
 *    ACTION_SUBTREE. The callback may change walk->action: ACTION_AGAIN
 *    re-examines the same entry, ACTION_CONTINUE skips the levels below it;
 *  - the walk descends to the PTE level (or into a hugepd) only when a
 *    ->pte_entry() callback exists and the entry was not skipped.
 *
 * Returns the first non-zero value produced by a callback or a lower-level
 * walker, or 0 when the whole range was processed.
 */
static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        pmd_t *pmd;
        unsigned long next;
        const struct mm_walk_ops *ops = walk->ops;
        int err = 0;
        int depth = real_depth(3);

        pmd = pmd_offset(pud, addr);
        do {
again:
                next = pmd_addr_end(addr, end);
                if (pmd_none(*pmd)) {
                        if (ops->pte_hole)
                                err = ops->pte_hole(addr, next, depth, walk);
                        if (err)
                                break;
                        continue;
                }

                /* Default action; ->pmd_entry() may override it. */
                walk->action = ACTION_SUBTREE;

                /*
                 * This implies that each ->pmd_entry() handler
                 * needs to know about pmd_trans_huge() pmds
                 */
                if (ops->pmd_entry)
                        err = ops->pmd_entry(pmd, addr, next, walk);
                if (err)
                        break;

                /* The callback asked for this same entry to be re-examined. */
                if (walk->action == ACTION_AGAIN)
                        goto again;

                /*
                 * Check this here so we only break down trans_huge
                 * pages when we _need_ to
                 */
                if ((!walk->vma && (pmd_leaf(*pmd) || !pmd_present(*pmd))) ||
                    walk->action == ACTION_CONTINUE ||
                    !(ops->pte_entry))
                        continue;

                if (walk->vma)
                        split_huge_pmd(walk->vma, pmd, addr);

                if (is_hugepd(__hugepd(pmd_val(*pmd))))
                        err = walk_hugepd_range((hugepd_t *)pmd, addr, next, walk, PMD_SHIFT);
                else
                        err = walk_pte_range(pmd, addr, next, walk);
                if (err)
                        break;

                /* walk_pte_range() sets ACTION_AGAIN when it got no PTE table. */
                if (walk->action == ACTION_AGAIN)
                        goto again;

        } while (pmd++, addr = next, addr != end);

        return err;
}
 173
/*
 * Walk the PUD entries below @p4d that cover [@addr, @end).
 *
 * For every entry:
 *  - an empty entry (pud_none()) is reported through ->pte_hole(), if set,
 *    with the folding-corrected depth of the pud level;
 *  - otherwise ->pud_entry(), if set, is called with walk->action preset to
 *    ACTION_SUBTREE. The callback may change walk->action: ACTION_AGAIN
 *    re-examines the same entry, ACTION_CONTINUE skips the levels below it;
 *  - the walk descends to the PMD level (or into a hugepd) only when a
 *    ->pmd_entry() or ->pte_entry() callback exists and the entry was not
 *    skipped.
 *
 * Returns the first non-zero value produced by a callback or a lower-level
 * walker, or 0 when the whole range was processed.
 */
static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        pud_t *pud;
        unsigned long next;
        const struct mm_walk_ops *ops = walk->ops;
        int err = 0;
        int depth = real_depth(2);

        pud = pud_offset(p4d, addr);
        do {
 again:
                next = pud_addr_end(addr, end);
                if (pud_none(*pud)) {
                        if (ops->pte_hole)
                                err = ops->pte_hole(addr, next, depth, walk);
                        if (err)
                                break;
                        continue;
                }

                /* Default action; ->pud_entry() may override it. */
                walk->action = ACTION_SUBTREE;

                if (ops->pud_entry)
                        err = ops->pud_entry(pud, addr, next, walk);
                if (err)
                        break;

                /* The callback asked for this same entry to be re-examined. */
                if (walk->action == ACTION_AGAIN)
                        goto again;

                if ((!walk->vma && (pud_leaf(*pud) || !pud_present(*pud))) ||
                    walk->action == ACTION_CONTINUE ||
                    !(ops->pmd_entry || ops->pte_entry))
                        continue;

                if (walk->vma)
                        split_huge_pud(walk->vma, pud, addr);
                /* Re-check: the entry may be empty now; start over on it. */
                if (pud_none(*pud))
                        goto again;

                if (is_hugepd(__hugepd(pud_val(*pud))))
                        err = walk_hugepd_range((hugepd_t *)pud, addr, next, walk, PUD_SHIFT);
                else
                        err = walk_pmd_range(pud, addr, next, walk);
                if (err)
                        break;
        } while (pud++, addr = next, addr != end);

        return err;
}
 225
 226 static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
 227                           struct mm_walk *walk)
 228 {
 229         p4d_t *p4d;
 230         unsigned long next;
 231         const struct mm_walk_ops *ops = walk->ops;
 232         int err = 0;
 233         int depth = real_depth(1);
 234
 235         p4d = p4d_offset(pgd, addr);
 236         do {
 237                 next = p4d_addr_end(addr, end);
 238                 if (p4d_none_or_clear_bad(p4d)) {
 239                         if (ops->pte_hole)
 240                                 err = ops->pte_hole(addr, next, depth, walk);
 241                         if (err)
 242                                 break;
 243                         continue;
 244                 }
 245                 if (ops->p4d_entry) {
 246                         err = ops->p4d_entry(p4d, addr, next, walk);
 247                         if (err)
 248                                 break;
 249                 }
 250                 if (is_hugepd(__hugepd(p4d_val(*p4d))))
 251                         err = walk_hugepd_range((hugepd_t *)p4d, addr, next, walk, P4D_SHIFT);
 252                 else if (ops->pud_entry || ops->pmd_entry || ops->pte_entry)
 253                         err = walk_pud_range(p4d, addr, next, walk);
 254                 if (err)
 255                         break;
 256         } while (p4d++, addr = next, addr != end);
 257
 258         return err;
 259 }
 260
 261 static int walk_pgd_range(unsigned long addr, unsigned long end,
 262                           struct mm_walk *walk)
 263 {
 264         pgd_t *pgd;
 265         unsigned long next;
 266         const struct mm_walk_ops *ops = walk->ops;
 267         int err = 0;
 268
 269         if (walk->pgd)
 270                 pgd = walk->pgd + pgd_index(addr);
 271         else
 272                 pgd = pgd_offset(walk->mm, addr);
 273         do {
 274                 next = pgd_addr_end(addr, end);
 275                 if (pgd_none_or_clear_bad(pgd)) {
 276                         if (ops->pte_hole)
 277                                 err = ops->pte_hole(addr, next, 0, walk);
 278                         if (err)
 279                                 break;
 280                         continue;
 281                 }
 282                 if (ops->pgd_entry) {
 283                         err = ops->pgd_entry(pgd, addr, next, walk);
 284                         if (err)
 285                                 break;
 286                 }
 287                 if (is_hugepd(__hugepd(pgd_val(*pgd))))
 288                         err = walk_hugepd_range((hugepd_t *)pgd, addr, next, walk, PGDIR_SHIFT);
 289                 else if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry || ops->pte_entry)
 290                         err = walk_p4d_range(pgd, addr, next, walk);
 291                 if (err)
 292                         break;
 293         } while (pgd++, addr = next, addr != end);
 294
 295         return err;
 296 }
 297
 298 #ifdef CONFIG_HUGETLB_PAGE
 299 static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
 300                                        unsigned long end)
 301 {
 302         unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
 303         return boundary < end ? boundary : end;
 304 }
 305
 306 static int walk_hugetlb_range(unsigned long addr, unsigned long end,
 307                               struct mm_walk *walk)
 308 {
 309         struct vm_area_struct *vma = walk->vma;
 310         struct hstate *h = hstate_vma(vma);
 311         unsigned long next;
 312         unsigned long hmask = huge_page_mask(h);
 313         unsigned long sz = huge_page_size(h);
 314         pte_t *pte;
 315         const struct mm_walk_ops *ops = walk->ops;
 316         int err = 0;
 317
 318         hugetlb_vma_lock_read(vma);
 319         do {
 320                 next = hugetlb_entry_end(h, addr, end);
 321                 pte = hugetlb_walk(vma, addr & hmask, sz);
 322                 if (pte)
 323                         err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
 324                 else if (ops->pte_hole)
 325                         err = ops->pte_hole(addr, next, -1, walk);
 326                 if (err)
 327                         break;
 328         } while (addr = next, addr != end);
 329         hugetlb_vma_unlock_read(vma);
 330
 331         return err;
 332 }
 333
 334 #else /* CONFIG_HUGETLB_PAGE */
/*
 * Stub used when CONFIG_HUGETLB_PAGE is not set: there is nothing to walk,
 * so report success without invoking any callback.
 */
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
                              struct mm_walk *walk)
{
        return 0;
}
 340
 341 #endif /* CONFIG_HUGETLB_PAGE */
 342
 343 /*
 344  * Decide whether we really walk over the current vma on [@start, @end)
 345  * or skip it via the returned value. Return 0 if we do walk over the
 346  * current vma, and return 1 if we skip the vma. Negative values means
 347  * error, where we abort the current walk.
 348  */
 349 static int walk_page_test(unsigned long start, unsigned long end,
 350                         struct mm_walk *walk)
 351 {
 352         struct vm_area_struct *vma = walk->vma;
 353         const struct mm_walk_ops *ops = walk->ops;
 354
 355         if (ops->test_walk)
 356                 return ops->test_walk(start, end, walk);
 357
 358         /*
 359          * vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP
 360          * range, so we don't walk over it as we do for normal vmas. However,
 361          * Some callers are interested in handling hole range and they don't
 362          * want to just ignore any single address range. Such users certainly
 363          * define their ->pte_hole() callbacks, so let's delegate them to handle
 364          * vma(VM_PFNMAP).
 365          */
 366         if (vma->vm_flags & VM_PFNMAP) {
 367                 int err = 1;
 368                 if (ops->pte_hole)
 369                         err = ops->pte_hole(start, end, -1, walk);
 370                 return err ? err : 1;
 371         }
 372         return 0;
 373 }
 374
 375 static int __walk_page_range(unsigned long start, unsigned long end,
 376                         struct mm_walk *walk)
 377 {
 378         int err = 0;
 379         struct vm_area_struct *vma = walk->vma;
 380         const struct mm_walk_ops *ops = walk->ops;
 381
 382         if (ops->pre_vma) {
 383                 err = ops->pre_vma(start, end, walk);
 384                 if (err)
 385                         return err;
 386         }
 387
 388         if (is_vm_hugetlb_page(vma)) {
 389                 if (ops->hugetlb_entry)
 390                         err = walk_hugetlb_range(start, end, walk);
 391         } else
 392                 err = walk_pgd_range(start, end, walk);
 393
 394         if (ops->post_vma)
 395                 ops->post_vma(walk);
 396
 397         return err;
 398 }
 399
/**
 * walk_page_range - walk page table with caller specific callbacks
 * @mm:         mm_struct representing the target process of page table walk
 * @start:      start address of the virtual address range
 * @end:        end address of the virtual address range
 * @ops:        operation to call during the walk
 * @private:    private data for callbacks' usage
 *
 * Recursively walk the page table tree of the process represented by @mm
 * within the virtual address range [@start, @end). During the walk,
 * caller-specific work can be done for each entry by setting up pmd_entry(),
 * pte_entry(), and/or hugetlb_entry(). If some of these callbacks are not
 * set up, the associated entries/pages are simply ignored.
 * The return values of these callbacks are commonly defined as follows:
 *
 *  - 0  : succeeded in handling the current entry; if the end address has
 *         not been reached yet, continue the walk.
 *  - >0 : succeeded in handling the current entry; return to the caller
 *         with a caller specific value.
 *  - <0 : failed to handle the current entry; return to the caller
 *         with an error code.
 *
 * Before starting to walk the page table, some callers want to check whether
 * they really want to walk over the current vma, typically by checking
 * its vm_flags. walk_page_test() and @ops->test_walk() are used for this
 * purpose.
 *
 * If operations need to be staged before and committed after a vma is walked,
 * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(),
 * since it is intended to handle commit-type operations, can't return any
 * errors.
 *
 * struct mm_walk keeps current values of some common data like vma and pmd,
 * which are useful to access from callbacks. If you want to pass some
 * caller-specific data to callbacks, @private should be helpful.
 *
 * Locking:
 *   Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_lock,
 *   because these functions traverse the vma list and/or access vma data.
 *
 * Return: -EINVAL if @start >= @end or @mm is NULL; otherwise 0 when the
 * whole range was walked, or the first non-zero value that stopped the walk.
 */
int walk_page_range(struct mm_struct *mm, unsigned long start,
                unsigned long end, const struct mm_walk_ops *ops,
                void *private)
{
        int err = 0;
        unsigned long next;
        struct vm_area_struct *vma;
        struct mm_walk walk = {
                .ops            = ops,
                .mm             = mm,
                .private        = private,
        };

        if (start >= end)
                return -EINVAL;

        if (!walk.mm)
                return -EINVAL;

        mmap_assert_locked(walk.mm);

        vma = find_vma(walk.mm, start);
        do {
                if (!vma) { /* after the last vma */
                        walk.vma = NULL;
                        next = end;
                        if (ops->pte_hole)
                                err = ops->pte_hole(start, next, -1, &walk);
                } else if (start < vma->vm_start) { /* outside vma */
                        walk.vma = NULL;
                        next = min(end, vma->vm_start);
                        if (ops->pte_hole)
                                err = ops->pte_hole(start, next, -1, &walk);
                } else { /* inside vma */
                        walk.vma = vma;
                        next = min(end, vma->vm_end);
                        /* Look up the following vma now, for the next iteration. */
                        vma = find_vma(mm, vma->vm_end);

                        err = walk_page_test(start, next, &walk);
                        if (err > 0) {
                                /*
                                 * positive return values are purely for
                                 * controlling the pagewalk, so should never
                                 * be passed to the callers.
                                 */
                                err = 0;
                                continue;
                        }
                        if (err < 0)
                                break;
                        err = __walk_page_range(start, next, &walk);
                }
                if (err)
                        break;
        } while (start = next, start < end);
        return err;
}
 497
 498 /**
 499  * walk_page_range_novma - walk a range of pagetables not backed by a vma
 500  * @mm:         mm_struct representing the target process of page table walk
 501  * @start:      start address of the virtual address range
 502  * @end:        end address of the virtual address range
 503  * @ops:        operation to call during the walk
 504  * @pgd:        pgd to walk if different from mm->pgd
 505  * @private:    private data for callbacks' usage
 506  *
 507  * Similar to walk_page_range() but can walk any page tables even if they are
 508  * not backed by VMAs. Because 'unusual' entries may be walked this function
 509  * will also not lock the PTEs for the pte_entry() callback. This is useful for
 510  * walking the kernel pages tables or page tables for firmware.
 511  */
 512 int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
 513                           unsigned long end, const struct mm_walk_ops *ops,
 514                           pgd_t *pgd,
 515                           void *private)
 516 {
 517         struct mm_walk walk = {
 518                 .ops            = ops,
 519                 .mm             = mm,
 520                 .pgd            = pgd,
 521                 .private        = private,
 522                 .no_vma         = true
 523         };
 524
 525         if (start >= end || !walk.mm)
 526                 return -EINVAL;
 527
 528         mmap_assert_write_locked(walk.mm);
 529
 530         return walk_pgd_range(start, end, &walk);
 531 }
 532
 533 int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
 534                         unsigned long end, const struct mm_walk_ops *ops,
 535                         void *private)
 536 {
 537         struct mm_walk walk = {
 538                 .ops            = ops,
 539                 .mm             = vma->vm_mm,
 540                 .vma            = vma,
 541                 .private        = private,
 542         };
 543
 544         if (start >= end || !walk.mm)
 545                 return -EINVAL;
 546         if (start < vma->vm_start || end > vma->vm_end)
 547                 return -EINVAL;
 548
 549         mmap_assert_locked(walk.mm);
 550         return __walk_page_range(start, end, &walk);
 551 }
 552
 553 int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
 554                 void *private)
 555 {
 556         struct mm_walk walk = {
 557                 .ops            = ops,
 558                 .mm             = vma->vm_mm,
 559                 .vma            = vma,
 560                 .private        = private,
 561         };
 562
 563         if (!walk.mm)
 564                 return -EINVAL;
 565
 566         mmap_assert_locked(walk.mm);
 567         return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
 568 }
 569
 570 /**
 571  * walk_page_mapping - walk all memory areas mapped into a struct address_space.
 572  * @mapping: Pointer to the struct address_space
 573  * @first_index: First page offset in the address_space
 574  * @nr: Number of incremental page offsets to cover
 575  * @ops:        operation to call during the walk
 576  * @private:    private data for callbacks' usage
 577  *
 578  * This function walks all memory areas mapped into a struct address_space.
 579  * The walk is limited to only the given page-size index range, but if
 580  * the index boundaries cross a huge page-table entry, that entry will be
 581  * included.
 582  *
 583  * Also see walk_page_range() for additional information.
 584  *
 585  * Locking:
 586  *   This function can't require that the struct mm_struct::mmap_lock is held,
 587  *   since @mapping may be mapped by multiple processes. Instead
 588  *   @mapping->i_mmap_rwsem must be held. This might have implications in the
 589  *   callbacks, and it's up tho the caller to ensure that the
 590  *   struct mm_struct::mmap_lock is not needed.
 591  *
 592  *   Also this means that a caller can't rely on the struct
 593  *   vm_area_struct::vm_flags to be constant across a call,
 594  *   except for immutable flags. Callers requiring this shouldn't use
 595  *   this function.
 596  *
 597  * Return: 0 on success, negative error code on failure, positive number on
 598  * caller defined premature termination.
 599  */
 600 int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
 601                       pgoff_t nr, const struct mm_walk_ops *ops,
 602                       void *private)
 603 {
 604         struct mm_walk walk = {
 605                 .ops            = ops,
 606                 .private        = private,
 607         };
 608         struct vm_area_struct *vma;
 609         pgoff_t vba, vea, cba, cea;
 610         unsigned long start_addr, end_addr;
 611         int err = 0;
 612
 613         lockdep_assert_held(&mapping->i_mmap_rwsem);
 614         vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index,
 615                                   first_index + nr - 1) {
 616                 /* Clip to the vma */
 617                 vba = vma->vm_pgoff;
 618                 vea = vba + vma_pages(vma);
 619                 cba = first_index;
 620                 cba = max(cba, vba);
 621                 cea = first_index + nr;
 622                 cea = min(cea, vea);
 623
 624                 start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start;
 625                 end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start;
 626                 if (start_addr >= end_addr)
 627                         continue;
 628
 629                 walk.vma = vma;
 630                 walk.mm = vma->vm_mm;
 631
 632                 err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
 633                 if (err > 0) {
 634                         err = 0;
 635                         break;
 636                 } else if (err < 0)
 637                         break;
 638
 639                 err = __walk_page_range(start_addr, end_addr, &walk);
 640                 if (err)
 641                         break;
 642         }
 643
 644         return err;
 645 }