mm/khugepaged: skip shmem with userfaultfd

author David Stevens <stevensd@chromium.org>

Tue, 4 Apr 2023 12:01:16 +0000 (21:01 +0900)

committer Andrew Morton <akpm@linux-foundation.org>

Tue, 18 Apr 2023 23:29:52 +0000 (16:29 -0700)
author David Stevens <stevensd@chromium.org>
Tue, 4 Apr 2023 12:01:16 +0000 (21:01 +0900)
committer Andrew Morton <akpm@linux-foundation.org>
Tue, 18 Apr 2023 23:29:52 +0000 (16:29 -0700)
diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h

index eca4c6f..6e2ef1d 100644 (file)
--- a/include/trace/events/huge_memory.h
+++ b/include/trace/events/huge_memory.h
@@ -38,7 +38,8 @@
         EM( SCAN_TRUNCATED,             "truncated")                    \
         EM( SCAN_PAGE_HAS_PRIVATE,      "page_has_private")             \
         EM( SCAN_STORE_FAILED,          "store_failed")                 \
-       EMe(SCAN_COPY_MC,               "copy_poisoned_page")
+       EM( SCAN_COPY_MC,               "copy_poisoned_page")           \
+       EMe(SCAN_PAGE_FILLED,           "page_filled")
  
  #undef EM
  #undef EMe
diff --git a/mm/khugepaged.c b/mm/khugepaged.c

index 7628775..434674c 100644 (file)
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -57,6 +57,7 @@ enum scan_result {
         SCAN_PAGE_HAS_PRIVATE,
         SCAN_STORE_FAILED,
         SCAN_COPY_MC,
+       SCAN_PAGE_FILLED,
  };
  
  #define CREATE_TRACE_POINTS
@@ -1860,8 +1861,8 @@ next:
   *  - allocate and lock a new huge page;
   *  - scan page cache replacing old pages with the new one
   *    + swap/gup in pages if necessary;
- *    + fill in gaps;
   *    + keep old pages around in case rollback is required;
+ *  - finalize updates to the page cache;
   *  - if replacing succeeds:
   *    + copy data over;
   *    + free old pages;
@@ -1939,7 +1940,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
                                                 result = SCAN_TRUNCATED;
                                                 goto xa_locked;
                                         }
-                                       xas_set(&xas, index);
+                                       xas_set(&xas, index + 1);
                                 }
                                 if (!shmem_charge(mapping->host, 1)) {
                                         result = SCAN_FAIL;
@@ -2176,22 +2177,66 @@ xa_unlocked:
                 index++;
         }
  
-       /*
-        * Copying old pages to huge one has succeeded, now we
-        * need to free the old pages.
-        */
-       list_for_each_entry_safe(page, tmp, &pagelist, lru) {
-               list_del(&page->lru);
-               page->mapping = NULL;
-               page_ref_unfreeze(page, 1);
-               ClearPageActive(page);
-               ClearPageUnevictable(page);
-               unlock_page(page);
-               put_page(page);
+       if (nr_none) {
+               struct vm_area_struct *vma;
+               int nr_none_check = 0;
+
+               i_mmap_lock_read(mapping);
+               xas_lock_irq(&xas);
+
+               xas_set(&xas, start);
+               for (index = start; index < end; index++) {
+                       if (!xas_next(&xas)) {
+                               xas_store(&xas, XA_RETRY_ENTRY);
+                               if (xas_error(&xas)) {
+                                       result = SCAN_STORE_FAILED;
+                                       goto immap_locked;
+                               }
+                               nr_none_check++;
+                       }
+               }
+
+               if (nr_none != nr_none_check) {
+                       result = SCAN_PAGE_FILLED;
+                       goto immap_locked;
+               }
+
+               /*
+                * If userspace observed a missing page in a VMA with a MODE_MISSING
+                * userfaultfd, then it might expect a UFFD_EVENT_PAGEFAULT for that
+                * page. If so, we need to roll back to avoid suppressing such an
+                * event. Since wp/minor userfaultfds don't give userspace any
+                * guarantees that the kernel doesn't fill a missing page with a zero
+                * page, they don't matter here.
+                *
+                * Any userfaultfds registered after this point will not be able to
+                * observe any missing pages due to the previously inserted retry
+                * entries.
+                */
+               vma_interval_tree_foreach(vma, &mapping->i_mmap, start, end) {
+                       if (userfaultfd_missing(vma)) {
+                               result = SCAN_EXCEED_NONE_PTE;
+                               goto immap_locked;
+                       }
+               }
+
+immap_locked:
+               i_mmap_unlock_read(mapping);
+               if (result != SCAN_SUCCEED) {
+                       xas_set(&xas, start);
+                       for (index = start; index < end; index++) {
+                               if (xas_next(&xas) == XA_RETRY_ENTRY)
+                                       xas_store(&xas, NULL);
+                       }
+
+                       xas_unlock_irq(&xas);
+                       goto rollback;
+               }
+       } else {
+               xas_lock_irq(&xas);
         }
  
         nr = thp_nr_pages(hpage);
-       xas_lock_irq(&xas);
         if (is_shmem)
                 __mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr);
         else
@@ -2221,6 +2266,20 @@ xa_unlocked:
         result = retract_page_tables(mapping, start, mm, addr, hpage,
                                      cc);
         unlock_page(hpage);
+
+       /*
+        * The collapse has succeeded, so free the old pages.
+        */
+       list_for_each_entry_safe(page, tmp, &pagelist, lru) {
+               list_del(&page->lru);
+               page->mapping = NULL;
+               page_ref_unfreeze(page, 1);
+               ClearPageActive(page);
+               ClearPageUnevictable(page);
+               unlock_page(page);
+               put_page(page);
+       }
+
         goto out;
  
  rollback:
@@ -2232,15 +2291,13 @@ rollback:
         }
  
         xas_set(&xas, start);
-       xas_for_each(&xas, page, end - 1) {
+       end = index;
+       for (index = start; index < end; index++) {
+               xas_next(&xas);
                 page = list_first_entry_or_null(&pagelist,
                                 struct page, lru);
                 if (!page || xas.xa_index < page->index) {
-                       if (!nr_none)
-                               break;
                         nr_none--;
-                       /* Put holes back where they were */
-                       xas_store(&xas, NULL);
                         continue;
                 }
  
@@ -2764,12 +2821,14 @@ static int madvise_collapse_errno(enum scan_result r)
         case SCAN_ALLOC_HUGE_PAGE_FAIL:
                 return -ENOMEM;
         case SCAN_CGROUP_CHARGE_FAIL:
+       case SCAN_EXCEED_NONE_PTE:
                 return -EBUSY;
         /* Resource temporary unavailable - trying again might succeed */
         case SCAN_PAGE_COUNT:
         case SCAN_PAGE_LOCK:
         case SCAN_PAGE_LRU:
         case SCAN_DEL_PAGE_LRU:
+       case SCAN_PAGE_FILLED:
                 return -EAGAIN;
         /*
          * Other: Trying again likely not to succeed / error intrinsic to
author	David Stevens <stevensd@chromium.org>
	Tue, 4 Apr 2023 12:01:16 +0000 (21:01 +0900)
committer	Andrew Morton <akpm@linux-foundation.org>
	Tue, 18 Apr 2023 23:29:52 +0000 (16:29 -0700)
include/trace/events/huge_memory.h		patch \| blob \| history
mm/khugepaged.c		patch \| blob \| history