Merge tag 'pull-18-rc1-work.fd' of git://git.kernel.org/pub/scm/linux/kernel/git...

[platform/kernel/linux-starfive.git] / mm / swapfile.c
diff --git a/mm/swapfile.c b/mm/swapfile.c

index 6aec1b2..a2e66d8 100644 (file)
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -45,6 +45,7 @@
  #include <asm/tlbflush.h>
  #include <linux/swapops.h>
  #include <linux/swap_cgroup.h>
+#include "swap.h"
  
  static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
                                  unsigned char);
@@ -77,9 +78,9 @@ static PLIST_HEAD(swap_active_head);
  /*
   * all available (active, not full) swap_info_structs
   * protected with swap_avail_lock, ordered by priority.
- * This is used by get_swap_page() instead of swap_active_head
+ * This is used by folio_alloc_swap() instead of swap_active_head
   * because swap_active_head includes all swap_info_structs,
- * but get_swap_page() doesn't need to look at full ones.
+ * but folio_alloc_swap() doesn't need to look at full ones.
   * This uses its own lock instead of swap_lock because when a
   * swap_info_struct changes between not-full/full, it needs to
   * add/remove itself to/from this list, but the swap_info_struct->lock
@@ -775,6 +776,22 @@ static void set_cluster_next(struct swap_info_struct *si, unsigned long next)
         this_cpu_write(*si->cluster_next_cpu, next);
  }
  
+static bool swap_offset_available_and_locked(struct swap_info_struct *si,
+                                            unsigned long offset)
+{
+       bool usable;
+
+       /* Lockless peek: a zero map entry, or cache-only once swap is full. */
+       usable = data_race(!si->swap_map[offset]);
+       if (!usable && vm_swap_full())
+               usable = READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE;
+
+       /* si->lock is taken only when the slot is reported as usable. */
+       if (usable)
+               spin_lock(&si->lock);
+       return usable;
+}
+
  static int scan_swap_map_slots(struct swap_info_struct *si,
                                unsigned char usage, int nr,
                                swp_entry_t slots[])
@@ -952,15 +969,8 @@ done:
  scan:
         spin_unlock(&si->lock);
         while (++offset <= READ_ONCE(si->highest_bit)) {
-               if (data_race(!si->swap_map[offset])) {
-                       spin_lock(&si->lock);
+               if (swap_offset_available_and_locked(si, offset))
                         goto checks;
-               }
-               if (vm_swap_full() &&
-                   READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
-                       spin_lock(&si->lock);
-                       goto checks;
-               }
                 if (unlikely(--latency_ration < 0)) {
                         cond_resched();
                         latency_ration = LATENCY_LIMIT;
@@ -969,15 +979,8 @@ scan:
         }
         offset = si->lowest_bit;
         while (offset < scan_base) {
-               if (data_race(!si->swap_map[offset])) {
-                       spin_lock(&si->lock);
-                       goto checks;
-               }
-               if (vm_swap_full() &&
-                   READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
-                       spin_lock(&si->lock);
+               if (swap_offset_available_and_locked(si, offset))
                         goto checks;
-               }
                 if (unlikely(--latency_ration < 0)) {
                         cond_resched();
                         latency_ration = LATENCY_LIMIT;
@@ -1122,7 +1125,7 @@ noswap:
         return n_ret;
  }
  
-static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
+static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
  {
         struct swap_info_struct *p;
         unsigned long offset;
@@ -1137,8 +1140,13 @@ static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
         offset = swp_offset(entry);
         if (offset >= p->max)
                 goto bad_offset;
+       if (data_race(!p->swap_map[swp_offset(entry)]))
+               goto bad_free;
         return p;
  
+bad_free:
+       pr_err("%s: %s%08lx\n", __func__, Unused_offset, entry.val);
+       goto out;
  bad_offset:
         pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val);
         goto out;
@@ -1151,23 +1159,6 @@ out:
         return NULL;
  }
  
-static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
-{
-       struct swap_info_struct *p;
-
-       p = __swap_info_get(entry);
-       if (!p)
-               goto out;
-       if (data_race(!p->swap_map[swp_offset(entry)]))
-               goto bad_free;
-       return p;
-
-bad_free:
-       pr_err("%s: %s%08lx\n", __func__, Unused_offset, entry.val);
-out:
-       return NULL;
-}
-
  static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
                                         struct swap_info_struct *q)
  {
@@ -1283,6 +1274,7 @@ bad_nofile:
  out:
         return NULL;
  put_out:
+       pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val);
         percpu_ref_put(&si->users);
         return NULL;
  }
@@ -1440,7 +1432,7 @@ void swapcache_free_entries(swp_entry_t *entries, int n)
   * This does not give an exact answer when swap count is continued,
   * but does include the high COUNT_CONTINUED flag to allow for that.
   */
-int page_swapcount(struct page *page)
+static int page_swapcount(struct page *page)
  {
         int count = 0;
         struct swap_info_struct *p;
@@ -1783,7 +1775,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
  {
         struct page *swapcache;
         spinlock_t *ptl;
-       pte_t *pte;
+       pte_t *pte, new_pte;
         int ret = 1;
  
         swapcache = page;
@@ -1797,17 +1789,47 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
                 goto out;
         }
  
+       if (unlikely(!PageUptodate(page))) {
+               pte_t pteval;
+
+               dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
+               pteval = swp_entry_to_pte(make_swapin_error_entry(page));
+               set_pte_at(vma->vm_mm, addr, pte, pteval);
+               swap_free(entry);
+               ret = 0;
+               goto out;
+       }
+
+       /* See do_swap_page() */
+       BUG_ON(!PageAnon(page) && PageMappedToDisk(page));
+       BUG_ON(PageAnon(page) && PageAnonExclusive(page));
+
         dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
         inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
         get_page(page);
         if (page == swapcache) {
-               page_add_anon_rmap(page, vma, addr, false);
+               rmap_t rmap_flags = RMAP_NONE;
+
+               /*
+                * See do_swap_page(): PageWriteback() would be problematic.
+                * However, we do a wait_on_page_writeback() just before this
+                * call and have the page locked.
+                */
+               VM_BUG_ON_PAGE(PageWriteback(page), page);
+               if (pte_swp_exclusive(*pte))
+                       rmap_flags |= RMAP_EXCLUSIVE;
+
+               page_add_anon_rmap(page, vma, addr, rmap_flags);
         } else { /* ksm created a completely new copy */
-               page_add_new_anon_rmap(page, vma, addr, false);
+               page_add_new_anon_rmap(page, vma, addr);
                 lru_cache_add_inactive_or_unevictable(page, vma);
         }
-       set_pte_at(vma->vm_mm, addr, pte,
-                  pte_mkold(mk_pte(page, vma->vm_page_prot)));
+       new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot));
+       if (pte_swp_soft_dirty(*pte))
+               new_pte = pte_mksoft_dirty(new_pte);
+       if (pte_swp_uffd_wp(*pte))
+               new_pte = pte_mkuffd_wp(new_pte);
+       set_pte_at(vma->vm_mm, addr, pte, new_pte);
         swap_free(entry);
  out:
         pte_unmap_unlock(pte, ptl);
@@ -1984,9 +2006,9 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type)
  }
  
  /*
- * Scan swap_map (or frontswap_map if frontswap parameter is true)
- * from current position to next entry still in use. Return 0
- * if there are no inuse entries after prev till end of the map.
+ * Scan swap_map from current position to next entry still in use.
+ * Return 0 if there are no inuse entries after prev till end of
+ * the map.
   */
  static unsigned int find_next_to_unuse(struct swap_info_struct *si,
                                         unsigned int prev)
@@ -2094,11 +2116,12 @@ retry:
          * Under global memory pressure, swap entries can be reinserted back
          * into process space after the mmlist loop above passes over them.
          *
-        * Limit the number of retries? No: when mmget_not_zero() above fails,
-        * that mm is likely to be freeing swap from exit_mmap(), which proceeds
-        * at its own independent pace; and even shmem_writepage() could have
-        * been preempted after get_swap_page(), temporarily hiding that swap.
-        * It's easy and robust (though cpu-intensive) just to keep retrying.
+        * Limit the number of retries? No: when mmget_not_zero()
+        * above fails, that mm is likely to be freeing swap from
+        * exit_mmap(), which proceeds at its own independent pace;
+        * and even shmem_writepage() could have been preempted after
+        * folio_alloc_swap(), temporarily hiding that swap.  It's easy
+        * and robust (though cpu-intensive) just to keep retrying.
          */
         if (READ_ONCE(si->inuse_pages)) {
                 if (!signal_pending(current))
@@ -2201,8 +2224,8 @@ EXPORT_SYMBOL_GPL(add_swap_extent);
  
  /*
   * A `swap extent' is a simple thing which maps a contiguous range of pages
- * onto a contiguous range of disk blocks.  An ordered list of swap extents
- * is built at swapon time and is then used at swap_writepage/swap_readpage
+ * onto a contiguous range of disk blocks.  An rbtree of swap extents is
+ * built at swapon time and is then used at swap_writepage/swap_readpage
   * time for locating where on disk a page belongs.
   *
   * If the swapfile is an S_ISBLK block device, a single extent is installed.
@@ -2210,12 +2233,12 @@ EXPORT_SYMBOL_GPL(add_swap_extent);
   * swap files identically.
   *
   * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap
- * extent list operates in PAGE_SIZE disk blocks.  Both S_ISREG and S_ISBLK
+ * extent rbtree operates in PAGE_SIZE disk blocks.  Both S_ISREG and S_ISBLK
   * swapfiles are handled *identically* after swapon time.
   *
   * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks
- * and will parse them into an ordered extent list, in PAGE_SIZE chunks.  If
- * some stray blocks are found which do not fall within the PAGE_SIZE alignment
+ * and will parse them into an rbtree, in PAGE_SIZE chunks.  If some stray
+ * blocks are found which do not fall within the PAGE_SIZE alignment
   * requirements, they are simply tossed out - we will never use those blocks
   * for swapping.
   *
@@ -2224,10 +2247,7 @@ EXPORT_SYMBOL_GPL(add_swap_extent);
   *
   * The amount of disk space which a single swap extent represents varies.
   * Typically it is in the 1-4 megabyte range.  So we can have hundreds of
- * extents in the list.  To avoid much list walking, we cache the previous
- * search location in `curr_swap_extent', and start new searches from there.
- * This is extremely effective.  The average number of iterations in
- * map_swap_page() has been measured at about 0.3 per page.  - akpm.
+ * extents in the rbtree. - akpm.
   */
  static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
  {
@@ -2244,12 +2264,13 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
  
         if (mapping->a_ops->swap_activate) {
                 ret = mapping->a_ops->swap_activate(sis, swap_file, span);
-               if (ret >= 0)
-                       sis->flags |= SWP_ACTIVATED;
-               if (!ret) {
-                       sis->flags |= SWP_FS_OPS;
-                       ret = add_swap_extent(sis, 0, sis->max, 0);
-                       *span = sis->pages;
+               if (ret < 0)
+                       return ret;
+               sis->flags |= SWP_ACTIVATED;
+               if ((sis->flags & SWP_FS_OPS) &&
+                   sio_pool_init() != 0) {
+                       destroy_swap_extents(sis);
+                       return -ENOMEM;
                 }
                 return ret;
         }
@@ -2311,7 +2332,7 @@ static void _enable_swap_info(struct swap_info_struct *p)
          * which on removal of any swap_info_struct with an auto-assigned
          * (i.e. negative) priority increments the auto-assigned priority
          * of any lower-priority swap_info_structs.
-        * swap_avail_head needs to be priority ordered for get_swap_page(),
+        * swap_avail_head needs to be priority ordered for folio_alloc_swap(),
          * which allocates swap pages from the highest available priority
          * swap_info_struct.
          */
@@ -3314,8 +3335,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
  
  unlock_out:
         unlock_cluster_or_swap_info(p, ci);
-       if (p)
-               put_swap_device(p);
+       put_swap_device(p);
         return err;
  }