Merge branch 'akpm' (patches from Andrew)
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 25 Jun 2021 18:05:03 +0000 (11:05 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 25 Jun 2021 18:05:03 +0000 (11:05 -0700)
Merge misc fixes from Andrew Morton:
 "24 patches, based on 4a09d388f2ab382f217a764e6a152b3f614246f6.

  Subsystems affected by this patch series: mm (thp, vmalloc, hugetlb,
  memory-failure, and pagealloc), nilfs2, kthread, MAINTAINERS, and
  mailmap"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (24 commits)
  mailmap: add Marek's other e-mail address and identity without diacritics
  MAINTAINERS: fix Marek's identity again
  mm/page_alloc: do bulk array bounds check after checking populated elements
  mm/page_alloc: __alloc_pages_bulk(): do bounds check before accessing array
  mm/hwpoison: do not lock page again when me_huge_page() successfully recovers
  mm,hwpoison: return -EHWPOISON to denote that the page has already been poisoned
  mm/memory-failure: use a mutex to avoid memory_failure() races
  mm, futex: fix shared futex pgoff on shmem huge page
  kthread: prevent deadlock when kthread_mod_delayed_work() races with kthread_cancel_delayed_work_sync()
  kthread_worker: split code for canceling the delayed work timer
  mm/vmalloc: unbreak kasan vmalloc support
  KVM: s390: prepare for hugepage vmalloc
  mm/vmalloc: add vmalloc_no_huge
  nilfs2: fix memory leak in nilfs_sysfs_delete_device_group
  mm/thp: another PVMW_SYNC fix in page_vma_mapped_walk()
  mm/thp: fix page_vma_mapped_walk() if THP mapped by ptes
  mm: page_vma_mapped_walk(): get vma_address_end() earlier
  mm: page_vma_mapped_walk(): use goto instead of while (1)
  mm: page_vma_mapped_walk(): add a level of indentation
  mm: page_vma_mapped_walk(): crossing page table boundary
  ...

14 files changed:
.mailmap
MAINTAINERS
arch/s390/kvm/pv.c
fs/nilfs2/sysfs.c
include/linux/hugetlb.h
include/linux/pagemap.h
include/linux/vmalloc.h
kernel/futex.c
kernel/kthread.c
mm/hugetlb.c
mm/memory-failure.c
mm/page_alloc.c
mm/page_vma_mapped.c
mm/vmalloc.c

index c79a78766c07f6008f1aff45c2d97ab9ba19ad65..db58eedb44f1d2ce86b5e8d678108d390bb6c78a 100644
--- a/.mailmap
+++ b/.mailmap
@@ -212,6 +212,8 @@ Manivannan Sadhasivam <mani@kernel.org> <manivannanece23@gmail.com>
 Manivannan Sadhasivam <mani@kernel.org> <manivannan.sadhasivam@linaro.org>
 Marcin Nowakowski <marcin.nowakowski@mips.com> <marcin.nowakowski@imgtec.com>
 Marc Zyngier <maz@kernel.org> <marc.zyngier@arm.com>
+Marek Behún <kabel@kernel.org> <marek.behun@nic.cz>
+Marek Behún <kabel@kernel.org> Marek Behun <marek.behun@nic.cz>
 Mark Brown <broonie@sirena.org.uk>
 Mark Starovoytov <mstarovo@pm.me> <mstarovoitov@marvell.com>
 Mark Yao <markyao0591@gmail.com> <mark.yao@rock-chips.com>
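
The two forms added above are standard .mailmap syntax: the first maps an
alternate e-mail address to the canonical identity, the second also matches on
the author name recorded in the commit. A minimal sketch with illustrative
identities:

    Jane Doe <jane@kernel.org> <jane.doe@oldcorp.example>
    Jane Doe <jane@kernel.org> J Doe <jane.doe@oldcorp.example>

git shortlog and git log --use-mailmap then report both historical identities
as "Jane Doe <jane@kernel.org>".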
index b3b9a253316f931b2f7429b61196c7fa45bd1224..0cce91cd562439d4ee4cf4874a5439fb4e1d90ce 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1816,7 +1816,7 @@ F:        drivers/pinctrl/pinctrl-gemini.c
 F:     drivers/rtc/rtc-ftrtc010.c
 
 ARM/CZ.NIC TURRIS SUPPORT
-M:     Marek Behun <kabel@kernel.org>
+M:     Marek Behún <kabel@kernel.org>
 S:     Maintained
 W:     https://www.turris.cz/
 F:     Documentation/ABI/testing/debugfs-moxtet
@@ -10945,7 +10945,7 @@ F:      include/linux/mv643xx.h
 
 MARVELL MV88X3310 PHY DRIVER
 M:     Russell King <linux@armlinux.org.uk>
-M:     Marek Behun <marek.behun@nic.cz>
+M:     Marek Behún <kabel@kernel.org>
 L:     netdev@vger.kernel.org
 S:     Maintained
 F:     drivers/net/phy/marvell10g.c
index 813b6e93dc83657530ba3e28b8a78007a90c0949..c8841f476e9132bca23c52828a27fb3cc7fdeadc 100644
--- a/arch/s390/kvm/pv.c
+++ b/arch/s390/kvm/pv.c
@@ -140,7 +140,12 @@ static int kvm_s390_pv_alloc_vm(struct kvm *kvm)
        /* Allocate variable storage */
        vlen = ALIGN(virt * ((npages * PAGE_SIZE) / HPAGE_SIZE), PAGE_SIZE);
        vlen += uv_info.guest_virt_base_stor_len;
-       kvm->arch.pv.stor_var = vzalloc(vlen);
+       /*
+        * The Create Secure Configuration Ultravisor Call does not support
+        * using large pages for the virtual memory area.
+        * This is a hardware limitation.
+        */
+       kvm->arch.pv.stor_var = vmalloc_no_huge(vlen);
        if (!kvm->arch.pv.stor_var)
                goto out_err;
        return 0;
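
vmalloc_no_huge() returns memory mapped with base pages only, and it is freed
like any other vmalloc allocation. A minimal sketch of the pattern used above
(illustrative variable names, error handling elided):

    void *buf = vmalloc_no_huge(len);   /* base pages only, never hugepages */
    if (!buf)
            return -ENOMEM;
    /* ... hand the area to a consumer that cannot tolerate large pages ... */
    vfree(buf);                         /* vfree() handles both variants */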
index 303d71430bdd170905956a38cb8a79186e03cc46..9c6c0e2e5880ae6a7c51dbf837bebdaaa52a2248 100644
--- a/fs/nilfs2/sysfs.c
+++ b/fs/nilfs2/sysfs.c
@@ -1053,6 +1053,7 @@ void nilfs_sysfs_delete_device_group(struct the_nilfs *nilfs)
        nilfs_sysfs_delete_superblock_group(nilfs);
        nilfs_sysfs_delete_segctor_group(nilfs);
        kobject_del(&nilfs->ns_dev_kobj);
+       kobject_put(&nilfs->ns_dev_kobj);
        kfree(nilfs->ns_dev_subgroups);
 }
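
The leak fixed above follows from kobject reference counting: kobject_del()
only unlinks the object from sysfs, while the reference taken at registration
is dropped only by kobject_put(), which runs the ktype's release() and frees
the backing memory. The canonical teardown is therefore the pair:

    kobject_del(&nilfs->ns_dev_kobj);   /* unlink from sysfs; ref still held */
    kobject_put(&nilfs->ns_dev_kobj);   /* drop last ref, release() runs */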
 
index 6504346a19473d224fb497d4c2b7830516970f67..3c0117656745abac4d566f7c62ccb2f830ba3ac2 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -741,17 +741,6 @@ static inline int hstate_index(struct hstate *h)
        return h - hstates;
 }
 
-pgoff_t __basepage_index(struct page *page);
-
-/* Return page->index in PAGE_SIZE units */
-static inline pgoff_t basepage_index(struct page *page)
-{
-       if (!PageCompound(page))
-               return page->index;
-
-       return __basepage_index(page);
-}
-
 extern int dissolve_free_huge_page(struct page *page);
 extern int dissolve_free_huge_pages(unsigned long start_pfn,
                                    unsigned long end_pfn);
@@ -988,11 +977,6 @@ static inline int hstate_index(struct hstate *h)
        return 0;
 }
 
-static inline pgoff_t basepage_index(struct page *page)
-{
-       return page->index;
-}
-
 static inline int dissolve_free_huge_page(struct page *page)
 {
        return 0;
index e89df447fae32cac5a49cbf89b2abc985012111c..0f1b34dbf3a2e464812b3ae4a59ac4ff1d7b0d3e 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -516,7 +516,7 @@ static inline struct page *read_mapping_page(struct address_space *mapping,
 }
 
 /*
- * Get index of the page with in radix-tree
+ * Get index of the page within radix-tree (but not for hugetlb pages).
  * (TODO: remove once hugetlb pages will have ->index in PAGE_SIZE)
  */
 static inline pgoff_t page_to_index(struct page *page)
@@ -535,15 +535,16 @@ static inline pgoff_t page_to_index(struct page *page)
        return pgoff;
 }
 
+extern pgoff_t hugetlb_basepage_index(struct page *page);
+
 /*
- * Get the offset in PAGE_SIZE.
- * (TODO: hugepage should have ->index in PAGE_SIZE)
+ * Get the offset in PAGE_SIZE (even for hugetlb pages).
+ * (TODO: hugetlb pages should have ->index in PAGE_SIZE)
  */
 static inline pgoff_t page_to_pgoff(struct page *page)
 {
-       if (unlikely(PageHeadHuge(page)))
-               return page->index << compound_order(page);
-
+       if (unlikely(PageHuge(page)))
+               return hugetlb_basepage_index(page);
        return page_to_index(page);
 }
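
A worked contrast, assuming x86-64 with 4 KiB base pages: a shmem THP keeps
page->index in base-page units, so a 2 MiB THP whose head has index 512 gives
the tail page at offset 3 pgoff 512 + 3 = 515 via page_to_index(); a 2 MiB
hugetlbfs page keeps page->index in huge-page units, so the same tail needs
hugetlb_basepage_index() to compute (index << 9) + 3, as implemented in the
mm/hugetlb.c hunk below.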
 
index 4d668abb639179884b2df93bedb65d1616d327ac..bfaaf0b6fa7665396c0508fbf67463665028ba3c 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -135,6 +135,7 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
                        const void *caller);
 void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask,
                int node, const void *caller);
+void *vmalloc_no_huge(unsigned long size);
 
 extern void vfree(const void *addr);
 extern void vfree_atomic(const void *addr);
index 4938a00bc7857df511aab1c3b105b17bfaa47ebe..408cad5e89680ff7abe0184f418d4f1ead342533 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -35,7 +35,6 @@
 #include <linux/jhash.h>
 #include <linux/pagemap.h>
 #include <linux/syscalls.h>
-#include <linux/hugetlb.h>
 #include <linux/freezer.h>
 #include <linux/memblock.h>
 #include <linux/fault-inject.h>
@@ -650,7 +649,7 @@ again:
 
                key->both.offset |= FUT_OFF_INODE; /* inode-based key */
                key->shared.i_seq = get_inode_sequence_number(inode);
-               key->shared.pgoff = basepage_index(tail);
+               key->shared.pgoff = page_to_pgoff(tail);
                rcu_read_unlock();
        }
 
index fe3f2a40d61e8620f01fe057ec1e3dda92709b0e..0fccf7d0c6a16dd3c697dd78e09cd3e46a2cf7df 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1093,8 +1093,38 @@ void kthread_flush_work(struct kthread_work *work)
 EXPORT_SYMBOL_GPL(kthread_flush_work);
 
 /*
- * This function removes the work from the worker queue. Also it makes sure
- * that it won't get queued later via the delayed work's timer.
+ * Make sure that the timer is neither set nor running, and can no
+ * longer manipulate the work's list_head.
+ *
+ * The function is called under worker->lock. The lock is temporarily
+ * released, but the timer cannot be set again in the meantime.
+ */
+static void kthread_cancel_delayed_work_timer(struct kthread_work *work,
+                                             unsigned long *flags)
+{
+       struct kthread_delayed_work *dwork =
+               container_of(work, struct kthread_delayed_work, work);
+       struct kthread_worker *worker = work->worker;
+
+       /*
+        * del_timer_sync() must be called to make sure that the timer
+        * callback is not running. The lock must be temporarily released
+        * to avoid a deadlock with the callback. In the meantime,
+        * any queuing is blocked by setting the canceling counter.
+        */
+       work->canceling++;
+       raw_spin_unlock_irqrestore(&worker->lock, *flags);
+       del_timer_sync(&dwork->timer);
+       raw_spin_lock_irqsave(&worker->lock, *flags);
+       work->canceling--;
+}
+
+/*
+ * This function removes the work from the worker queue.
+ *
+ * It is called under worker->lock. The caller must make sure that
+ * the timer used by delayed work is not running, e.g. by calling
+ * kthread_cancel_delayed_work_timer().
  *
  * The work might still be in use when this function finishes. See the
  * current_work processed by the worker.
@@ -1102,28 +1132,8 @@ EXPORT_SYMBOL_GPL(kthread_flush_work);
  * Return: %true if @work was pending and successfully canceled,
  *     %false if @work was not pending
  */
-static bool __kthread_cancel_work(struct kthread_work *work, bool is_dwork,
-                                 unsigned long *flags)
+static bool __kthread_cancel_work(struct kthread_work *work)
 {
-       /* Try to cancel the timer if exists. */
-       if (is_dwork) {
-               struct kthread_delayed_work *dwork =
-                       container_of(work, struct kthread_delayed_work, work);
-               struct kthread_worker *worker = work->worker;
-
-               /*
-                * del_timer_sync() must be called to make sure that the timer
-                * callback is not running. The lock must be temporary released
-                * to avoid a deadlock with the callback. In the meantime,
-                * any queuing is blocked by setting the canceling counter.
-                */
-               work->canceling++;
-               raw_spin_unlock_irqrestore(&worker->lock, *flags);
-               del_timer_sync(&dwork->timer);
-               raw_spin_lock_irqsave(&worker->lock, *flags);
-               work->canceling--;
-       }
-
        /*
         * Try to remove the work from a worker list. It might either
         * be from worker->work_list or from worker->delayed_work_list.
@@ -1176,11 +1186,23 @@ bool kthread_mod_delayed_work(struct kthread_worker *worker,
        /* Work must not be used with >1 worker, see kthread_queue_work() */
        WARN_ON_ONCE(work->worker != worker);
 
-       /* Do not fight with another command that is canceling this work. */
+       /*
+        * Temporarily cancel the work but do not fight with another command
+        * that is canceling the work as well.
+        *
+        * It is a bit tricky because of possible races with another
+        * mod_delayed_work() and cancel_delayed_work() callers.
+        *
+        * The timer must be canceled first because worker->lock is released
+        * when doing so. But the work can be removed from the queue (list)
+        * only when it can be queued again so that the return value can
+        * be used for reference counting.
+        */
+       kthread_cancel_delayed_work_timer(work, &flags);
        if (work->canceling)
                goto out;
+       ret = __kthread_cancel_work(work);
 
-       ret = __kthread_cancel_work(work, true, &flags);
 fast_queue:
        __kthread_queue_delayed_work(worker, dwork, delay);
 out:
@@ -1202,7 +1224,10 @@ static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork)
        /* Work must not be used with >1 worker, see kthread_queue_work(). */
        WARN_ON_ONCE(work->worker != worker);
 
-       ret = __kthread_cancel_work(work, is_dwork, &flags);
+       if (is_dwork)
+               kthread_cancel_delayed_work_timer(work, &flags);
+
+       ret = __kthread_cancel_work(work);
 
        if (worker->current_work != work)
                goto out_fast;
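
A minimal usage sketch of the API whose race is fixed above (worker name and
callback are illustrative; error handling elided). Before the timer-cancel
split, the re-arm and the synchronous cancel below could deadlock when they
raced:

    static void example_fn(struct kthread_work *work) { /* ... */ }

    struct kthread_worker *worker = kthread_create_worker(0, "example");
    struct kthread_delayed_work dwork;

    kthread_init_delayed_work(&dwork, example_fn);
    kthread_queue_delayed_work(worker, &dwork, HZ);

    kthread_mod_delayed_work(worker, &dwork, 2 * HZ);   /* CPU A re-arms  */
    kthread_cancel_delayed_work_sync(&dwork);           /* CPU B cancels  */

    kthread_destroy_worker(worker);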
index e0a5f9cbbece9150256eb4bd85df1ddc6c869423..5ba5a0da6d572bdadca4252a0332a29a8ed652fe 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1588,15 +1588,12 @@ struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage)
        return NULL;
 }
 
-pgoff_t __basepage_index(struct page *page)
+pgoff_t hugetlb_basepage_index(struct page *page)
 {
        struct page *page_head = compound_head(page);
        pgoff_t index = page_index(page_head);
        unsigned long compound_idx;
 
-       if (!PageHuge(page_head))
-               return page_index(page);
-
        if (compound_order(page_head) >= MAX_ORDER)
                compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
        else
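
The MAX_ORDER test above picks how the tail offset is computed: a gigantic
page (for example 1 GiB on x86-64, order 18) can span memory sections whose
struct pages are not guaranteed to be virtually contiguous, so the pfn
difference is used instead of struct page pointer arithmetic. Either way, a
2 MiB page (order 9) with head index 3 and tail offset 5 yields
(3 << 9) + 5 = 1541 base-page units.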
index 0143d32bc666313d5ff872024e796f3d17243f44..6f5f78885ab425fa5d77a730bc7bf5d46bc3e293 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -658,6 +658,7 @@ static int truncate_error_page(struct page *p, unsigned long pfn,
  */
 static int me_kernel(struct page *p, unsigned long pfn)
 {
+       unlock_page(p);
        return MF_IGNORED;
 }
 
@@ -667,6 +668,7 @@ static int me_kernel(struct page *p, unsigned long pfn)
 static int me_unknown(struct page *p, unsigned long pfn)
 {
        pr_err("Memory failure: %#lx: Unknown page state\n", pfn);
+       unlock_page(p);
        return MF_FAILED;
 }
 
@@ -675,6 +677,7 @@ static int me_unknown(struct page *p, unsigned long pfn)
  */
 static int me_pagecache_clean(struct page *p, unsigned long pfn)
 {
+       int ret;
        struct address_space *mapping;
 
        delete_from_lru_cache(p);
@@ -683,8 +686,10 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
         * For anonymous pages we're done; the only reference left
         * should be the one m_f() holds.
         */
-       if (PageAnon(p))
-               return MF_RECOVERED;
+       if (PageAnon(p)) {
+               ret = MF_RECOVERED;
+               goto out;
+       }
 
        /*
         * Now truncate the page in the page cache. This is really
@@ -698,7 +703,8 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
                /*
                 * Page has been torn down in the meantime
                 */
-               return MF_FAILED;
+               ret = MF_FAILED;
+               goto out;
        }
 
        /*
@@ -706,7 +712,10 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
         *
         * Open: to take i_mutex or not for this? Right now we don't.
         */
-       return truncate_error_page(p, pfn, mapping);
+       ret = truncate_error_page(p, pfn, mapping);
+out:
+       unlock_page(p);
+       return ret;
 }
 
 /*
@@ -782,24 +791,26 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
  */
 static int me_swapcache_dirty(struct page *p, unsigned long pfn)
 {
+       int ret;
+
        ClearPageDirty(p);
        /* Trigger EIO in shmem: */
        ClearPageUptodate(p);
 
-       if (!delete_from_lru_cache(p))
-               return MF_DELAYED;
-       else
-               return MF_FAILED;
+       ret = delete_from_lru_cache(p) ? MF_FAILED : MF_DELAYED;
+       unlock_page(p);
+       return ret;
 }
 
 static int me_swapcache_clean(struct page *p, unsigned long pfn)
 {
+       int ret;
+
        delete_from_swap_cache(p);
 
-       if (!delete_from_lru_cache(p))
-               return MF_RECOVERED;
-       else
-               return MF_FAILED;
+       ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED;
+       unlock_page(p);
+       return ret;
 }
 
 /*
@@ -820,6 +831,7 @@ static int me_huge_page(struct page *p, unsigned long pfn)
        mapping = page_mapping(hpage);
        if (mapping) {
                res = truncate_error_page(hpage, pfn, mapping);
+               unlock_page(hpage);
        } else {
                res = MF_FAILED;
                unlock_page(hpage);
@@ -834,7 +846,6 @@ static int me_huge_page(struct page *p, unsigned long pfn)
                        page_ref_inc(p);
                        res = MF_RECOVERED;
                }
-               lock_page(hpage);
        }
 
        return res;
@@ -866,6 +877,8 @@ static struct page_state {
        unsigned long mask;
        unsigned long res;
        enum mf_action_page_type type;
+
+       /* Callback ->action() has to unlock the relevant page inside it. */
        int (*action)(struct page *p, unsigned long pfn);
 } error_states[] = {
        { reserved,     reserved,       MF_MSG_KERNEL,  me_kernel },
@@ -929,6 +942,7 @@ static int page_action(struct page_state *ps, struct page *p,
        int result;
        int count;
 
+       /* page p should be unlocked after returning from ps->action().  */
        result = ps->action(p, pfn);
 
        count = page_count(p) - 1;
@@ -1253,7 +1267,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
        if (TestSetPageHWPoison(head)) {
                pr_err("Memory failure: %#lx: already hardware poisoned\n",
                       pfn);
-               return 0;
+               return -EHWPOISON;
        }
 
        num_poisoned_pages_inc();
@@ -1313,7 +1327,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
                goto out;
        }
 
-       res = identify_page_state(pfn, p, page_flags);
+       return identify_page_state(pfn, p, page_flags);
 out:
        unlock_page(head);
        return res;
@@ -1429,9 +1443,10 @@ int memory_failure(unsigned long pfn, int flags)
        struct page *hpage;
        struct page *orig_head;
        struct dev_pagemap *pgmap;
-       int res;
+       int res = 0;
        unsigned long page_flags;
        bool retry = true;
+       static DEFINE_MUTEX(mf_mutex);
 
        if (!sysctl_memory_failure_recovery)
                panic("Memory failure on page %lx", pfn);
@@ -1449,13 +1464,19 @@ int memory_failure(unsigned long pfn, int flags)
                return -ENXIO;
        }
 
+       mutex_lock(&mf_mutex);
+
 try_again:
-       if (PageHuge(p))
-               return memory_failure_hugetlb(pfn, flags);
+       if (PageHuge(p)) {
+               res = memory_failure_hugetlb(pfn, flags);
+               goto unlock_mutex;
+       }
+
        if (TestSetPageHWPoison(p)) {
                pr_err("Memory failure: %#lx: already hardware poisoned\n",
                        pfn);
-               return 0;
+               res = -EHWPOISON;
+               goto unlock_mutex;
        }
 
        orig_head = hpage = compound_head(p);
@@ -1488,17 +1509,19 @@ try_again:
                                res = MF_FAILED;
                        }
                        action_result(pfn, MF_MSG_BUDDY, res);
-                       return res == MF_RECOVERED ? 0 : -EBUSY;
+                       res = res == MF_RECOVERED ? 0 : -EBUSY;
                } else {
                        action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
-                       return -EBUSY;
+                       res = -EBUSY;
                }
+               goto unlock_mutex;
        }
 
        if (PageTransHuge(hpage)) {
                if (try_to_split_thp_page(p, "Memory Failure") < 0) {
                        action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
-                       return -EBUSY;
+                       res = -EBUSY;
+                       goto unlock_mutex;
                }
                VM_BUG_ON_PAGE(!page_count(p), p);
        }
@@ -1522,7 +1545,7 @@ try_again:
        if (PageCompound(p) && compound_head(p) != orig_head) {
                action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
                res = -EBUSY;
-               goto out;
+               goto unlock_page;
        }
 
        /*
@@ -1542,14 +1565,14 @@ try_again:
                num_poisoned_pages_dec();
                unlock_page(p);
                put_page(p);
-               return 0;
+               goto unlock_mutex;
        }
        if (hwpoison_filter(p)) {
                if (TestClearPageHWPoison(p))
                        num_poisoned_pages_dec();
                unlock_page(p);
                put_page(p);
-               return 0;
+               goto unlock_mutex;
        }
 
        /*
@@ -1573,7 +1596,7 @@ try_again:
        if (!hwpoison_user_mappings(p, pfn, flags, &p)) {
                action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
                res = -EBUSY;
-               goto out;
+               goto unlock_page;
        }
 
        /*
@@ -1582,13 +1605,17 @@ try_again:
        if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
                action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
                res = -EBUSY;
-               goto out;
+               goto unlock_page;
        }
 
 identify_page_state:
        res = identify_page_state(pfn, p, page_flags);
-out:
+       mutex_unlock(&mf_mutex);
+       return res;
+unlock_page:
        unlock_page(p);
+unlock_mutex:
+       mutex_unlock(&mf_mutex);
        return res;
 }
 EXPORT_SYMBOL_GPL(memory_failure);
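
With -EHWPOISON, callers can now distinguish a duplicate report from a fresh
failure. A minimal sketch of how a call site might treat it (the hwpoison
injection path is one real caller; its exact policy may differ):

    ret = memory_failure(pfn, flags);
    if (ret == -EHWPOISON)
            ret = 0;        /* page was already poisoned; not a new failure */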
index d1f5de1c1283b08075b3bb5177ea5b1214a58c0f..ef2265f86b913dbcd604f22a893079775fff135b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5053,9 +5053,13 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
         * Skip populated array elements to determine if any pages need
         * to be allocated before disabling IRQs.
         */
-       while (page_array && page_array[nr_populated] && nr_populated < nr_pages)
+       while (page_array && nr_populated < nr_pages && page_array[nr_populated])
                nr_populated++;
 
+       /* Already populated array? */
+       if (unlikely(page_array && nr_pages - nr_populated == 0))
+               return 0;
+
        /* Use the single page allocator for one page. */
        if (nr_pages - nr_populated == 1)
                goto failed;
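
The reordering above matters because && evaluates left to right and
short-circuits: the old form read page_array[nr_populated] before checking
nr_populated < nr_pages, an out-of-bounds read once the array is fully
populated. A standalone illustration of the corrected shape (not kernel code):

    static size_t count_populated(void **arr, size_t n)
    {
            size_t i = 0;

            /* Safe: the bounds test runs before arr[i] is read. */
            while (arr && i < n && arr[i])
                    i++;
            return i;
    }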
index e37bd43904af7a372ef60a52f17be25abf1b4456..a4435311754b08597c3cced04df12c3effd747e6 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -116,6 +116,13 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
        return pfn_is_match(pvmw->page, pfn);
 }
 
+static void step_forward(struct page_vma_mapped_walk *pvmw, unsigned long size)
+{
+       pvmw->address = (pvmw->address + size) & ~(size - 1);
+       if (!pvmw->address)
+               pvmw->address = ULONG_MAX;
+}
+
 /**
  * page_vma_mapped_walk - check if @pvmw->page is mapped in @pvmw->vma at
  * @pvmw->address
@@ -144,6 +151,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
 {
        struct mm_struct *mm = pvmw->vma->vm_mm;
        struct page *page = pvmw->page;
+       unsigned long end;
        pgd_t *pgd;
        p4d_t *p4d;
        pud_t *pud;
@@ -153,10 +161,11 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
        if (pvmw->pmd && !pvmw->pte)
                return not_found(pvmw);
 
-       if (pvmw->pte)
-               goto next_pte;
+       if (unlikely(PageHuge(page))) {
+               /* The only possible mapping was handled on the last iteration */
+               if (pvmw->pte)
+                       return not_found(pvmw);
 
-       if (unlikely(PageHuge(pvmw->page))) {
                /* when pud is not present, pte will be NULL */
                pvmw->pte = huge_pte_offset(mm, pvmw->address, page_size(page));
                if (!pvmw->pte)
@@ -168,89 +177,108 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
                        return not_found(pvmw);
                return true;
        }
-restart:
-       pgd = pgd_offset(mm, pvmw->address);
-       if (!pgd_present(*pgd))
-               return false;
-       p4d = p4d_offset(pgd, pvmw->address);
-       if (!p4d_present(*p4d))
-               return false;
-       pud = pud_offset(p4d, pvmw->address);
-       if (!pud_present(*pud))
-               return false;
-       pvmw->pmd = pmd_offset(pud, pvmw->address);
+
        /*
-        * Make sure the pmd value isn't cached in a register by the
-        * compiler and used as a stale value after we've observed a
-        * subsequent update.
+        * Seeking to the next pte only makes sense for THP.
+        * More important than that optimization is filtering out any
+        * PageKsm page, whose page->index misleads vma_address()
+        * and vma_address_end() to disaster.
         */
-       pmde = READ_ONCE(*pvmw->pmd);
-       if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) {
-               pvmw->ptl = pmd_lock(mm, pvmw->pmd);
-               if (likely(pmd_trans_huge(*pvmw->pmd))) {
-                       if (pvmw->flags & PVMW_MIGRATION)
-                               return not_found(pvmw);
-                       if (pmd_page(*pvmw->pmd) != page)
-                               return not_found(pvmw);
-                       return true;
-               } else if (!pmd_present(*pvmw->pmd)) {
-                       if (thp_migration_supported()) {
-                               if (!(pvmw->flags & PVMW_MIGRATION))
+       end = PageTransCompound(page) ?
+               vma_address_end(page, pvmw->vma) :
+               pvmw->address + PAGE_SIZE;
+       if (pvmw->pte)
+               goto next_pte;
+restart:
+       do {
+               pgd = pgd_offset(mm, pvmw->address);
+               if (!pgd_present(*pgd)) {
+                       step_forward(pvmw, PGDIR_SIZE);
+                       continue;
+               }
+               p4d = p4d_offset(pgd, pvmw->address);
+               if (!p4d_present(*p4d)) {
+                       step_forward(pvmw, P4D_SIZE);
+                       continue;
+               }
+               pud = pud_offset(p4d, pvmw->address);
+               if (!pud_present(*pud)) {
+                       step_forward(pvmw, PUD_SIZE);
+                       continue;
+               }
+
+               pvmw->pmd = pmd_offset(pud, pvmw->address);
+               /*
+                * Make sure the pmd value isn't cached in a register by the
+                * compiler and used as a stale value after we've observed a
+                * subsequent update.
+                */
+               pmde = READ_ONCE(*pvmw->pmd);
+
+               if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) {
+                       pvmw->ptl = pmd_lock(mm, pvmw->pmd);
+                       pmde = *pvmw->pmd;
+                       if (likely(pmd_trans_huge(pmde))) {
+                               if (pvmw->flags & PVMW_MIGRATION)
                                        return not_found(pvmw);
-                               if (is_migration_entry(pmd_to_swp_entry(*pvmw->pmd))) {
-                                       swp_entry_t entry = pmd_to_swp_entry(*pvmw->pmd);
+                               if (pmd_page(pmde) != page)
+                                       return not_found(pvmw);
+                               return true;
+                       }
+                       if (!pmd_present(pmde)) {
+                               swp_entry_t entry;
 
-                                       if (migration_entry_to_page(entry) != page)
-                                               return not_found(pvmw);
-                                       return true;
-                               }
+                               if (!thp_migration_supported() ||
+                                   !(pvmw->flags & PVMW_MIGRATION))
+                                       return not_found(pvmw);
+                               entry = pmd_to_swp_entry(pmde);
+                               if (!is_migration_entry(entry) ||
+                                   migration_entry_to_page(entry) != page)
+                                       return not_found(pvmw);
+                               return true;
                        }
-                       return not_found(pvmw);
-               } else {
                        /* THP pmd was split under us: handle on pte level */
                        spin_unlock(pvmw->ptl);
                        pvmw->ptl = NULL;
-               }
-       } else if (!pmd_present(pmde)) {
-               /*
-                * If PVMW_SYNC, take and drop THP pmd lock so that we
-                * cannot return prematurely, while zap_huge_pmd() has
-                * cleared *pmd but not decremented compound_mapcount().
-                */
-               if ((pvmw->flags & PVMW_SYNC) &&
-                   PageTransCompound(pvmw->page)) {
-                       spinlock_t *ptl = pmd_lock(mm, pvmw->pmd);
+               } else if (!pmd_present(pmde)) {
+                       /*
+                        * If PVMW_SYNC, take and drop THP pmd lock so that we
+                        * cannot return prematurely, while zap_huge_pmd() has
+                        * cleared *pmd but not decremented compound_mapcount().
+                        */
+                       if ((pvmw->flags & PVMW_SYNC) &&
+                           PageTransCompound(page)) {
+                               spinlock_t *ptl = pmd_lock(mm, pvmw->pmd);
 
-                       spin_unlock(ptl);
+                               spin_unlock(ptl);
+                       }
+                       step_forward(pvmw, PMD_SIZE);
+                       continue;
                }
-               return false;
-       }
-       if (!map_pte(pvmw))
-               goto next_pte;
-       while (1) {
-               unsigned long end;
-
+               if (!map_pte(pvmw))
+                       goto next_pte;
+this_pte:
                if (check_pte(pvmw))
                        return true;
 next_pte:
-               /* Seek to next pte only makes sense for THP */
-               if (!PageTransHuge(pvmw->page) || PageHuge(pvmw->page))
-                       return not_found(pvmw);
-               end = vma_address_end(pvmw->page, pvmw->vma);
                do {
                        pvmw->address += PAGE_SIZE;
                        if (pvmw->address >= end)
                                return not_found(pvmw);
                        /* Did we cross page table boundary? */
-                       if (pvmw->address % PMD_SIZE == 0) {
-                               pte_unmap(pvmw->pte);
+                       if ((pvmw->address & (PMD_SIZE - PAGE_SIZE)) == 0) {
                                if (pvmw->ptl) {
                                        spin_unlock(pvmw->ptl);
                                        pvmw->ptl = NULL;
                                }
+                               pte_unmap(pvmw->pte);
+                               pvmw->pte = NULL;
                                goto restart;
-                       } else {
-                               pvmw->pte++;
+                       }
+                       pvmw->pte++;
+                       if ((pvmw->flags & PVMW_SYNC) && !pvmw->ptl) {
+                               pvmw->ptl = pte_lockptr(mm, pvmw->pmd);
+                               spin_lock(pvmw->ptl);
                        }
                } while (pte_none(*pvmw->pte));
 
@@ -258,7 +286,10 @@ next_pte:
                        pvmw->ptl = pte_lockptr(mm, pvmw->pmd);
                        spin_lock(pvmw->ptl);
                }
-       }
+               goto this_pte;
+       } while (pvmw->address < end);
+
+       return false;
 }
 
 /**
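
step_forward() rounds the walk address up to the next size-aligned boundary
via (address + size) & ~(size - 1): with PMD_SIZE = 2 MiB, 0x234000 advances
to 0x400000 and an already-aligned 0x400000 advances to 0x600000, so the walk
always makes progress. An addition that wraps past the top of the address
space yields 0, which the helper converts to ULONG_MAX so the
while (pvmw->address < end) loop still terminates.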
index a13ac524f6ff8e52d8dcaafd6a683c25e7a371de..d0a7d89be091e351b95e53bbd548d22f63d4a933 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2344,15 +2344,16 @@ static void clear_vm_uninitialized_flag(struct vm_struct *vm)
 }
 
 static struct vm_struct *__get_vm_area_node(unsigned long size,
-               unsigned long align, unsigned long flags, unsigned long start,
-               unsigned long end, int node, gfp_t gfp_mask, const void *caller)
+               unsigned long align, unsigned long shift, unsigned long flags,
+               unsigned long start, unsigned long end, int node,
+               gfp_t gfp_mask, const void *caller)
 {
        struct vmap_area *va;
        struct vm_struct *area;
        unsigned long requested_size = size;
 
        BUG_ON(in_interrupt());
-       size = PAGE_ALIGN(size);
+       size = ALIGN(size, 1ul << shift);
        if (unlikely(!size))
                return NULL;
 
@@ -2384,8 +2385,8 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
                                       unsigned long start, unsigned long end,
                                       const void *caller)
 {
-       return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
-                                 GFP_KERNEL, caller);
+       return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, start, end,
+                                 NUMA_NO_NODE, GFP_KERNEL, caller);
 }
 
 /**
@@ -2401,7 +2402,8 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
  */
 struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
 {
-       return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
+       return __get_vm_area_node(size, 1, PAGE_SHIFT, flags,
+                                 VMALLOC_START, VMALLOC_END,
                                  NUMA_NO_NODE, GFP_KERNEL,
                                  __builtin_return_address(0));
 }
@@ -2409,7 +2411,8 @@ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
 struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
                                const void *caller)
 {
-       return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
+       return __get_vm_area_node(size, 1, PAGE_SHIFT, flags,
+                                 VMALLOC_START, VMALLOC_END,
                                  NUMA_NO_NODE, GFP_KERNEL, caller);
 }
 
@@ -2902,9 +2905,9 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
        }
 
 again:
-       size = PAGE_ALIGN(size);
-       area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
-                               vm_flags, start, end, node, gfp_mask, caller);
+       area = __get_vm_area_node(real_size, align, shift, VM_ALLOC |
+                                 VM_UNINITIALIZED | vm_flags, start, end, node,
+                                 gfp_mask, caller);
        if (!area) {
                warn_alloc(gfp_mask, NULL,
                           "vmalloc size %lu allocation failure: "
@@ -2923,6 +2926,7 @@ again:
         */
        clear_vm_uninitialized_flag(area);
 
+       size = PAGE_ALIGN(size);
        kmemleak_vmalloc(area, size, gfp_mask);
 
        return addr;
@@ -2998,6 +3002,23 @@ void *vmalloc(unsigned long size)
 }
 EXPORT_SYMBOL(vmalloc);
 
+/**
+ * vmalloc_no_huge - allocate virtually contiguous memory using small pages
+ * @size:    allocation size
+ *
+ * Allocate enough non-huge pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
+ */
+void *vmalloc_no_huge(unsigned long size)
+{
+       return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
+                                   GFP_KERNEL, PAGE_KERNEL, VM_NO_HUGE_VMAP,
+                                   NUMA_NO_NODE, __builtin_return_address(0));
+}
+EXPORT_SYMBOL(vmalloc_no_huge);
+
 /**
  * vzalloc - allocate virtually contiguous memory with zero fill
  * @size:    allocation size