mm, fs, dax: handle layout changes to pinned dax mappings

author Dan Williams <dan.j.williams@intel.com>

Sat, 10 Mar 2018 01:44:31 +0000 (17:44 -0800)

committer Dan Williams <dan.j.williams@intel.com>

Tue, 22 May 2018 14:19:08 +0000 (07:19 -0700)
author Dan Williams <dan.j.williams@intel.com>
Sat, 10 Mar 2018 01:44:31 +0000 (17:44 -0800)
committer Dan Williams <dan.j.williams@intel.com>
Tue, 22 May 2018 14:19:08 +0000 (07:19 -0700)
diff --git a/fs/dax.c b/fs/dax.c

index aaec72d..e8f61ea 100644 (file)
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -351,6 +351,19 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping,
         }
  }
  
+/*
+ * Walk every pfn covered by @entry and return the first page whose
+ * reference count is above 1, or NULL when no such page is found.
+ */
+static struct page *dax_busy_page(void *entry)
+{
+       unsigned long pfn;
+
+       for_each_mapped_pfn(entry, pfn) {
+               struct page *candidate = pfn_to_page(pfn);
+
+               /* A reference count of 1 or less is not treated as busy. */
+               if (page_ref_count(candidate) <= 1)
+                       continue;
+               return candidate;
+       }
+       return NULL;
+}
+
  /*
   * Find radix tree entry at given index. If it points to an exceptional entry,
   * return it with the radix tree entry locked. If the radix tree doesn't
@@ -492,6 +505,90 @@ restart:
         return entry;
  }
  
+/**
+ * dax_layout_busy_page - find first pinned page in @mapping
+ * @mapping: address space to scan for a page with ref count > 1
+ *
+ * DAX requires ZONE_DEVICE mapped pages. These pages are never
+ * 'onlined' to the page allocator so they are considered idle when
+ * page->count == 1. A filesystem uses this interface to determine if
+ * any page in the mapping is busy, i.e. for DMA, or other
+ * get_user_pages() usages.
+ *
+ * It is expected that the filesystem is holding locks to block the
+ * establishment of new mappings in this address_space. I.e. it expects
+ * to be able to run unmap_mapping_range() and subsequently not race
+ * mapping_mapped() becoming true.
+ *
+ * Return: the first page found with a reference count greater than 1,
+ * or NULL if no such page is found, if CONFIG_FS_DAX_LIMITED is
+ * enabled, or if @mapping is not a mapped dax mapping.
+ */
+struct page *dax_layout_busy_page(struct address_space *mapping)
+{
+       pgoff_t indices[PAGEVEC_SIZE];
+       struct page *page = NULL;
+       struct pagevec pvec;
+       pgoff_t index, end;
+       unsigned i;
+
+       /*
+        * In the 'limited' case get_user_pages() for dax is disabled.
+        */
+       if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
+               return NULL;
+
+       if (!dax_mapping(mapping) || !mapping_mapped(mapping))
+               return NULL;
+
+       pagevec_init(&pvec);
+       /* Scan the whole mapping: end == -1 is the largest pgoff_t value. */
+       index = 0;
+       end = -1;
+
+       /*
+        * If we race get_user_pages_fast() here either we'll see the
+        * elevated page count in the pagevec_lookup and wait, or
+        * get_user_pages_fast() will see that the page it took a reference
+        * against is no longer mapped in the page tables and bail to the
+        * get_user_pages() slow path.  The slow path is protected by
+        * pte_lock() and pmd_lock(). New references are not taken without
+        * holding those locks, and unmap_mapping_range() will not zero the
+        * pte or pmd without holding the respective lock, so we are
+        * guaranteed to either see new references or prevent new
+        * references from being established.
+        *
+        * Pass even_cows == 0: this unmaps the entire file range, and
+        * zapping private COW copies as well would throw away data that
+        * userspace wrote through MAP_PRIVATE mappings. Only the mappings
+        * of the file's own pages need to go for the check below.
+        */
+       unmap_mapping_range(mapping, 0, 0, 0);
+
+       while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
+                               min(end - index, (pgoff_t)PAGEVEC_SIZE),
+                               indices)) {
+               for (i = 0; i < pagevec_count(&pvec); i++) {
+                       struct page *pvec_ent = pvec.pages[i];
+                       void *entry;
+
+                       index = indices[i];
+                       if (index >= end)
+                               break;
+
+                       /* Skip anything that is not an exceptional entry. */
+                       if (!radix_tree_exceptional_entry(pvec_ent))
+                               continue;
+
+                       /*
+                        * Re-lookup the entry under the i_pages lock before
+                        * inspecting the pages behind it.
+                        */
+                       xa_lock_irq(&mapping->i_pages);
+                       entry = get_unlocked_mapping_entry(mapping, index, NULL);
+                       if (entry)
+                               page = dax_busy_page(entry);
+                       put_unlocked_mapping_entry(mapping, index, entry);
+                       xa_unlock_irq(&mapping->i_pages);
+                       if (page)
+                               break;
+               }
+               pagevec_remove_exceptionals(&pvec);
+               pagevec_release(&pvec);
+               /* Resume the next lookup just past the last index examined. */
+               index++;
+
+               if (page)
+                       break;
+       }
+       return page;
+}
+EXPORT_SYMBOL_GPL(dax_layout_busy_page);
+
  static int __dax_invalidate_mapping_entry(struct address_space *mapping,
                                           pgoff_t index, bool trunc)
  {
diff --git a/include/linux/dax.h b/include/linux/dax.h

index f9eb22a..25bab6a 100644 (file)
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -83,6 +83,8 @@ static inline void fs_put_dax(struct dax_device *dax_dev)
  struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev);
  int dax_writeback_mapping_range(struct address_space *mapping,
                 struct block_device *bdev, struct writeback_control *wbc);
+
+struct page *dax_layout_busy_page(struct address_space *mapping);
  #else
  static inline int bdev_dax_supported(struct super_block *sb, int blocksize)
  {
@@ -103,6 +105,11 @@ static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev)
         return NULL;
  }
  
+/*
+ * Stub variant: always reports that no page in @mapping is busy.
+ * NOTE(review): presumably this sits in the #else branch used when
+ * filesystem DAX support is compiled out - confirm against the
+ * enclosing #if, which is not visible in this hunk.
+ */
+static inline struct page *dax_layout_busy_page(struct address_space *mapping)
+{
+       return NULL;
+}
+
  static inline int dax_writeback_mapping_range(struct address_space *mapping,
                 struct block_device *bdev, struct writeback_control *wbc)
  {
author	Dan Williams <dan.j.williams@intel.com>
	Sat, 10 Mar 2018 01:44:31 +0000 (17:44 -0800)
committer	Dan Williams <dan.j.williams@intel.com>
	Tue, 22 May 2018 14:19:08 +0000 (07:19 -0700)
fs/dax.c		patch \| blob \| history
include/linux/dax.h		patch \| blob \| history