Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs

author Linus Torvalds <torvalds@linux-foundation.org>

Sat, 23 Jan 2016 20:24:56 +0000 (12:24 -0800)

committer Linus Torvalds <torvalds@linux-foundation.org>

Sat, 23 Jan 2016 20:24:56 +0000 (12:24 -0800)
author Linus Torvalds <torvalds@linux-foundation.org>
Sat, 23 Jan 2016 20:24:56 +0000 (12:24 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sat, 23 Jan 2016 20:24:56 +0000 (12:24 -0800)
diff --combined fs/block_dev.c

index 60895e5,2c3aeab..7b9cd49
--- 1/fs/block_dev.c
--- 2/fs/block_dev.c
+++ b/fs/block_dev.c
@@@ -75,7 -75,7 +75,7 @@@ void kill_bdev(struct block_device *bde
   {
         struct address_space *mapping = bdev->bd_inode->i_mapping;
   
- -      if (mapping->nrpages == 0 && mapping->nrshadows == 0)
+ +      if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
                 return;
   
         invalidate_bh_lrus();
@@@ -346,9 -346,9 +346,9 @@@ static loff_t block_llseek(struct file 
         struct inode *bd_inode = bdev_file_inode(file);
         loff_t retval;
   
-       mutex_lock(&bd_inode->i_mutex);
+       inode_lock(bd_inode);
         retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode));
-       mutex_unlock(&bd_inode->i_mutex);
+       inode_unlock(bd_inode);
         return retval;
   }
         
@@@ -1142,9 -1142,9 +1142,9 @@@ void bd_set_size(struct block_device *b
   {
         unsigned bsize = bdev_logical_block_size(bdev);
   
-       mutex_lock(&bdev->bd_inode->i_mutex);
+       inode_lock(bdev->bd_inode);
         i_size_write(bdev->bd_inode, size);
-       mutex_unlock(&bdev->bd_inode->i_mutex);
+       inode_unlock(bdev->bd_inode);
         while (bsize < PAGE_CACHE_SIZE) {
                 if (size & bsize)
                         break;
@@@ -1741,9 -1741,9 +1741,9 @@@ static void blkdev_vm_open(struct vm_ar
         struct inode *bd_inode = bdev_file_inode(vma->vm_file);
         struct block_device *bdev = I_BDEV(bd_inode);
   
-       mutex_lock(&bd_inode->i_mutex);
+       inode_lock(bd_inode);
         bdev->bd_map_count++;
-       mutex_unlock(&bd_inode->i_mutex);
+       inode_unlock(bd_inode);
   }
   
   static void blkdev_vm_close(struct vm_area_struct *vma)
@@@ -1751,9 -1751,9 +1751,9 @@@
         struct inode *bd_inode = bdev_file_inode(vma->vm_file);
         struct block_device *bdev = I_BDEV(bd_inode);
   
-       mutex_lock(&bd_inode->i_mutex);
+       inode_lock(bd_inode);
         bdev->bd_map_count--;
-       mutex_unlock(&bd_inode->i_mutex);
+       inode_unlock(bd_inode);
   }
   
   static const struct vm_operations_struct blkdev_dax_vm_ops = {
@@@ -1777,7 -1777,7 +1777,7 @@@ static int blkdev_mmap(struct file *fil
         struct block_device *bdev = I_BDEV(bd_inode);
   
         file_accessed(file);
-       mutex_lock(&bd_inode->i_mutex);
+       inode_lock(bd_inode);
         bdev->bd_map_count++;
         if (IS_DAX(bd_inode)) {
                 vma->vm_ops = &blkdev_dax_vm_ops;
@@@ -1785,7 -1785,7 +1785,7 @@@
         } else {
                 vma->vm_ops = &blkdev_default_vm_ops;
         }
-       mutex_unlock(&bd_inode->i_mutex);
+       inode_unlock(bd_inode);
   
         return 0;
   }
diff --combined fs/dax.c

index 206650f,55aa273..4fd6b0c
--- 1/fs/dax.c
--- 2/fs/dax.c
+++ b/fs/dax.c
@@@ -24,7 -24,6 +24,7 @@@
   #include <linux/memcontrol.h>
   #include <linux/mm.h>
   #include <linux/mutex.h>
+ +#include <linux/pagevec.h>
   #include <linux/pmem.h>
   #include <linux/sched.h>
   #include <linux/uio.h>
@@@ -246,14 -245,13 +246,14 @@@ ssize_t dax_do_io(struct kiocb *iocb, s
         loff_t end = pos + iov_iter_count(iter);
   
         memset(&bh, 0, sizeof(bh));
+ +      bh.b_bdev = inode->i_sb->s_bdev;
   
         if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) {
                 struct address_space *mapping = inode->i_mapping;
-               mutex_lock(&inode->i_mutex);
+               inode_lock(inode);
                 retval = filemap_write_and_wait_range(mapping, pos, end - 1);
                 if (retval) {
-                       mutex_unlock(&inode->i_mutex);
+                       inode_unlock(inode);
                         goto out;
                 }
         }
@@@ -265,7 -263,7 +265,7 @@@
         retval = dax_io(inode, iter, pos, end, get_block, &bh);
   
         if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
   
         if ((retval > 0) && end_io)
                 end_io(iocb, pos, retval, bh.b_private);
@@@ -326,199 -324,6 +326,199 @@@ static int copy_user_bh(struct page *to
         return 0;
   }
   
+ +#define NO_SECTOR -1
+ +#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_CACHE_SHIFT))
+ +
+ +static int dax_radix_entry(struct address_space *mapping, pgoff_t index,
+ +              sector_t sector, bool pmd_entry, bool dirty)
+ +{
+ +      struct radix_tree_root *page_tree = &mapping->page_tree;
+ +      pgoff_t pmd_index = DAX_PMD_INDEX(index);
+ +      int type, error = 0;
+ +      void *entry;
+ +
+ +      WARN_ON_ONCE(pmd_entry && !dirty);
+ +      __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+ +
+ +      spin_lock_irq(&mapping->tree_lock);
+ +
+ +      entry = radix_tree_lookup(page_tree, pmd_index);
+ +      if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) {
+ +              index = pmd_index;
+ +              goto dirty;
+ +      }
+ +
+ +      entry = radix_tree_lookup(page_tree, index);
+ +      if (entry) {
+ +              type = RADIX_DAX_TYPE(entry);
+ +              if (WARN_ON_ONCE(type != RADIX_DAX_PTE &&
+ +                                      type != RADIX_DAX_PMD)) {
+ +                      error = -EIO;
+ +                      goto unlock;
+ +              }
+ +
+ +              if (!pmd_entry || type == RADIX_DAX_PMD)
+ +                      goto dirty;
+ +
+ +              /*
+ +               * We only insert dirty PMD entries into the radix tree.  This
+ +               * means we don't need to worry about removing a dirty PTE
+ +               * entry and inserting a clean PMD entry, thus reducing the
+ +               * range we would flush with a follow-up fsync/msync call.
+ +               */
+ +              radix_tree_delete(&mapping->page_tree, index);
+ +              mapping->nrexceptional--;
+ +      }
+ +
+ +      if (sector == NO_SECTOR) {
+ +              /*
+ +               * This can happen during correct operation if our pfn_mkwrite
+ +               * fault raced against a hole punch operation.  If this
+ +               * happens the pte that was hole punched will have been
+ +               * unmapped and the radix tree entry will have been removed by
+ +               * the time we are called, but the call will still happen.  We
+ +               * will return all the way up to wp_pfn_shared(), where the
+ +               * pte_same() check will fail, eventually causing page fault
+ +               * to be retried by the CPU.
+ +               */
+ +              goto unlock;
+ +      }
+ +
+ +      error = radix_tree_insert(page_tree, index,
+ +                      RADIX_DAX_ENTRY(sector, pmd_entry));
+ +      if (error)
+ +              goto unlock;
+ +
+ +      mapping->nrexceptional++;
+ + dirty:
+ +      if (dirty)
+ +              radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
+ + unlock:
+ +      spin_unlock_irq(&mapping->tree_lock);
+ +      return error;
+ +}
+ +
+ +static int dax_writeback_one(struct block_device *bdev,
+ +              struct address_space *mapping, pgoff_t index, void *entry)
+ +{
+ +      struct radix_tree_root *page_tree = &mapping->page_tree;
+ +      int type = RADIX_DAX_TYPE(entry);
+ +      struct radix_tree_node *node;
+ +      struct blk_dax_ctl dax;
+ +      void **slot;
+ +      int ret = 0;
+ +
+ +      spin_lock_irq(&mapping->tree_lock);
+ +      /*
+ +       * Regular page slots are stabilized by the page lock even
+ +       * without the tree itself locked.  These unlocked entries
+ +       * need verification under the tree lock.
+ +       */
+ +      if (!__radix_tree_lookup(page_tree, index, &node, &slot))
+ +              goto unlock;
+ +      if (*slot != entry)
+ +              goto unlock;
+ +
+ +      /* another fsync thread may have already written back this entry */
+ +      if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
+ +              goto unlock;
+ +
+ +      if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) {
+ +              ret = -EIO;
+ +              goto unlock;
+ +      }
+ +
+ +      dax.sector = RADIX_DAX_SECTOR(entry);
+ +      dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);
+ +      spin_unlock_irq(&mapping->tree_lock);
+ +
+ +      /*
+ +       * We cannot hold tree_lock while calling dax_map_atomic() because it
+ +       * eventually calls cond_resched().
+ +       */
+ +      ret = dax_map_atomic(bdev, &dax);
+ +      if (ret < 0)
+ +              return ret;
+ +
+ +      if (WARN_ON_ONCE(ret < dax.size)) {
+ +              ret = -EIO;
+ +              goto unmap;
+ +      }
+ +
+ +      wb_cache_pmem(dax.addr, dax.size);
+ +
+ +      spin_lock_irq(&mapping->tree_lock);
+ +      radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
+ +      spin_unlock_irq(&mapping->tree_lock);
+ + unmap:
+ +      dax_unmap_atomic(bdev, &dax);
+ +      return ret;
+ +
+ + unlock:
+ +      spin_unlock_irq(&mapping->tree_lock);
+ +      return ret;
+ +}
+ +
+ +/*
+ + * Flush the mapping to the persistent domain within the byte range of [start,
+ + * end]. This is required by data integrity operations to ensure file data is
+ + * on persistent storage prior to completion of the operation.
+ + */
+ +int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
+ +              loff_t end)
+ +{
+ +      struct inode *inode = mapping->host;
+ +      struct block_device *bdev = inode->i_sb->s_bdev;
+ +      pgoff_t start_index, end_index, pmd_index;
+ +      pgoff_t indices[PAGEVEC_SIZE];
+ +      struct pagevec pvec;
+ +      bool done = false;
+ +      int i, ret = 0;
+ +      void *entry;
+ +
+ +      if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
+ +              return -EIO;
+ +
+ +      start_index = start >> PAGE_CACHE_SHIFT;
+ +      end_index = end >> PAGE_CACHE_SHIFT;
+ +      pmd_index = DAX_PMD_INDEX(start_index);
+ +
+ +      rcu_read_lock();
+ +      entry = radix_tree_lookup(&mapping->page_tree, pmd_index);
+ +      rcu_read_unlock();
+ +
+ +      /* see if the start of our range is covered by a PMD entry */
+ +      if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
+ +              start_index = pmd_index;
+ +
+ +      tag_pages_for_writeback(mapping, start_index, end_index);
+ +
+ +      pagevec_init(&pvec, 0);
+ +      while (!done) {
+ +              pvec.nr = find_get_entries_tag(mapping, start_index,
+ +                              PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
+ +                              pvec.pages, indices);
+ +
+ +              if (pvec.nr == 0)
+ +                      break;
+ +
+ +              for (i = 0; i < pvec.nr; i++) {
+ +                      if (indices[i] > end_index) {
+ +                              done = true;
+ +                              break;
+ +                      }
+ +
+ +                      ret = dax_writeback_one(bdev, mapping, indices[i],
+ +                                      pvec.pages[i]);
+ +                      if (ret < 0)
+ +                              return ret;
+ +              }
+ +      }
+ +      wmb_pmem();
+ +      return 0;
+ +}
+ +EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
+ +
   static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
                         struct vm_area_struct *vma, struct vm_fault *vmf)
   {
@@@ -558,11 -363,6 +558,11 @@@
         }
         dax_unmap_atomic(bdev, &dax);
   
+ +      error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false,
+ +                      vmf->flags & FAULT_FLAG_WRITE);
+ +      if (error)
+ +              goto out;
+ +
         error = vm_insert_mixed(vma, vaddr, dax.pfn);
   
    out:
@@@ -608,7 -408,6 +608,7 @@@ int __dax_fault(struct vm_area_struct *
   
         memset(&bh, 0, sizeof(bh));
         block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
+ +      bh.b_bdev = inode->i_sb->s_bdev;
         bh.b_size = PAGE_SIZE;
   
    repeat:
@@@ -688,7 -487,6 +688,7 @@@
                 delete_from_page_cache(page);
                 unlock_page(page);
                 page_cache_release(page);
+ +              page = NULL;
         }
   
         /*
@@@ -792,8 -590,7 +792,8 @@@ int __dax_pmd_fault(struct vm_area_stru
         struct block_device *bdev;
         pgoff_t size, pgoff;
         sector_t block;
- -      int result = 0;
+ +      int error, result = 0;
+ +      bool alloc = false;
   
         /* dax pmd mappings require pfn_t_devmap() */
         if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
@@@ -827,21 -624,13 +827,21 @@@
         }
   
         memset(&bh, 0, sizeof(bh));
+ +      bh.b_bdev = inode->i_sb->s_bdev;
         block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
   
         bh.b_size = PMD_SIZE;
- -      if (get_block(inode, block, &bh, write) != 0)
+ +
+ +      if (get_block(inode, block, &bh, 0) != 0)
                 return VM_FAULT_SIGBUS;
+ +
+ +      if (!buffer_mapped(&bh) && write) {
+ +              if (get_block(inode, block, &bh, 1) != 0)
+ +                      return VM_FAULT_SIGBUS;
+ +              alloc = true;
+ +      }
+ +
         bdev = bh.b_bdev;
- -      i_mmap_lock_read(mapping);
   
         /*
          * If the filesystem isn't willing to tell us the length of a hole,
@@@ -850,22 -639,19 +850,22 @@@
          */
         if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) {
                 dax_pmd_dbg(&bh, address, "allocated block too small");
- -              goto fallback;
+ +              return VM_FAULT_FALLBACK;
         }
   
         /*
          * If we allocated new storage, make sure no process has any
          * zero pages covering this hole
          */
- -      if (buffer_new(&bh)) {
- -              i_mmap_unlock_read(mapping);
- -              unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0);
- -              i_mmap_lock_read(mapping);
+ +      if (alloc) {
+ +              loff_t lstart = pgoff << PAGE_SHIFT;
+ +              loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */
+ +
+ +              truncate_pagecache_range(inode, lstart, lend);
         }
   
+ +      i_mmap_lock_read(mapping);
+ +
         /*
          * If a truncate happened while we were allocating blocks, we may
          * leave blocks allocated to the file that are beyond EOF.  We can't
@@@ -878,8 -664,7 +878,8 @@@
                 goto out;
         }
         if ((pgoff | PG_PMD_COLOUR) >= size) {
- -              dax_pmd_dbg(&bh, address, "pgoff unaligned");
+ +              dax_pmd_dbg(&bh, address,
+ +                              "offset + huge page size > file size");
                 goto fallback;
         }
   
@@@ -947,31 -732,6 +947,31 @@@
                 }
                 dax_unmap_atomic(bdev, &dax);
   
+ +              /*
+ +               * For PTE faults we insert a radix tree entry for reads, and
+ +               * leave it clean.  Then on the first write we dirty the radix
+ +               * tree entry via the dax_pfn_mkwrite() path.  This sequence
+ +               * allows the dax_pfn_mkwrite() call to be simpler and avoid a
+ +               * call into get_block() to translate the pgoff to a sector in
+ +               * order to be able to create a new radix tree entry.
+ +               *
+ +               * The PMD path doesn't have an equivalent to
+ +               * dax_pfn_mkwrite(), though, so for a read followed by a
+ +               * write we traverse all the way through __dax_pmd_fault()
+ +               * twice.  This means we can just skip inserting a radix tree
+ +               * entry completely on the initial read and just wait until
+ +               * the write to insert a dirty entry.
+ +               */
+ +              if (write) {
+ +                      error = dax_radix_entry(mapping, pgoff, dax.sector,
+ +                                      true, true);
+ +                      if (error) {
+ +                              dax_pmd_dbg(&bh, address,
+ +                                              "PMD radix insertion failed");
+ +                              goto fallback;
+ +                      }
+ +              }
+ +
                 dev_dbg(part_to_dev(bdev->bd_part),
                                 "%s: %s addr: %lx pfn: %lx sect: %llx\n",
                                 __func__, current->comm, address,
@@@ -1030,20 -790,15 +1030,20 @@@ EXPORT_SYMBOL_GPL(dax_pmd_fault)
    * dax_pfn_mkwrite - handle first write to DAX page
    * @vma: The virtual memory area where the fault occurred
    * @vmf: The description of the fault
- - *
    */
   int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
   {
- -      struct super_block *sb = file_inode(vma->vm_file)->i_sb;
+ +      struct file *file = vma->vm_file;
   
- -      sb_start_pagefault(sb);
- -      file_update_time(vma->vm_file);
- -      sb_end_pagefault(sb);
+ +      /*
+ +       * We pass NO_SECTOR to dax_radix_entry() because we expect that a
+ +       * RADIX_DAX_PTE entry already exists in the radix tree from a
+ +       * previous call to __dax_fault().  We just want to look up that PTE
+ +       * entry using vmf->pgoff and make sure the dirty tag is set.  This
+ +       * saves us from having to make a call to get_block() here to look
+ +       * up the sector.
+ +       */
+ +      dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false, true);
         return VM_FAULT_NOPAGE;
   }
   EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
@@@ -1080,7 -835,6 +1080,7 @@@ int dax_zero_page_range(struct inode *i
         BUG_ON((offset + length) > PAGE_CACHE_SIZE);
   
         memset(&bh, 0, sizeof(bh));
+ +      bh.b_bdev = inode->i_sb->s_bdev;
         bh.b_size = PAGE_CACHE_SIZE;
         err = get_block(inode, index, &bh, 0);
         if (err < 0)
diff --combined fs/ext4/file.c

index 8c8965c,8eb87e3..1126436
--- 1/fs/ext4/file.c
--- 2/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@@ -113,7 -113,7 +113,7 @@@ ext4_file_write_iter(struct kiocb *iocb
                 ext4_unwritten_wait(inode);
         }
   
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
         ret = generic_write_checks(iocb, from);
         if (ret <= 0)
                 goto out;
@@@ -169,7 -169,7 +169,7 @@@
         }
   
         ret = __generic_file_write_iter(iocb, from);
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
   
         if (ret > 0) {
                 ssize_t err;
@@@ -186,7 -186,7 +186,7 @@@
         return ret;
   
   out:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
         if (aio_mutex)
                 mutex_unlock(aio_mutex);
         return ret;
@@@ -291,8 -291,8 +291,8 @@@ static int ext4_dax_pfn_mkwrite(struct 
   {
         struct inode *inode = file_inode(vma->vm_file);
         struct super_block *sb = inode->i_sb;
- -      int ret = VM_FAULT_NOPAGE;
         loff_t size;
+ +      int ret;
   
         sb_start_pagefault(sb);
         file_update_time(vma->vm_file);
@@@ -300,8 -300,6 +300,8 @@@
         size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
         if (vmf->pgoff >= size)
                 ret = VM_FAULT_SIGBUS;
+ +      else
+ +              ret = dax_pfn_mkwrite(vma, vmf);
         up_read(&EXT4_I(inode)->i_mmap_sem);
         sb_end_pagefault(sb);
   
@@@ -563,11 -561,11 +563,11 @@@ static loff_t ext4_seek_data(struct fil
         int blkbits;
         int ret = 0;
   
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
   
         isize = i_size_read(inode);
         if (offset >= isize) {
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
                 return -ENXIO;
         }
   
@@@ -615,7 -613,7 +615,7 @@@
                 dataoff = (loff_t)last << blkbits;
         } while (last <= end);
   
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
   
         if (dataoff > isize)
                 return -ENXIO;
@@@ -636,11 -634,11 +636,11 @@@ static loff_t ext4_seek_hole(struct fil
         int blkbits;
         int ret = 0;
   
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
   
         isize = i_size_read(inode);
         if (offset >= isize) {
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
                 return -ENXIO;
         }
   
@@@ -691,7 -689,7 +691,7 @@@
                 break;
         } while (last <= end);
   
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
   
         if (holeoff > isize)
                 holeoff = isize;
diff --combined fs/inode.c

index 1e6dd38,bb86852..9f62db3
--- 1/fs/inode.c
--- 2/fs/inode.c
+++ b/fs/inode.c
@@@ -495,7 -495,7 +495,7 @@@ void clear_inode(struct inode *inode
          */
         spin_lock_irq(&inode->i_data.tree_lock);
         BUG_ON(inode->i_data.nrpages);
- -      BUG_ON(inode->i_data.nrshadows);
+ +      BUG_ON(inode->i_data.nrexceptional);
         spin_unlock_irq(&inode->i_data.tree_lock);
         BUG_ON(!list_empty(&inode->i_data.private_list));
         BUG_ON(!(inode->i_state & I_FREEING));
@@@ -966,9 -966,9 +966,9 @@@ void lock_two_nondirectories(struct ino
                 swap(inode1, inode2);
   
         if (inode1 && !S_ISDIR(inode1->i_mode))
-               mutex_lock(&inode1->i_mutex);
+               inode_lock(inode1);
         if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1)
-               mutex_lock_nested(&inode2->i_mutex, I_MUTEX_NONDIR2);
+               inode_lock_nested(inode2, I_MUTEX_NONDIR2);
   }
   EXPORT_SYMBOL(lock_two_nondirectories);
   
@@@ -980,9 -980,9 +980,9 @@@
   void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2)
   {
         if (inode1 && !S_ISDIR(inode1->i_mode))
-               mutex_unlock(&inode1->i_mutex);
+               inode_unlock(inode1);
         if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1)
-               mutex_unlock(&inode2->i_mutex);
+               inode_unlock(inode2);
   }
   EXPORT_SYMBOL(unlock_two_nondirectories);
   
diff --combined fs/xfs/xfs_file.c

index 55e16e2,bb2b8f3..52883ac
--- 1/fs/xfs/xfs_file.c
--- 2/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@@ -55,7 -55,7 +55,7 @@@ xfs_rw_ilock
         int                     type)
   {
         if (type & XFS_IOLOCK_EXCL)
-               mutex_lock(&VFS_I(ip)->i_mutex);
+               inode_lock(VFS_I(ip));
         xfs_ilock(ip, type);
   }
   
@@@ -66,7 -66,7 +66,7 @@@ xfs_rw_iunlock
   {
         xfs_iunlock(ip, type);
         if (type & XFS_IOLOCK_EXCL)
-               mutex_unlock(&VFS_I(ip)->i_mutex);
+               inode_unlock(VFS_I(ip));
   }
   
   static inline void
@@@ -76,7 -76,7 +76,7 @@@ xfs_rw_ilock_demote
   {
         xfs_ilock_demote(ip, type);
         if (type & XFS_IOLOCK_EXCL)
-               mutex_unlock(&VFS_I(ip)->i_mutex);
+               inode_unlock(VFS_I(ip));
   }
   
   /*
@@@ -1610,8 -1610,9 +1610,8 @@@ xfs_filemap_pmd_fault
   /*
    * pfn_mkwrite was originally intended to ensure we capture time stamp
    * updates on write faults. In reality, it's needed to serialise against
- - * truncate similar to page_mkwrite. Hence we open-code dax_pfn_mkwrite()
- - * here and cycle the XFS_MMAPLOCK_SHARED to ensure we serialise the fault
- - * barrier in place.
+ + * truncate similar to page_mkwrite. Hence we cycle the XFS_MMAPLOCK_SHARED
+ + * to ensure we serialise the fault barrier in place.
    */
   static int
   xfs_filemap_pfn_mkwrite(
@@@ -1634,8 -1635,6 +1634,8 @@@
         size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
         if (vmf->pgoff >= size)
                 ret = VM_FAULT_SIGBUS;
+ +      else if (IS_DAX(inode))
+ +              ret = dax_pfn_mkwrite(vma, vmf);
         xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
         sb_end_pagefault(inode->i_sb);
         return ret;
diff --combined include/linux/fs.h

index 0d75703,2df6c03..1a20462
--- 1/include/linux/fs.h
--- 2/include/linux/fs.h
+++ b/include/linux/fs.h
@@@ -433,8 -433,7 +433,8 @@@ struct address_space 
         struct rw_semaphore     i_mmap_rwsem;   /* protect tree, count, list */
         /* Protected by tree_lock together with the radix tree */
         unsigned long           nrpages;        /* number of total pages */
- -      unsigned long           nrshadows;      /* number of shadow entries */
+ +      /* number of shadow or DAX exceptional entries */
+ +      unsigned long           nrexceptional;
         pgoff_t                 writeback_index;/* writeback starts here */
         const struct address_space_operations *a_ops;   /* methods */
         unsigned long           flags;          /* error bits/gfp mask */
@@@ -715,6 -714,31 +715,31 @@@ enum inode_i_mutex_lock_clas
         I_MUTEX_PARENT2,
   };
   
+ static inline void inode_lock(struct inode *inode)
+ {
+       mutex_lock(&inode->i_mutex);
+ }
+ 
+ static inline void inode_unlock(struct inode *inode)
+ {
+       mutex_unlock(&inode->i_mutex);
+ }
+ 
+ static inline int inode_trylock(struct inode *inode)
+ {
+       return mutex_trylock(&inode->i_mutex);
+ }
+ 
+ static inline int inode_is_locked(struct inode *inode)
+ {
+       return mutex_is_locked(&inode->i_mutex);
+ }
+ 
+ static inline void inode_lock_nested(struct inode *inode, unsigned subclass)
+ {
+       mutex_lock_nested(&inode->i_mutex, subclass);
+ }
+ 
   void lock_two_nondirectories(struct inode *, struct inode*);
   void unlock_two_nondirectories(struct inode *, struct inode*);
   
@@@ -3048,8 -3072,8 +3073,8 @@@ static inline bool dir_emit_dots(struc
   }
   static inline bool dir_relax(struct inode *inode)
   {
-       mutex_unlock(&inode->i_mutex);
-       mutex_lock(&inode->i_mutex);
+       inode_unlock(inode);
+       inode_lock(inode);
         return !IS_DEADDIR(inode);
   }
   
diff --combined mm/filemap.c

index 2e7c8d9,30ab120..bc94386
--- 1/mm/filemap.c
--- 2/mm/filemap.c
+++ b/mm/filemap.c
@@@ -11,7 -11,6 +11,7 @@@
    */
   #include <linux/export.h>
   #include <linux/compiler.h>
+ +#include <linux/dax.h>
   #include <linux/fs.h>
   #include <linux/uaccess.h>
   #include <linux/capability.h>
@@@ -124,9 -123,9 +124,9 @@@ static void page_cache_tree_delete(stru
         __radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot);
   
         if (shadow) {
- -              mapping->nrshadows++;
+ +              mapping->nrexceptional++;
                 /*
- -               * Make sure the nrshadows update is committed before
+ +               * Make sure the nrexceptional update is committed before
                  * the nrpages update so that final truncate racing
                  * with reclaim does not see both counters 0 at the
                  * same time and miss a shadow entry.
@@@ -482,12 -481,6 +482,12 @@@ int filemap_write_and_wait_range(struc
   {
         int err = 0;
   
+ +      if (dax_mapping(mapping) && mapping->nrexceptional) {
+ +              err = dax_writeback_mapping_range(mapping, lstart, lend);
+ +              if (err)
+ +                      return err;
+ +      }
+ +
         if (mapping->nrpages) {
                 err = __filemap_fdatawrite_range(mapping, lstart, lend,
                                                  WB_SYNC_ALL);
@@@ -586,13 -579,9 +586,13 @@@ static int page_cache_tree_insert(struc
                 p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
                 if (!radix_tree_exceptional_entry(p))
                         return -EEXIST;
+ +
+ +              if (WARN_ON(dax_mapping(mapping)))
+ +                      return -EINVAL;
+ +
                 if (shadowp)
                         *shadowp = p;
- -              mapping->nrshadows--;
+ +              mapping->nrexceptional--;
                 if (node)
                         workingset_node_shadows_dec(node);
         }
@@@ -1256,9 -1245,9 +1256,9 @@@ repeat
                         if (radix_tree_deref_retry(page))
                                 goto restart;
                         /*
- -                       * A shadow entry of a recently evicted page,
- -                       * or a swap entry from shmem/tmpfs.  Return
- -                       * it without attempting to raise page count.
+ +                       * A shadow entry of a recently evicted page, a swap
+ +                       * entry from shmem/tmpfs or a DAX entry.  Return it
+ +                       * without attempting to raise page count.
                          */
                         goto export;
                 }
@@@ -1505,74 -1494,6 +1505,74 @@@ repeat
   }
   EXPORT_SYMBOL(find_get_pages_tag);
   
+ +/**
+ + * find_get_entries_tag - find and return entries that match @tag
+ + * @mapping:  the address_space to search
+ + * @start:    the starting page cache index
+ + * @tag:      the tag index
+ + * @nr_entries:       the maximum number of entries
+ + * @entries:  where the resulting entries are placed
+ + * @indices:  the cache indices corresponding to the entries in @entries
+ + *
+ + * Like find_get_entries, except we only return entries which are tagged with
+ + * @tag.
+ + */
+ +unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
+ +                      int tag, unsigned int nr_entries,
+ +                      struct page **entries, pgoff_t *indices)
+ +{
+ +      void **slot;
+ +      unsigned int ret = 0;
+ +      struct radix_tree_iter iter;
+ +
+ +      if (!nr_entries)
+ +              return 0;
+ +
+ +      rcu_read_lock();
+ +restart:
+ +      radix_tree_for_each_tagged(slot, &mapping->page_tree,
+ +                                 &iter, start, tag) {
+ +              struct page *page;
+ +repeat:
+ +              page = radix_tree_deref_slot(slot);
+ +              if (unlikely(!page))
+ +                      continue;
+ +              if (radix_tree_exception(page)) {
+ +                      if (radix_tree_deref_retry(page)) {
+ +                              /*
+ +                               * Transient condition which can only trigger
+ +                               * when entry at index 0 moves out of or back
+ +                               * to root: none yet gotten, safe to restart.
+ +                               */
+ +                              goto restart;
+ +                      }
+ +
+ +                      /*
+ +                       * A shadow entry of a recently evicted page, a swap
+ +                       * entry from shmem/tmpfs or a DAX entry.  Return it
+ +                       * without attempting to raise page count.
+ +                       */
+ +                      goto export;
+ +              }
+ +              if (!page_cache_get_speculative(page))
+ +                      goto repeat;
+ +
+ +              /* Has the page moved? */
+ +              if (unlikely(page != *slot)) {
+ +                      page_cache_release(page);
+ +                      goto repeat;
+ +              }
+ +export:
+ +              indices[ret] = iter.index;
+ +              entries[ret] = page;
+ +              if (++ret == nr_entries)
+ +                      break;
+ +      }
+ +      rcu_read_unlock();
+ +      return ret;
+ +}
+ +EXPORT_SYMBOL(find_get_entries_tag);
+ +
   /*
    * CD/DVDs are error prone. When a medium error occurs, the driver may fail
    * a _large_ part of the i/o request. Imagine the worst scenario:
@@@ -2763,11 -2684,11 +2763,11 @@@ ssize_t generic_file_write_iter(struct 
         struct inode *inode = file->f_mapping->host;
         ssize_t ret;
   
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
         ret = generic_write_checks(iocb, from);
         if (ret > 0)
                 ret = __generic_file_write_iter(iocb, from);
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
   
         if (ret > 0) {
                 ssize_t err;
author	Linus Torvalds <torvalds@linux-foundation.org>
	Sat, 23 Jan 2016 20:24:56 +0000 (12:24 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Sat, 23 Jan 2016 20:24:56 +0000 (12:24 -0800)
		1	2
fs/block_dev.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/dax.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/ext4/file.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/inode.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/xfs/xfs_file.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/fs.h	patch \|	diff1 \|	diff2 \|	blob \| history
mm/filemap.c	patch \|	diff1 \|	diff2 \|	blob \| history