Merge branch 'for-linus-unmerged' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable
author Linus Torvalds <torvalds@linux-foundation.org>
Mon, 28 Mar 2011 22:31:05 +0000 (15:31 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Mon, 28 Mar 2011 22:31:05 +0000 (15:31 -0700)
* 'for-linus-unmerged' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable: (45 commits)
  Btrfs: fix __btrfs_map_block on 32 bit machines
  btrfs: fix possible deadlock by clearing __GFP_FS flag
  btrfs: check link counter overflow in link(2)
  btrfs: don't mess with i_nlink of unlocked inode in rename()
  Btrfs: check return value of btrfs_alloc_path()
  Btrfs: fix OOPS of empty filesystem after balance
  Btrfs: fix memory leak of empty filesystem after balance
  Btrfs: fix return value of setflags ioctl
  Btrfs: fix uncheck memory allocations
  btrfs: make inode ref log recovery faster
  Btrfs: add btrfs_trim_fs() to handle FITRIM
  Btrfs: adjust btrfs_discard_extent() return errors and trimmed bytes
  Btrfs: make btrfs_map_block() return entire free extent for each device of RAID0/1/10/DUP
  Btrfs: make update_reserved_bytes() public
  btrfs: return EXDEV when linking from different subvolumes
  Btrfs: Per file/directory controls for COW and compression
  Btrfs: add datacow flag in inode flag
  btrfs: use GFP_NOFS instead of GFP_KERNEL
  Btrfs: check return value of read_tree_block()
  btrfs: properly access unaligned checksum buffer
  ...

Fix up trivial conflicts in fs/btrfs/volumes.c due to plug removal in
the block layer.

1  2 
fs/btrfs/disk-io.c
fs/btrfs/extent_io.c
fs/btrfs/inode.c
fs/btrfs/ioctl.c
fs/btrfs/volumes.c
fs/btrfs/xattr.c
include/linux/fs.h

diff --combined fs/btrfs/disk-io.c
@@@ -29,6 -29,7 +29,7 @@@
  #include <linux/crc32c.h>
  #include <linux/slab.h>
  #include <linux/migrate.h>
+ #include <asm/unaligned.h>
  #include "compat.h"
  #include "ctree.h"
  #include "disk-io.h"
@@@ -198,7 -199,7 +199,7 @@@ u32 btrfs_csum_data(struct btrfs_root *
  
  void btrfs_csum_final(u32 crc, char *result)
  {
-       *(__le32 *)result = ~cpu_to_le32(crc);
+       put_unaligned_le32(~crc, result);
  }
  
  /*
@@@ -323,6 -324,7 +324,7 @@@ static int btree_read_extent_buffer_pag
        int num_copies = 0;
        int mirror_num = 0;
  
+       clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
        io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
        while (1) {
                ret = read_extent_buffer_pages(io_tree, eb, start, 1,
                    !verify_parent_transid(io_tree, eb, parent_transid))
                        return ret;
  
+               /*
+                * This buffer's crc is fine, but its contents are corrupted, so
+                * there is no reason to read the other copies, they won't be
+                * any less wrong.
+                */
+               if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
+                       return ret;
                num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
                                              eb->start, eb->len);
                if (num_copies == 1)
@@@ -419,6 -429,73 +429,73 @@@ static int check_tree_block_fsid(struc
        return ret;
  }
  
+ #define CORRUPT(reason, eb, root, slot)                               \
+       printk(KERN_CRIT "btrfs: corrupt leaf, %s: block=%llu," \
+              "root=%llu, slot=%d\n", reason,                  \
+              (unsigned long long)btrfs_header_bytenr(eb),     \
+              (unsigned long long)root->objectid, slot)
+ static noinline int check_leaf(struct btrfs_root *root,
+                              struct extent_buffer *leaf)
+ {
+       struct btrfs_key key;
+       struct btrfs_key leaf_key;
+       u32 nritems = btrfs_header_nritems(leaf);
+       int slot;
+       if (nritems == 0)
+               return 0;
+       /* Check the 0 item */
+       if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) !=
+           BTRFS_LEAF_DATA_SIZE(root)) {
+               CORRUPT("invalid item offset size pair", leaf, root, 0);
+               return -EIO;
+       }
+       /*
+        * Check to make sure each items keys are in the correct order and their
+        * offsets make sense.  We only have to loop through nritems-1 because
+        * we check the current slot against the next slot, which verifies the
+        * next slot's offset+size makes sense and that the current's slot
+        * offset is correct.
+        */
+       for (slot = 0; slot < nritems - 1; slot++) {
+               btrfs_item_key_to_cpu(leaf, &leaf_key, slot);
+               btrfs_item_key_to_cpu(leaf, &key, slot + 1);
+               /* Make sure the keys are in the right order */
+               if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) {
+                       CORRUPT("bad key order", leaf, root, slot);
+                       return -EIO;
+               }
+               /*
+                * Make sure the offset and ends are right, remember that the
+                * item data starts at the end of the leaf and grows towards the
+                * front.
+                */
+               if (btrfs_item_offset_nr(leaf, slot) !=
+                       btrfs_item_end_nr(leaf, slot + 1)) {
+                       CORRUPT("slot offset bad", leaf, root, slot);
+                       return -EIO;
+               }
+               /*
+                * Check to make sure that we don't point outside of the leaf,
+                * just incase all the items are consistent to eachother, but
+                * all point outside of the leaf.
+                */
+               if (btrfs_item_end_nr(leaf, slot) >
+                   BTRFS_LEAF_DATA_SIZE(root)) {
+                       CORRUPT("slot end outside of leaf", leaf, root, slot);
+                       return -EIO;
+               }
+       }
+       return 0;
+ }
  #ifdef CONFIG_DEBUG_LOCK_ALLOC
  void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level)
  {
@@@ -485,8 -562,20 +562,20 @@@ static int btree_readpage_end_io_hook(s
        btrfs_set_buffer_lockdep_class(eb, found_level);
  
        ret = csum_tree_block(root, eb, 1);
-       if (ret)
+       if (ret) {
                ret = -EIO;
+               goto err;
+       }
+       /*
+        * If this is a leaf block and it is corrupt, set the corrupt bit so
+        * that we don't try and read the other copies of this block, just
+        * return -EIO.
+        */
+       if (found_level == 0 && check_leaf(root, eb)) {
+               set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
+               ret = -EIO;
+       }
  
        end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
        end = eb->start + end - 1;
@@@ -847,6 -936,7 +936,6 @@@ static const struct address_space_opera
        .writepages     = btree_writepages,
        .releasepage    = btree_releasepage,
        .invalidatepage = btree_invalidatepage,
 -      .sync_page      = block_sync_page,
  #ifdef CONFIG_MIGRATION
        .migratepage    = btree_migratepage,
  #endif
@@@ -1159,7 -1249,10 +1248,10 @@@ struct btrfs_root *btrfs_read_fs_root_n
                     root, fs_info, location->objectid);
  
        path = btrfs_alloc_path();
-       BUG_ON(!path);
+       if (!path) {
+               kfree(root);
+               return ERR_PTR(-ENOMEM);
+       }
        ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
        if (ret == 0) {
                l = path->nodes[0];
@@@ -1330,6 -1423,82 +1422,6 @@@ static int btrfs_congested_fn(void *con
  }
  
  /*
 - * this unplugs every device on the box, and it is only used when page
 - * is null
 - */
 -static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 -{
 -      struct btrfs_device *device;
 -      struct btrfs_fs_info *info;
 -
 -      info = (struct btrfs_fs_info *)bdi->unplug_io_data;
 -      list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
 -              if (!device->bdev)
 -                      continue;
 -
 -              bdi = blk_get_backing_dev_info(device->bdev);
 -              if (bdi->unplug_io_fn)
 -                      bdi->unplug_io_fn(bdi, page);
 -      }
 -}
 -
 -static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 -{
 -      struct inode *inode;
 -      struct extent_map_tree *em_tree;
 -      struct extent_map *em;
 -      struct address_space *mapping;
 -      u64 offset;
 -
 -      /* the generic O_DIRECT read code does this */
 -      if (1 || !page) {
 -              __unplug_io_fn(bdi, page);
 -              return;
 -      }
 -
 -      /*
 -       * page->mapping may change at any time.  Get a consistent copy
 -       * and use that for everything below
 -       */
 -      smp_mb();
 -      mapping = page->mapping;
 -      if (!mapping)
 -              return;
 -
 -      inode = mapping->host;
 -
 -      /*
 -       * don't do the expensive searching for a small number of
 -       * devices
 -       */
 -      if (BTRFS_I(inode)->root->fs_info->fs_devices->open_devices <= 2) {
 -              __unplug_io_fn(bdi, page);
 -              return;
 -      }
 -
 -      offset = page_offset(page);
 -
 -      em_tree = &BTRFS_I(inode)->extent_tree;
 -      read_lock(&em_tree->lock);
 -      em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
 -      read_unlock(&em_tree->lock);
 -      if (!em) {
 -              __unplug_io_fn(bdi, page);
 -              return;
 -      }
 -
 -      if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
 -              free_extent_map(em);
 -              __unplug_io_fn(bdi, page);
 -              return;
 -      }
 -      offset = offset - em->start;
 -      btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree,
 -                        em->block_start + offset, page);
 -      free_extent_map(em);
 -}
 -
 -/*
   * If this fails, caller must call bdi_destroy() to get rid of the
   * bdi again.
   */
@@@ -1343,6 -1512,8 +1435,6 @@@ static int setup_bdi(struct btrfs_fs_in
                return err;
  
        bdi->ra_pages   = default_backing_dev_info.ra_pages;
 -      bdi->unplug_io_fn       = btrfs_unplug_io_fn;
 -      bdi->unplug_io_data     = info;
        bdi->congested_fn       = btrfs_congested_fn;
        bdi->congested_data     = info;
        return 0;
@@@ -1553,6 -1724,8 +1645,8 @@@ struct btrfs_root *open_ctree(struct su
                goto fail_bdi;
        }
  
+       fs_info->btree_inode->i_mapping->flags &= ~__GFP_FS;
        INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
        INIT_LIST_HEAD(&fs_info->trans_list);
        INIT_LIST_HEAD(&fs_info->dead_roots);
  
        btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
  
+       /*
+        * In the long term, we'll store the compression type in the super
+        * block, and it'll be used for per file compression control.
+        */
+       fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
        ret = btrfs_parse_options(tree_root, options);
        if (ret) {
                err = ret;
        fs_info->metadata_alloc_profile = (u64)-1;
        fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
  
+       ret = btrfs_init_space_info(fs_info);
+       if (ret) {
+               printk(KERN_ERR "Failed to initial space info: %d\n", ret);
+               goto fail_block_groups;
+       }
        ret = btrfs_read_block_groups(extent_root);
        if (ret) {
                printk(KERN_ERR "Failed to read block groups: %d\n", ret);
  
        if (!(sb->s_flags & MS_RDONLY)) {
                down_read(&fs_info->cleanup_work_sem);
-               btrfs_orphan_cleanup(fs_info->fs_root);
-               btrfs_orphan_cleanup(fs_info->tree_root);
+               err = btrfs_orphan_cleanup(fs_info->fs_root);
+               if (!err)
+                       err = btrfs_orphan_cleanup(fs_info->tree_root);
                up_read(&fs_info->cleanup_work_sem);
+               if (err) {
+                       close_ctree(tree_root);
+                       return ERR_PTR(err);
+               }
        }
  
        return tree_root;
@@@ -2356,8 -2546,12 +2467,12 @@@ int btrfs_cleanup_fs_roots(struct btrfs
  
                root_objectid = gang[ret - 1]->root_key.objectid + 1;
                for (i = 0; i < ret; i++) {
+                       int err;
                        root_objectid = gang[i]->root_key.objectid;
-                       btrfs_orphan_cleanup(gang[i]);
+                       err = btrfs_orphan_cleanup(gang[i]);
+                       if (err)
+                               return err;
                }
                root_objectid++;
        }
@@@ -2414,7 -2608,7 +2529,7 @@@ int close_ctree(struct btrfs_root *root
         * ERROR state on disk.
         *
         * 2. when btrfs flips readonly just in btrfs_commit_super,
 -       * and in such case, btrfs cannnot write sb via btrfs_commit_super,
 +       * and in such case, btrfs cannot write sb via btrfs_commit_super,
         * and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag,
         * btrfs will cleanup all FS resources first and write sb then.
         */
@@@ -2868,7 -3062,10 +2983,10 @@@ static int btrfs_destroy_pinned_extent(
                        break;
  
                /* opt_discard */
-               ret = btrfs_error_discard_extent(root, start, end + 1 - start);
+               if (btrfs_test_opt(root, DISCARD))
+                       ret = btrfs_error_discard_extent(root, start,
+                                                        end + 1 - start,
+                                                        NULL);
  
                clear_extent_dirty(unpin, start, end, GFP_NOFS);
                btrfs_error_unpin_extent_range(root, start, end);
diff --combined fs/btrfs/extent_io.c
@@@ -2188,10 -2188,12 +2188,12 @@@ static int __extent_writepage(struct pa
        unsigned long nr_written = 0;
  
        if (wbc->sync_mode == WB_SYNC_ALL)
 -              write_flags = WRITE_SYNC_PLUG;
 +              write_flags = WRITE_SYNC;
        else
                write_flags = WRITE;
  
+       trace___extent_writepage(page, inode, wbc);
        WARN_ON(!PageLocked(page));
        pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
        if (page->index > end_index ||
@@@ -3690,6 -3692,7 +3692,7 @@@ int map_private_extent_buffer(struct ex
                       "wanted %lu %lu\n", (unsigned long long)eb->start,
                       eb->len, start, min_len);
                WARN_ON(1);
+               return -EINVAL;
        }
  
        p = extent_buffer_page(eb, i);
diff --combined fs/btrfs/inode.c
@@@ -50,6 -50,7 +50,7 @@@
  #include "tree-log.h"
  #include "compression.h"
  #include "locking.h"
+ #include "free-space-cache.h"
  
  struct btrfs_iget_args {
        u64 ino;
@@@ -70,6 -71,7 +71,7 @@@ static struct kmem_cache *btrfs_inode_c
  struct kmem_cache *btrfs_trans_handle_cachep;
  struct kmem_cache *btrfs_transaction_cachep;
  struct kmem_cache *btrfs_path_cachep;
+ struct kmem_cache *btrfs_free_space_cachep;
  
  #define S_SHIFT 12
  static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
@@@ -82,7 -84,8 +84,8 @@@
        [S_IFLNK >> S_SHIFT]    = BTRFS_FT_SYMLINK,
  };
  
- static void btrfs_truncate(struct inode *inode);
+ static int btrfs_setsize(struct inode *inode, loff_t newsize);
+ static int btrfs_truncate(struct inode *inode);
  static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
  static noinline int cow_file_range(struct inode *inode,
                                   struct page *locked_page,
                                   unsigned long *nr_written, int unlock);
  
  static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
 -                                   struct inode *inode,  struct inode *dir)
 +                                   struct inode *inode,  struct inode *dir,
 +                                   const struct qstr *qstr)
  {
        int err;
  
        err = btrfs_init_acl(trans, inode, dir);
        if (!err)
 -              err = btrfs_xattr_security_init(trans, inode, dir);
 +              err = btrfs_xattr_security_init(trans, inode, dir, qstr);
        return err;
  }
  
@@@ -288,6 -290,7 +291,7 @@@ static noinline int add_async_extent(st
        struct async_extent *async_extent;
  
        async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
+       BUG_ON(!async_extent);
        async_extent->start = start;
        async_extent->ram_size = ram_size;
        async_extent->compressed_size = compressed_size;
@@@ -382,9 -385,11 +386,11 @@@ again
         */
        if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
            (btrfs_test_opt(root, COMPRESS) ||
-            (BTRFS_I(inode)->force_compress))) {
+            (BTRFS_I(inode)->force_compress) ||
+            (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
                WARN_ON(pages);
                pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
+               BUG_ON(!pages);
  
                if (BTRFS_I(inode)->force_compress)
                        compress_type = BTRFS_I(inode)->force_compress;
@@@ -1254,7 -1259,8 +1260,8 @@@ static int run_delalloc_range(struct in
                ret = run_delalloc_nocow(inode, locked_page, start, end,
                                         page_started, 0, nr_written);
        else if (!btrfs_test_opt(root, COMPRESS) &&
-                !(BTRFS_I(inode)->force_compress))
+                !(BTRFS_I(inode)->force_compress) &&
+                !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))
                ret = cow_file_range(inode, locked_page, start, end,
                                      page_started, nr_written, 1);
        else
@@@ -1461,8 -1467,11 +1468,11 @@@ static int btrfs_submit_bio_hook(struc
                if (bio_flags & EXTENT_BIO_COMPRESSED) {
                        return btrfs_submit_compressed_read(inode, bio,
                                                    mirror_num, bio_flags);
-               } else if (!skip_sum)
-                       btrfs_lookup_bio_sums(root, inode, bio, NULL);
+               } else if (!skip_sum) {
+                       ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
+                       if (ret)
+                               return ret;
+               }
                goto mapit;
        } else if (!skip_sum) {
                /* csum items have already been cloned */
@@@ -1785,6 -1794,8 +1795,8 @@@ out
  static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
                                struct extent_state *state, int uptodate)
  {
+       trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
        ClearPagePrivate2(page);
        return btrfs_finish_ordered_io(page->mapping->host, start, end);
  }
@@@ -1895,10 -1906,10 +1907,10 @@@ static int btrfs_io_failed_hook(struct 
        else
                rw = READ;
  
-       BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
+       ret = BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
                                                      failrec->last_mirror,
                                                      failrec->bio_flags, 0);
-       return 0;
+       return ret;
  }
  
  /*
@@@ -2282,7 -2293,7 +2294,7 @@@ int btrfs_orphan_del(struct btrfs_trans
   * this cleans up any orphans that may be left on the list from the last use
   * of this root.
   */
- void btrfs_orphan_cleanup(struct btrfs_root *root)
+ int btrfs_orphan_cleanup(struct btrfs_root *root)
  {
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        int ret = 0, nr_unlink = 0, nr_truncate = 0;
  
        if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
-               return;
+               return 0;
  
        path = btrfs_alloc_path();
-       BUG_ON(!path);
+       if (!path) {
+               ret = -ENOMEM;
+               goto out;
+       }
        path->reada = -1;
  
        key.objectid = BTRFS_ORPHAN_OBJECTID;
  
        while (1) {
                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-               if (ret < 0) {
-                       printk(KERN_ERR "Error searching slot for orphan: %d"
-                              "\n", ret);
-                       break;
-               }
+               if (ret < 0)
+                       goto out;
  
                /*
                 * if ret == 0 means we found what we were searching for, which
                 * find the key and see if we have stuff that matches
                 */
                if (ret > 0) {
+                       ret = 0;
                        if (path->slots[0] == 0)
                                break;
                        path->slots[0]--;
                found_key.type = BTRFS_INODE_ITEM_KEY;
                found_key.offset = 0;
                inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
-               BUG_ON(IS_ERR(inode));
+               if (IS_ERR(inode)) {
+                       ret = PTR_ERR(inode);
+                       goto out;
+               }
  
                /*
                 * add this inode to the orphan list so btrfs_orphan_del does
                 */
                if (is_bad_inode(inode)) {
                        trans = btrfs_start_transaction(root, 0);
-                       BUG_ON(IS_ERR(trans));
+                       if (IS_ERR(trans)) {
+                               ret = PTR_ERR(trans);
+                               goto out;
+                       }
                        btrfs_orphan_del(trans, inode);
                        btrfs_end_transaction(trans, root);
                        iput(inode);
  
                /* if we have links, this was a truncate, lets do that */
                if (inode->i_nlink) {
+                       if (!S_ISREG(inode->i_mode)) {
+                               WARN_ON(1);
+                               iput(inode);
+                               continue;
+                       }
                        nr_truncate++;
-                       btrfs_truncate(inode);
+                       ret = btrfs_truncate(inode);
                } else {
                        nr_unlink++;
                }
  
                /* this will do delete_inode and everything for us */
                iput(inode);
+               if (ret)
+                       goto out;
        }
-       btrfs_free_path(path);
        root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
  
        if (root->orphan_block_rsv)
  
        if (root->orphan_block_rsv || root->orphan_item_inserted) {
                trans = btrfs_join_transaction(root, 1);
-               BUG_ON(IS_ERR(trans));
-               btrfs_end_transaction(trans, root);
+               if (!IS_ERR(trans))
+                       btrfs_end_transaction(trans, root);
        }
  
        if (nr_unlink)
                printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
        if (nr_truncate)
                printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
+ out:
+       if (ret)
+               printk(KERN_CRIT "btrfs: could not do orphan cleanup %d\n", ret);
+       btrfs_free_path(path);
+       return ret;
  }
  
  /*
@@@ -2507,6 -2536,8 +2537,8 @@@ static void btrfs_read_locked_inode(str
        BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
  
        alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
+       if (location.objectid == BTRFS_FREE_SPACE_OBJECTID)
+               inode->i_mapping->flags &= ~__GFP_FS;
  
        /*
         * try to precache a NULL acl entry for files that don't have
@@@ -2635,10 -2666,10 +2667,10 @@@ failed
   * recovery code.  It remove a link in a directory with a given name, and
   * also drops the back refs in the inode to the directory
   */
- int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
-                      struct btrfs_root *root,
-                      struct inode *dir, struct inode *inode,
-                      const char *name, int name_len)
+ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *root,
+                               struct inode *dir, struct inode *inode,
+                               const char *name, int name_len)
  {
        struct btrfs_path *path;
        int ret = 0;
        btrfs_i_size_write(dir, dir->i_size - name_len * 2);
        inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
        btrfs_update_inode(trans, root, dir);
-       btrfs_drop_nlink(inode);
-       ret = btrfs_update_inode(trans, root, inode);
  out:
        return ret;
  }
  
+ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
+                      struct btrfs_root *root,
+                      struct inode *dir, struct inode *inode,
+                      const char *name, int name_len)
+ {
+       int ret;
+       ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
+       if (!ret) {
+               btrfs_drop_nlink(inode);
+               ret = btrfs_update_inode(trans, root, inode);
+       }
+       return ret;
+ }
+               
  /* helper to check if there is any shared block in the path */
  static int check_path_shared(struct btrfs_root *root,
                             struct btrfs_path *path)
        return ret;
  }
  
- int btrfs_cont_expand(struct inode *inode, loff_t size)
+ /*
+  * This function puts in dummy file extents for the area we're creating a hole
+  * for.  So if we are truncating this file to a larger size we need to insert
+  * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for
+  * the range between oldsize and size
+  */
+ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
  {
        struct btrfs_trans_handle *trans;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct extent_map *em = NULL;
        struct extent_state *cached_state = NULL;
        u64 mask = root->sectorsize - 1;
-       u64 hole_start = (inode->i_size + mask) & ~mask;
+       u64 hole_start = (oldsize + mask) & ~mask;
        u64 block_end = (size + mask) & ~mask;
        u64 last_byte;
        u64 cur_offset;
                        err = btrfs_drop_extents(trans, inode, cur_offset,
                                                 cur_offset + hole_size,
                                                 &hint_byte, 1);
-                       BUG_ON(err);
+                       if (err)
+                               break;
  
                        err = btrfs_insert_file_extent(trans, root,
                                        inode->i_ino, cur_offset, 0,
                                        0, hole_size, 0, hole_size,
                                        0, 0, 0);
-                       BUG_ON(err);
+                       if (err)
+                               break;
  
                        btrfs_drop_extent_cache(inode, hole_start,
                                        last_byte - 1, 0);
        return err;
  }
  
- static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
+ static int btrfs_setsize(struct inode *inode, loff_t newsize)
  {
-       struct btrfs_root *root = BTRFS_I(inode)->root;
-       struct btrfs_trans_handle *trans;
-       unsigned long nr;
+       loff_t oldsize = i_size_read(inode);
        int ret;
  
-       if (attr->ia_size == inode->i_size)
+       if (newsize == oldsize)
                return 0;
  
-       if (attr->ia_size > inode->i_size) {
-               unsigned long limit;
-               limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
-               if (attr->ia_size > inode->i_sb->s_maxbytes)
-                       return -EFBIG;
-               if (limit != RLIM_INFINITY && attr->ia_size > limit) {
-                       send_sig(SIGXFSZ, current, 0);
-                       return -EFBIG;
-               }
-       }
-       trans = btrfs_start_transaction(root, 5);
-       if (IS_ERR(trans))
-               return PTR_ERR(trans);
-       btrfs_set_trans_block_group(trans, inode);
-       ret = btrfs_orphan_add(trans, inode);
-       BUG_ON(ret);
-       nr = trans->blocks_used;
-       btrfs_end_transaction(trans, root);
-       btrfs_btree_balance_dirty(root, nr);
-       if (attr->ia_size > inode->i_size) {
-               ret = btrfs_cont_expand(inode, attr->ia_size);
+       if (newsize > oldsize) {
+               i_size_write(inode, newsize);
+               btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
+               truncate_pagecache(inode, oldsize, newsize);
+               ret = btrfs_cont_expand(inode, oldsize, newsize);
                if (ret) {
-                       btrfs_truncate(inode);
+                       btrfs_setsize(inode, oldsize);
                        return ret;
                }
  
-               i_size_write(inode, attr->ia_size);
-               btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
+               mark_inode_dirty(inode);
+       } else {
  
-               trans = btrfs_start_transaction(root, 0);
-               BUG_ON(IS_ERR(trans));
-               btrfs_set_trans_block_group(trans, inode);
-               trans->block_rsv = root->orphan_block_rsv;
-               BUG_ON(!trans->block_rsv);
+               /*
+                * We're truncating a file that used to have good data down to
+                * zero. Make sure it gets into the ordered flush list so that
+                * any new writes get down to disk quickly.
+                */
+               if (newsize == 0)
+                       BTRFS_I(inode)->ordered_data_close = 1;
  
-               ret = btrfs_update_inode(trans, root, inode);
-               BUG_ON(ret);
-               if (inode->i_nlink > 0) {
-                       ret = btrfs_orphan_del(trans, inode);
-                       BUG_ON(ret);
-               }
-               nr = trans->blocks_used;
-               btrfs_end_transaction(trans, root);
-               btrfs_btree_balance_dirty(root, nr);
-               return 0;
+               /* we don't support swapfiles, so vmtruncate shouldn't fail */
+               truncate_setsize(inode, newsize);
+               ret = btrfs_truncate(inode);
        }
  
-       /*
-        * We're truncating a file that used to have good data down to
-        * zero. Make sure it gets into the ordered flush list so that
-        * any new writes get down to disk quickly.
-        */
-       if (attr->ia_size == 0)
-               BTRFS_I(inode)->ordered_data_close = 1;
-       /* we don't support swapfiles, so vmtruncate shouldn't fail */
-       ret = vmtruncate(inode, attr->ia_size);
-       BUG_ON(ret);
-       return 0;
+       return ret;
  }
  
  static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
                return err;
  
        if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
-               err = btrfs_setattr_size(inode, attr);
+               err = btrfs_setsize(inode, attr->ia_size);
                if (err)
                        return err;
        }
@@@ -3730,6 -3742,8 +3743,8 @@@ void btrfs_evict_inode(struct inode *in
        unsigned long nr;
        int ret;
  
+       trace_btrfs_inode_evict(inode);
        truncate_inode_pages(&inode->i_data, 0);
        if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 ||
                               root == root->fs_info->tree_root))
@@@ -4072,7 -4086,6 +4087,6 @@@ struct inode *btrfs_iget(struct super_b
                BTRFS_I(inode)->root = root;
                memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
                btrfs_read_locked_inode(inode);
                inode_tree_add(inode);
                unlock_new_inode(inode);
                if (new)
@@@ -4147,8 -4160,10 +4161,10 @@@ struct inode *btrfs_lookup_dentry(struc
        if (!IS_ERR(inode) && root != sub_root) {
                down_read(&root->fs_info->cleanup_work_sem);
                if (!(inode->i_sb->s_flags & MS_RDONLY))
-                       btrfs_orphan_cleanup(sub_root);
+                       ret = btrfs_orphan_cleanup(sub_root);
                up_read(&root->fs_info->cleanup_work_sem);
+               if (ret)
+                       inode = ERR_PTR(ret);
        }
  
        return inode;
@@@ -4282,6 -4297,9 +4298,9 @@@ static int btrfs_real_readdir(struct fi
                while (di_cur < di_total) {
                        struct btrfs_key location;
  
+                       if (verify_dir_item(root, leaf, di))
+                               break;
                        name_len = btrfs_dir_name_len(leaf, di);
                        if (name_len <= sizeof(tmp_name)) {
                                name_ptr = tmp_name;
@@@ -4517,6 -4535,8 +4536,8 @@@ static struct inode *btrfs_new_inode(st
                return ERR_PTR(-ENOMEM);
  
        if (dir) {
+               trace_btrfs_inode_request(dir);
                ret = btrfs_set_inode_index(dir, index);
                if (ret) {
                        iput(inode);
        if ((mode & S_IFREG)) {
                if (btrfs_test_opt(root, NODATASUM))
                        BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
-               if (btrfs_test_opt(root, NODATACOW))
+               if (btrfs_test_opt(root, NODATACOW) ||
+                   (BTRFS_I(dir)->flags & BTRFS_INODE_NODATACOW))
                        BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
        }
  
        insert_inode_hash(inode);
        inode_tree_add(inode);
+       trace_btrfs_inode_new(inode);
        return inode;
  fail:
        if (dir)
@@@ -4705,7 -4729,7 +4730,7 @@@ static int btrfs_mknod(struct inode *di
        if (IS_ERR(inode))
                goto out_unlock;
  
 -      err = btrfs_init_inode_security(trans, inode, dir);
 +      err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
        if (err) {
                drop_inode = 1;
                goto out_unlock;
@@@ -4766,7 -4790,7 +4791,7 @@@ static int btrfs_create(struct inode *d
        if (IS_ERR(inode))
                goto out_unlock;
  
 -      err = btrfs_init_inode_security(trans, inode, dir);
 +      err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
        if (err) {
                drop_inode = 1;
                goto out_unlock;
@@@ -4807,9 -4831,15 +4832,12 @@@ static int btrfs_link(struct dentry *ol
        int err;
        int drop_inode = 0;
  
 -      if (inode->i_nlink == 0)
 -              return -ENOENT;
 -
        /* do not allow sys_link's with other subvols of the same device */
        if (root->objectid != BTRFS_I(inode)->root->objectid)
-               return -EPERM;
+               return -EXDEV;
+       if (inode->i_nlink == ~0U)
+               return -EMLINK;
  
        btrfs_inc_nlink(inode);
        inode->i_ctime = CURRENT_TIME;
@@@ -4892,7 -4922,7 +4920,7 @@@ static int btrfs_mkdir(struct inode *di
  
        drop_on_err = 1;
  
 -      err = btrfs_init_inode_security(trans, inode, dir);
 +      err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
        if (err)
                goto out_fail;
  
@@@ -5265,6 -5295,9 +5293,9 @@@ insert
        }
        write_unlock(&em_tree->lock);
  out:
+       trace_btrfs_get_extent(root, em);
        if (path)
                btrfs_free_path(path);
        if (trans) {
@@@ -5748,6 -5781,10 +5779,10 @@@ static void btrfs_endio_direct_read(str
  
        kfree(dip->csums);
        kfree(dip);
+       /* If we had a csum failure make sure to clear the uptodate flag */
+       if (err)
+               clear_bit(BIO_UPTODATE, &bio->bi_flags);
        dio_end_io(bio, err);
  }
  
@@@ -5849,6 -5886,10 +5884,10 @@@ out_done
  
        kfree(dip->csums);
        kfree(dip);
+       /* If we had an error make sure to clear the uptodate flag */
+       if (err)
+               clear_bit(BIO_UPTODATE, &bio->bi_flags);
        dio_end_io(bio, err);
  }
  
@@@ -5922,9 -5963,12 +5961,12 @@@ static inline int __btrfs_submit_dio_bi
                                   __btrfs_submit_bio_start_direct_io,
                                   __btrfs_submit_bio_done);
                goto err;
-       } else if (!skip_sum)
-               btrfs_lookup_bio_sums_dio(root, inode, bio,
+       } else if (!skip_sum) {
+               ret = btrfs_lookup_bio_sums_dio(root, inode, bio,
                                          file_offset, csums);
+               if (ret)
+                       goto err;
+       }
  
        ret = btrfs_map_bio(root, rw, bio, 0, 1);
  err:
@@@ -5948,6 -5992,7 +5990,7 @@@ static int btrfs_submit_direct_hook(in
        int nr_pages = 0;
        u32 *csums = dip->csums;
        int ret = 0;
+       int write = rw & REQ_WRITE;
  
        bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
        if (!bio)
                                goto out_err;
                        }
  
-                       if (!skip_sum)
+                       /* Writes use the ordered csums */
+                       if (!write && !skip_sum)
                                csums = csums + nr_pages;
                        start_sector += submit_len >> 9;
                        file_offset += submit_len;
@@@ -6052,7 -6098,8 +6096,8 @@@ static void btrfs_submit_direct(int rw
        }
        dip->csums = NULL;
  
-       if (!skip_sum) {
+       /* Writes use the ordered csum stuff, so we don't need dip->csums */
+       if (!write && !skip_sum) {
                dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
                if (!dip->csums) {
                        kfree(dip);
        return ret;
  }
  
- static void btrfs_truncate(struct inode *inode)
+ static int btrfs_truncate(struct inode *inode)
  {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        int ret;
+       int err = 0;
        struct btrfs_trans_handle *trans;
        unsigned long nr;
        u64 mask = root->sectorsize - 1;
  
-       if (!S_ISREG(inode->i_mode)) {
-               WARN_ON(1);
-               return;
-       }
        ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
        if (ret)
-               return;
+               return ret;
  
        btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
        btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
  
+       trans = btrfs_start_transaction(root, 5);
+       if (IS_ERR(trans))
+               return PTR_ERR(trans);
+       btrfs_set_trans_block_group(trans, inode);
+       ret = btrfs_orphan_add(trans, inode);
+       if (ret) {
+               btrfs_end_transaction(trans, root);
+               return ret;
+       }
+       nr = trans->blocks_used;
+       btrfs_end_transaction(trans, root);
+       btrfs_btree_balance_dirty(root, nr);
+       /* Now start a transaction for the truncate */
        trans = btrfs_start_transaction(root, 0);
-       BUG_ON(IS_ERR(trans));
+       if (IS_ERR(trans))
+               return PTR_ERR(trans);
        btrfs_set_trans_block_group(trans, inode);
        trans->block_rsv = root->orphan_block_rsv;
  
        while (1) {
                if (!trans) {
                        trans = btrfs_start_transaction(root, 0);
-                       BUG_ON(IS_ERR(trans));
+                       if (IS_ERR(trans))
+                               return PTR_ERR(trans);
                        btrfs_set_trans_block_group(trans, inode);
                        trans->block_rsv = root->orphan_block_rsv;
                }
  
                ret = btrfs_block_rsv_check(trans, root,
                                            root->orphan_block_rsv, 0, 5);
-               if (ret) {
-                       BUG_ON(ret != -EAGAIN);
+               if (ret == -EAGAIN) {
                        ret = btrfs_commit_transaction(trans, root);
-                       BUG_ON(ret);
+                       if (ret)
+                               return ret;
                        trans = NULL;
                        continue;
+               } else if (ret) {
+                       err = ret;
+                       break;
                }
  
                ret = btrfs_truncate_inode_items(trans, root, inode,
                                                 inode->i_size,
                                                 BTRFS_EXTENT_DATA_KEY);
-               if (ret != -EAGAIN)
+               if (ret != -EAGAIN) {
+                       err = ret;
                        break;
+               }
  
                ret = btrfs_update_inode(trans, root, inode);
-               BUG_ON(ret);
+               if (ret) {
+                       err = ret;
+                       break;
+               }
  
                nr = trans->blocks_used;
                btrfs_end_transaction(trans, root);
  
        if (ret == 0 && inode->i_nlink > 0) {
                ret = btrfs_orphan_del(trans, inode);
-               BUG_ON(ret);
+               if (ret)
+                       err = ret;
+       } else if (ret && inode->i_nlink > 0) {
+               /*
+                * Failed to do the truncate, remove us from the in memory
+                * orphan list.
+                */
+               ret = btrfs_orphan_del(NULL, inode);
        }
  
        ret = btrfs_update_inode(trans, root, inode);
-       BUG_ON(ret);
+       if (ret && !err)
+               err = ret;
  
        nr = trans->blocks_used;
        ret = btrfs_end_transaction_throttle(trans, root);
-       BUG_ON(ret);
+       if (ret && !err)
+               err = ret;
        btrfs_btree_balance_dirty(root, nr);
+       return err;
  }
  
  /*
@@@ -6630,9 -6711,8 +6709,8 @@@ struct inode *btrfs_alloc_inode(struct 
        ei->index_cnt = (u64)-1;
        ei->last_unlink_trans = 0;
  
-       spin_lock_init(&ei->accounting_lock);
        atomic_set(&ei->outstanding_extents, 0);
-       ei->reserved_extents = 0;
+       atomic_set(&ei->reserved_extents, 0);
  
        ei->ordered_data_close = 0;
        ei->orphan_meta_reserved = 0;
@@@ -6668,7 -6748,7 +6746,7 @@@ void btrfs_destroy_inode(struct inode *
        WARN_ON(!list_empty(&inode->i_dentry));
        WARN_ON(inode->i_data.nrpages);
        WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents));
-       WARN_ON(BTRFS_I(inode)->reserved_extents);
+       WARN_ON(atomic_read(&BTRFS_I(inode)->reserved_extents));
  
        /*
         * This can happen where we create an inode, but somebody else also
@@@ -6760,6 -6840,8 +6838,8 @@@ void btrfs_destroy_cachep(void
                kmem_cache_destroy(btrfs_transaction_cachep);
        if (btrfs_path_cachep)
                kmem_cache_destroy(btrfs_path_cachep);
+       if (btrfs_free_space_cachep)
+               kmem_cache_destroy(btrfs_free_space_cachep);
  }
  
  int btrfs_init_cachep(void)
        if (!btrfs_path_cachep)
                goto fail;
  
+       btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space_cache",
+                       sizeof(struct btrfs_free_space), 0,
+                       SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+       if (!btrfs_free_space_cachep)
+               goto fail;
        return 0;
  fail:
        btrfs_destroy_cachep();
@@@ -6806,6 -6894,26 +6892,26 @@@ static int btrfs_getattr(struct vfsmoun
        return 0;
  }
  
+ /*
+  * If a file is moved, it will inherit the cow and compression flags of the new
+  * directory.
+  *
+  * @dir:   directory whose flags are read (the rename path passes new_dir)
+  * @inode: inode being moved; its btrfs_inode flags are rewritten in place
+  *
+  * BTRFS_INODE_NODATACOW and BTRFS_INODE_COMPRESS are handled independently:
+  * each one is set on @inode when it is set on @dir and cleared otherwise, so
+  * a flag the file carried before the move is dropped when the new directory
+  * does not have it.  Only b_inode->flags is modified; this function does not
+  * write anything back itself.
+  */
+ static void fixup_inode_flags(struct inode *dir, struct inode *inode)
+ {
+       struct btrfs_inode *b_dir = BTRFS_I(dir);
+       struct btrfs_inode *b_inode = BTRFS_I(inode);
+       if (b_dir->flags & BTRFS_INODE_NODATACOW)
+               b_inode->flags |= BTRFS_INODE_NODATACOW;
+       else
+               b_inode->flags &= ~BTRFS_INODE_NODATACOW;
+       if (b_dir->flags & BTRFS_INODE_COMPRESS)
+               b_inode->flags |= BTRFS_INODE_COMPRESS;
+       else
+               b_inode->flags &= ~BTRFS_INODE_COMPRESS;
+ }
  static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                           struct inode *new_dir, struct dentry *new_dentry)
  {
                                        old_dentry->d_name.name,
                                        old_dentry->d_name.len);
        } else {
-               btrfs_inc_nlink(old_dentry->d_inode);
-               ret = btrfs_unlink_inode(trans, root, old_dir,
-                                        old_dentry->d_inode,
-                                        old_dentry->d_name.name,
-                                        old_dentry->d_name.len);
+               ret = __btrfs_unlink_inode(trans, root, old_dir,
+                                       old_dentry->d_inode,
+                                       old_dentry->d_name.name,
+                                       old_dentry->d_name.len);
+               if (!ret)
+                       ret = btrfs_update_inode(trans, root, old_inode);
        }
        BUG_ON(ret);
  
                }
        }
  
+       fixup_inode_flags(new_dir, old_inode);
        ret = btrfs_add_link(trans, new_dir, old_inode,
                             new_dentry->d_name.name,
                             new_dentry->d_name.len, 0, index);
@@@ -7104,7 -7215,7 +7213,7 @@@ static int btrfs_symlink(struct inode *
        if (IS_ERR(inode))
                goto out_unlock;
  
 -      err = btrfs_init_inode_security(trans, inode, dir);
 +      err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
        if (err) {
                drop_inode = 1;
                goto out_unlock;
@@@ -7340,6 -7451,7 +7449,6 @@@ static const struct address_space_opera
        .writepage      = btrfs_writepage,
        .writepages     = btrfs_writepages,
        .readpages      = btrfs_readpages,
 -      .sync_page      = block_sync_page,
        .direct_IO      = btrfs_direct_IO,
        .invalidatepage = btrfs_invalidatepage,
        .releasepage    = btrfs_releasepage,
@@@ -7355,7 -7467,6 +7464,6 @@@ static const struct address_space_opera
  };
  
  static const struct inode_operations btrfs_file_inode_operations = {
-       .truncate       = btrfs_truncate,
        .getattr        = btrfs_getattr,
        .setattr        = btrfs_setattr,
        .setxattr       = btrfs_setxattr,
diff --combined fs/btrfs/ioctl.c
@@@ -40,6 -40,7 +40,7 @@@
  #include <linux/xattr.h>
  #include <linux/vmalloc.h>
  #include <linux/slab.h>
+ #include <linux/blkdev.h>
  #include "compat.h"
  #include "ctree.h"
  #include "disk-io.h"
@@@ -138,6 -139,24 +139,24 @@@ static int btrfs_ioctl_getflags(struct 
        return 0;
  }
  
+ static int check_flags(unsigned int flags)
+ { /* vet a user-supplied FS_*_FL mask; returns 0 or a negative errno */
+       if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
+                     FS_NOATIME_FL | FS_NODUMP_FL | \
+                     FS_SYNC_FL | FS_DIRSYNC_FL | \
+                     FS_NOCOMP_FL | FS_COMPR_FL | \
+                     FS_NOCOW_FL | FS_COW_FL))
+               return -EOPNOTSUPP; /* a bit outside the accepted set is on */
+       if ((flags & FS_NOCOMP_FL) && (flags & FS_COMPR_FL))
+               return -EINVAL; /* NOCOMP and COMPR are mutually exclusive */
+       if ((flags & FS_NOCOW_FL) && (flags & FS_COW_FL))
+               return -EINVAL; /* NOCOW and COW are mutually exclusive */
+       return 0;
+ }
  static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
  {
        struct inode *inode = file->f_path.dentry->d_inode;
        if (copy_from_user(&flags, arg, sizeof(flags)))
                return -EFAULT;
  
-       if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
-                     FS_NOATIME_FL | FS_NODUMP_FL | \
-                     FS_SYNC_FL | FS_DIRSYNC_FL))
-               return -EOPNOTSUPP;
+       ret = check_flags(flags);
+       if (ret)
+               return ret;
  
 -      if (!is_owner_or_cap(inode))
 +      if (!inode_owner_or_capable(inode))
                return -EACCES;
  
        mutex_lock(&inode->i_mutex);
        else
                ip->flags &= ~BTRFS_INODE_DIRSYNC;
  
+       /*
+        * The COMPRESS flag can only be changed by users, while the NOCOMPRESS
+        * flag may be changed automatically if compression code won't make
+        * things smaller.
+        */
+       if (flags & FS_NOCOMP_FL) {
+               ip->flags &= ~BTRFS_INODE_COMPRESS;
+               ip->flags |= BTRFS_INODE_NOCOMPRESS;
+       } else if (flags & FS_COMPR_FL) {
+               ip->flags |= BTRFS_INODE_COMPRESS;
+               ip->flags &= ~BTRFS_INODE_NOCOMPRESS;
+       }
+       if (flags & FS_NOCOW_FL)
+               ip->flags |= BTRFS_INODE_NODATACOW;
+       else if (flags & FS_COW_FL)
+               ip->flags &= ~BTRFS_INODE_NODATACOW;
  
        trans = btrfs_join_transaction(root, 1);
        BUG_ON(IS_ERR(trans));
        btrfs_end_transaction(trans, root);
  
        mnt_drop_write(file->f_path.mnt);
+       ret = 0;
   out_unlock:
        mutex_unlock(&inode->i_mutex);
-       return 0;
+       return ret;
  }
  
  static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
        return put_user(inode->i_generation, arg);
  }
  
+ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
+ { /* FITRIM handler: hands a user-supplied fstrim_range to btrfs_trim_fs() */
+       struct btrfs_root *root = fdentry(file)->d_sb->s_fs_info;
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct btrfs_device *device;
+       struct request_queue *q;
+       struct fstrim_range range;
+       u64 minlen = ULLONG_MAX; /* smallest discard granularity seen so far */
+       u64 num_devices = 0; /* devices whose queue reports discard support */
+       int ret;
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+       mutex_lock(&fs_info->fs_devices->device_list_mutex);
+       list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
+               if (!device->bdev)
+                       continue; /* no block device attached to this entry */
+               q = bdev_get_queue(device->bdev);
+               if (blk_queue_discard(q)) {
+                       num_devices++;
+                       minlen = min((u64)q->limits.discard_granularity,
+                                    minlen);
+               }
+       }
+       mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+       if (!num_devices)
+               return -EOPNOTSUPP; /* nothing in the list can discard */
+       if (copy_from_user(&range, arg, sizeof(range)))
+               return -EFAULT;
+       range.minlen = max(range.minlen, minlen); /* raise to at least the smallest granularity found */
+       ret = btrfs_trim_fs(root, &range);
+       if (ret < 0)
+               return ret;
+       if (copy_to_user(arg, &range, sizeof(range))) /* return range to the caller; presumably updated by btrfs_trim_fs() - confirm */
+               return -EFAULT;
+       return 0;
+ }
  static noinline int create_subvol(struct btrfs_root *root,
                                  struct dentry *dentry,
                                  char *name, int namelen,
@@@ -409,7 -488,9 +488,9 @@@ static int create_snapshot(struct btrfs
        if (ret)
                goto fail;
  
-       btrfs_orphan_cleanup(pending_snapshot->snap);
+       ret = btrfs_orphan_cleanup(pending_snapshot->snap);
+       if (ret)
+               goto fail;
  
        parent = dget_parent(dentry);
        inode = btrfs_lookup_dentry(parent->d_inode, dentry);
@@@ -1077,7 -1158,7 +1158,7 @@@ static noinline int btrfs_ioctl_subvol_
        if (flags & ~BTRFS_SUBVOL_RDONLY)
                return -EOPNOTSUPP;
  
 -      if (!is_owner_or_cap(inode))
 +      if (!inode_owner_or_capable(inode))
                return -EACCES;
  
        down_write(&root->fs_info->subvol_sem);
@@@ -2348,12 -2429,15 +2429,15 @@@ static noinline long btrfs_ioctl_start_
        struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
        struct btrfs_trans_handle *trans;
        u64 transid;
+       int ret;
  
        trans = btrfs_start_transaction(root, 0);
        if (IS_ERR(trans))
                return PTR_ERR(trans);
        transid = trans->transid;
-       btrfs_commit_transaction_async(trans, root, 0);
+       ret = btrfs_commit_transaction_async(trans, root, 0);
+       if (ret)
+               return ret;
  
        if (argp)
                if (copy_to_user(argp, &transid, sizeof(transid)))
@@@ -2388,6 -2472,8 +2472,8 @@@ long btrfs_ioctl(struct file *file, uns
                return btrfs_ioctl_setflags(file, argp);
        case FS_IOC_GETVERSION:
                return btrfs_ioctl_getversion(file, argp);
+       case FITRIM:
+               return btrfs_ioctl_fitrim(file, argp);
        case BTRFS_IOC_SNAP_CREATE:
                return btrfs_ioctl_snap_create(file, argp, 0);
        case BTRFS_IOC_SNAP_CREATE_V2:
diff --combined fs/btrfs/volumes.c
  #include "volumes.h"
  #include "async-thread.h"
  
- struct map_lookup {
-       u64 type;
-       int io_align;
-       int io_width;
-       int stripe_len;
-       int sector_size;
-       int num_stripes;
-       int sub_stripes;
-       struct btrfs_bio_stripe stripes[];
- };
  static int init_first_rw_device(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                struct btrfs_device *device);
@@@ -162,6 -151,7 +151,6 @@@ static noinline int run_scheduled_bios(
        struct bio *cur;
        int again = 0;
        unsigned long num_run;
 -      unsigned long num_sync_run;
        unsigned long batch_run = 0;
        unsigned long limit;
        unsigned long last_waited = 0;
        limit = btrfs_async_submit_limit(fs_info);
        limit = limit * 2 / 3;
  
 -      /* we want to make sure that every time we switch from the sync
 -       * list to the normal list, we unplug
 -       */
 -      num_sync_run = 0;
 -
  loop:
        spin_lock(&device->io_lock);
  
@@@ -217,6 -212,15 +206,6 @@@ loop_lock
  
        spin_unlock(&device->io_lock);
  
 -      /*
 -       * if we're doing the regular priority list, make sure we unplug
 -       * for any high prio bios we've sent down
 -       */
 -      if (pending_bios == &device->pending_bios && num_sync_run > 0) {
 -              num_sync_run = 0;
 -              blk_run_backing_dev(bdi, NULL);
 -      }
 -
        while (pending) {
  
                rmb();
  
                BUG_ON(atomic_read(&cur->bi_cnt) == 0);
  
 -              if (cur->bi_rw & REQ_SYNC)
 -                      num_sync_run++;
 -
                submit_bio(cur->bi_rw, cur);
                num_run++;
                batch_run++;
 -              if (need_resched()) {
 -                      if (num_sync_run) {
 -                              blk_run_backing_dev(bdi, NULL);
 -                              num_sync_run = 0;
 -                      }
 +              if (need_resched())
                        cond_resched();
 -              }
  
                /*
                 * we made progress, there is more work to do and the bdi
                                 * against it before looping
                                 */
                                last_waited = ioc->last_waited;
 -                              if (need_resched()) {
 -                                      if (num_sync_run) {
 -                                              blk_run_backing_dev(bdi, NULL);
 -                                              num_sync_run = 0;
 -                                      }
 +                              if (need_resched())
                                        cond_resched();
 -                              }
                                continue;
                        }
                        spin_lock(&device->io_lock);
                }
        }
  
 -      if (num_sync_run) {
 -              num_sync_run = 0;
 -              blk_run_backing_dev(bdi, NULL);
 -      }
 -      /*
 -       * IO has already been through a long path to get here.  Checksumming,
 -       * async helper threads, perhaps compression.  We've done a pretty
 -       * good job of collecting a batch of IO and should just unplug
 -       * the device right away.
 -       *
 -       * This will help anyone who is waiting on the IO, they might have
 -       * already unplugged, but managed to do so before the bio they
 -       * cared about found its way down here.
 -       */
 -      blk_run_backing_dev(bdi, NULL);
 -
        cond_resched();
        if (again)
                goto loop;
@@@ -1879,6 -1912,8 +1868,8 @@@ static int btrfs_relocate_chunk(struct 
  
        BUG_ON(ret);
  
+       trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
        if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
                ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
                BUG_ON(ret);
@@@ -2606,6 -2641,8 +2597,8 @@@ static int __btrfs_alloc_chunk(struct b
        *num_bytes = chunk_bytes_by_type(type, calc_size,
                                         map->num_stripes, sub_stripes);
  
+       trace_btrfs_chunk_alloc(info->chunk_root, map, start, *num_bytes);
        em = alloc_extent_map(GFP_NOFS);
        if (!em) {
                ret = -ENOMEM;
@@@ -2714,6 -2751,7 +2707,7 @@@ static int __finish_chunk_alloc(struct 
                                             item_size);
                BUG_ON(ret);
        }
        kfree(chunk);
        return 0;
  }
@@@ -2911,14 -2949,17 +2905,17 @@@ static int find_live_mirror(struct map_
  static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
                             u64 logical, u64 *length,
                             struct btrfs_multi_bio **multi_ret,
 -                           int mirror_num, struct page *unplug_page)
 +                           int mirror_num)
  {
        struct extent_map *em;
        struct map_lookup *map;
        struct extent_map_tree *em_tree = &map_tree->map_tree;
        u64 offset;
        u64 stripe_offset;
+       u64 stripe_end_offset;
        u64 stripe_nr;
+       u64 stripe_nr_orig;
+       u64 stripe_nr_end;
        int stripes_allocated = 8;
        int stripes_required = 1;
        int stripe_index;
        int max_errors = 0;
        struct btrfs_multi_bio *multi = NULL;
  
-       if (multi_ret && !(rw & REQ_WRITE))
+       if (multi_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
                stripes_allocated = 1;
  again:
        if (multi_ret) {
        em = lookup_extent_mapping(em_tree, logical, *length);
        read_unlock(&em_tree->lock);
  
 -      if (!em && unplug_page) {
 -              kfree(multi);
 -              return 0;
 -      }
 -
        if (!em) {
                printk(KERN_CRIT "unable to find logical %llu len %llu\n",
                       (unsigned long long)logical,
                        max_errors = 1;
                }
        }
-       if (multi_ret && (rw & REQ_WRITE) &&
+       if (rw & REQ_DISCARD) {
+               if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+                                BTRFS_BLOCK_GROUP_RAID1 |
+                                BTRFS_BLOCK_GROUP_DUP |
+                                BTRFS_BLOCK_GROUP_RAID10)) {
+                       stripes_required = map->num_stripes;
+               }
+       }
+       if (multi_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
            stripes_allocated < stripes_required) {
                stripes_allocated = map->num_stripes;
                free_extent_map(em);
        /* stripe_offset is the offset of this block in its stripe*/
        stripe_offset = offset - stripe_offset;
  
-       if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
-                        BTRFS_BLOCK_GROUP_RAID10 |
-                        BTRFS_BLOCK_GROUP_DUP)) {
+       if (rw & REQ_DISCARD)
+               *length = min_t(u64, em->len - offset, *length);
+       else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+                             BTRFS_BLOCK_GROUP_RAID1 |
+                             BTRFS_BLOCK_GROUP_RAID10 |
+                             BTRFS_BLOCK_GROUP_DUP)) {
                /* we limit the length of each bio to what fits in a stripe */
                *length = min_t(u64, em->len - offset,
-                             map->stripe_len - stripe_offset);
+                               map->stripe_len - stripe_offset);
        } else {
                *length = em->len - offset;
        }
  
 -      if (!multi_ret && !unplug_page)
 +      if (!multi_ret)
                goto out;
  
        num_stripes = 1;
        stripe_index = 0;
-       if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
-               if (rw & REQ_WRITE)
+       stripe_nr_orig = stripe_nr;
+       stripe_nr_end = (offset + *length + map->stripe_len - 1) &
+                       (~(map->stripe_len - 1));
+       do_div(stripe_nr_end, map->stripe_len);
+       stripe_end_offset = stripe_nr_end * map->stripe_len -
+                           (offset + *length);
+       if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
+               if (rw & REQ_DISCARD)
+                       num_stripes = min_t(u64, map->num_stripes,
+                                           stripe_nr_end - stripe_nr_orig);
+               stripe_index = do_div(stripe_nr, map->num_stripes);
+       } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
 -              if (unplug_page || (rw & (REQ_WRITE | REQ_DISCARD)))
++              if (rw & (REQ_WRITE | REQ_DISCARD))
                        num_stripes = map->num_stripes;
                else if (mirror_num)
                        stripe_index = mirror_num - 1;
                }
  
        } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
-               if (rw & REQ_WRITE)
+               if (rw & (REQ_WRITE | REQ_DISCARD))
                        num_stripes = map->num_stripes;
                else if (mirror_num)
                        stripe_index = mirror_num - 1;
                stripe_index = do_div(stripe_nr, factor);
                stripe_index *= map->sub_stripes;
  
 -              if (unplug_page || (rw & REQ_WRITE))
 +              if (rw & REQ_WRITE)
                        num_stripes = map->sub_stripes;
+               else if (rw & REQ_DISCARD)
+                       num_stripes = min_t(u64, map->sub_stripes *
+                                           (stripe_nr_end - stripe_nr_orig),
+                                           map->num_stripes);
                else if (mirror_num)
                        stripe_index += mirror_num - 1;
                else {
        }
        BUG_ON(stripe_index >= map->num_stripes);
  
-       for (i = 0; i < num_stripes; i++) {
-               multi->stripes[i].physical =
-                       map->stripes[stripe_index].physical +
-                       stripe_offset + stripe_nr * map->stripe_len;
-               multi->stripes[i].dev = map->stripes[stripe_index].dev;
-               stripe_index++;
+       if (rw & REQ_DISCARD) {
+               for (i = 0; i < num_stripes; i++) {
+                       multi->stripes[i].physical =
+                               map->stripes[stripe_index].physical +
+                               stripe_offset + stripe_nr * map->stripe_len;
+                       multi->stripes[i].dev = map->stripes[stripe_index].dev;
+                       if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
+                               u64 stripes;
+                               u32 last_stripe = 0;
+                               int j;
+                               div_u64_rem(stripe_nr_end - 1,
+                                           map->num_stripes,
+                                           &last_stripe);
+                               for (j = 0; j < map->num_stripes; j++) {
+                                       u32 test;
+                                       div_u64_rem(stripe_nr_end - 1 - j,
+                                                   map->num_stripes, &test);
+                                       if (test == stripe_index)
+                                               break;
+                               }
+                               stripes = stripe_nr_end - 1 - j;
+                               do_div(stripes, map->num_stripes);
+                               multi->stripes[i].length = map->stripe_len *
+                                       (stripes - stripe_nr + 1);
+                               if (i == 0) {
+                                       multi->stripes[i].length -=
+                                               stripe_offset;
+                                       stripe_offset = 0;
+                               }
+                               if (stripe_index == last_stripe)
+                                       multi->stripes[i].length -=
+                                               stripe_end_offset;
+                       } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
+                               u64 stripes;
+                               int j;
+                               int factor = map->num_stripes /
+                                            map->sub_stripes;
+                               u32 last_stripe = 0;
+                               div_u64_rem(stripe_nr_end - 1,
+                                           factor, &last_stripe);
+                               last_stripe *= map->sub_stripes;
+                               for (j = 0; j < factor; j++) {
+                                       u32 test;
+                                       div_u64_rem(stripe_nr_end - 1 - j,
+                                                   factor, &test);
+                                       if (test ==
+                                           stripe_index / map->sub_stripes)
+                                               break;
+                               }
+                               stripes = stripe_nr_end - 1 - j;
+                               do_div(stripes, factor);
+                               multi->stripes[i].length = map->stripe_len *
+                                       (stripes - stripe_nr + 1);
+                               if (i < map->sub_stripes) {
+                                       multi->stripes[i].length -=
+                                               stripe_offset;
+                                       if (i == map->sub_stripes - 1)
+                                               stripe_offset = 0;
+                               }
+                               if (stripe_index >= last_stripe &&
+                                   stripe_index <= (last_stripe +
+                                                    map->sub_stripes - 1)) {
+                                       multi->stripes[i].length -=
+                                               stripe_end_offset;
+                               }
+                       } else
+                               multi->stripes[i].length = *length;
+                       stripe_index++;
+                       if (stripe_index == map->num_stripes) {
+                               /* This could only happen for RAID0/10 */
+                               stripe_index = 0;
+                               stripe_nr++;
+                       }
+               }
+       } else {
+               for (i = 0; i < num_stripes; i++) {
 -                      if (unplug_page) {
 -                              struct btrfs_device *device;
 -                              struct backing_dev_info *bdi;
 -
 -                              device = map->stripes[stripe_index].dev;
 -                              if (device->bdev) {
 -                                      bdi = blk_get_backing_dev_info(device->
 -                                                                     bdev);
 -                                      if (bdi->unplug_io_fn)
 -                                              bdi->unplug_io_fn(bdi,
 -                                                                unplug_page);
 -                              }
 -                      } else {
 -                              multi->stripes[i].physical =
 -                                      map->stripes[stripe_index].physical +
 -                                      stripe_offset +
 -                                      stripe_nr * map->stripe_len;
 -                              multi->stripes[i].dev =
 -                                      map->stripes[stripe_index].dev;
 -                      }
++                      multi->stripes[i].physical =
++                              map->stripes[stripe_index].physical +
++                              stripe_offset +
++                              stripe_nr * map->stripe_len;
++                      multi->stripes[i].dev =
++                              map->stripes[stripe_index].dev;
+                       stripe_index++;
+               }
        }
        if (multi_ret) {
                *multi_ret = multi;
@@@ -3067,7 -3242,7 +3179,7 @@@ int btrfs_map_block(struct btrfs_mappin
                      struct btrfs_multi_bio **multi_ret, int mirror_num)
  {
        return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
 -                               mirror_num, NULL);
 +                               mirror_num);
  }
  
  int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
  
        free_extent_map(em);
        return 0;
 -}
 -
 -int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
 -                    u64 logical, struct page *page)
 -{
 -      u64 length = PAGE_CACHE_SIZE;
 -      return __btrfs_map_block(map_tree, READ, logical, &length,
 -                               NULL, 0, page);
  }
  
  static void end_bio_multi_stripe(struct bio *bio, int err)
diff --combined fs/btrfs/xattr.c
@@@ -242,6 -242,8 +242,8 @@@ ssize_t btrfs_listxattr(struct dentry *
                        break;
  
                di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
+               if (verify_dir_item(root, leaf, di))
+                       continue;
  
                name_len = btrfs_dir_name_len(leaf, di);
                total_size += name_len + 1;
@@@ -370,8 -372,7 +372,8 @@@ int btrfs_removexattr(struct dentry *de
  }
  
  int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
 -                            struct inode *inode, struct inode *dir)
 +                            struct inode *inode, struct inode *dir,
 +                            const struct qstr *qstr)
  {
        int err;
        size_t len;
        char *suffix;
        char *name;
  
 -      err = security_inode_init_security(inode, dir, &suffix, &value, &len);
 +      err = security_inode_init_security(inode, dir, qstr, &suffix, &value,
 +                                         &len);
        if (err) {
                if (err == -EOPNOTSUPP)
                        return 0;
diff --combined include/linux/fs.h
@@@ -102,9 -102,6 +102,9 @@@ struct inodes_stat_t 
  /* File is huge (eg. /dev/kmem): treat loff_t as unsigned */
  #define FMODE_UNSIGNED_OFFSET ((__force fmode_t)0x2000)
  
 +/* File is opened with O_PATH; almost nothing can be done with it */
 +#define FMODE_PATH            ((__force fmode_t)0x4000)
 +
  /* File was opened by fanotify and shouldn't generate fanotify events */
  #define FMODE_NONOTIFY                ((__force fmode_t)0x1000000)
  
   *                    block layer could (in theory) choose to ignore this
   *                    request if it runs into resource problems.
   * WRITE              A normal async write. Device will be plugged.
 - * WRITE_SYNC_PLUG    Synchronous write. Identical to WRITE, but passes down
 + * WRITE_SYNC         Synchronous write. Identical to WRITE, but passes down
   *                    the hint that someone will be waiting on this IO
 - *                    shortly. The device must still be unplugged explicitly,
 - *                    WRITE_SYNC_PLUG does not do this as we could be
 - *                    submitting more writes before we actually wait on any
 - *                    of them.
 - * WRITE_SYNC         Like WRITE_SYNC_PLUG, but also unplugs the device
 - *                    immediately after submission. The write equivalent
 - *                    of READ_SYNC.
 - * WRITE_ODIRECT_PLUG Special case write for O_DIRECT only.
 + *                    shortly. The write equivalent of READ_SYNC.
 + * WRITE_ODIRECT      Special case write for O_DIRECT only.
   * WRITE_FLUSH                Like WRITE_SYNC but with preceding cache flush.
   * WRITE_FUA          Like WRITE_SYNC but data is guaranteed to be on
   *                    non-volatile media on completion.
  #define WRITE                 RW_MASK
  #define READA                 RWA_MASK
  
 -#define READ_SYNC             (READ | REQ_SYNC | REQ_UNPLUG)
 +#define READ_SYNC             (READ | REQ_SYNC)
  #define READ_META             (READ | REQ_META)
 -#define WRITE_SYNC_PLUG               (WRITE | REQ_SYNC | REQ_NOIDLE)
 -#define WRITE_SYNC            (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG)
 -#define WRITE_ODIRECT_PLUG    (WRITE | REQ_SYNC)
 +#define WRITE_SYNC            (WRITE | REQ_SYNC | REQ_NOIDLE)
 +#define WRITE_ODIRECT         (WRITE | REQ_SYNC)
  #define WRITE_META            (WRITE | REQ_META)
 -#define WRITE_FLUSH           (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
 -                               REQ_FLUSH)
 -#define WRITE_FUA             (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
 -                               REQ_FUA)
 -#define WRITE_FLUSH_FUA               (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
 -                               REQ_FLUSH | REQ_FUA)
 +#define WRITE_FLUSH           (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FLUSH)
 +#define WRITE_FUA             (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FUA)
 +#define WRITE_FLUSH_FUA               (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FLUSH | REQ_FUA)
  
  #define SEL_IN                1
  #define SEL_OUT               2
  #define FS_TOPDIR_FL                  0x00020000 /* Top of directory hierarchies*/
  #define FS_EXTENT_FL                  0x00080000 /* Extents */
  #define FS_DIRECTIO_FL                        0x00100000 /* Use direct i/o */
+ #define FS_NOCOW_FL                   0x00800000 /* Do not cow file */
+ #define FS_COW_FL                     0x02000000 /* Cow file */
  #define FS_RESERVED_FL                        0x80000000 /* reserved for ext2 lib */
  
  #define FS_FL_USER_VISIBLE            0x0003DFFF /* User visible flags */
@@@ -576,6 -585,7 +578,6 @@@ typedef int (*read_actor_t)(read_descri
  struct address_space_operations {
        int (*writepage)(struct page *page, struct writeback_control *wbc);
        int (*readpage)(struct file *, struct page *);
 -      void (*sync_page)(struct page *);
  
        /* Write back some dirty pages from this mapping. */
        int (*writepages)(struct address_space *, struct writeback_control *);
@@@ -651,9 -661,9 +653,9 @@@ struct address_space 
  
  struct block_device {
        dev_t                   bd_dev;  /* not a kdev_t - it's a search key */
 +      int                     bd_openers;
        struct inode *          bd_inode;       /* will die */
        struct super_block *    bd_super;
 -      int                     bd_openers;
        struct mutex            bd_mutex;       /* open/close mutex */
        struct list_head        bd_inodes;
        void *                  bd_claiming;
@@@ -790,7 -800,8 +792,7 @@@ struct inode 
  #endif
  
  #ifdef CONFIG_IMA
 -      /* protected by i_lock */
 -      unsigned int            i_readcount; /* struct files open RO */
 +      atomic_t                i_readcount; /* struct files open RO */
  #endif
        atomic_t                i_writecount;
  #ifdef CONFIG_SECURITY
@@@ -969,13 -980,6 +971,13 @@@ struct file 
  #endif
  };
  
 +struct file_handle {
 +      __u32 handle_bytes;
 +      int handle_type;
 +      /* file identifier */
 +      unsigned char f_handle[0];
 +};
 +
  #define get_file(x)   atomic_long_inc(&(x)->f_count)
  #define fput_atomic(x)        atomic_long_add_unless(&(x)->f_count, -1, 1)
  #define file_count(x) atomic_long_read(&(x)->f_count)
@@@ -1399,7 -1403,6 +1401,7 @@@ struct super_block 
        wait_queue_head_t       s_wait_unfrozen;
  
        char s_id[32];                          /* Informational name */
 +      u8 s_uuid[16];                          /* UUID */
  
        void                    *s_fs_info;     /* Filesystem private info */
        fmode_t                 s_mode;
@@@ -1446,13 -1449,8 +1448,13 @@@ enum 
  #define put_fs_excl() atomic_dec(&current->fs_excl)
  #define has_fs_excl() atomic_read(&current->fs_excl)
  
 -#define is_owner_or_cap(inode)        \
 -      ((current_fsuid() == (inode)->i_uid) || capable(CAP_FOWNER))
 +/*
 + * until VFS tracks user namespaces for inodes, just make all files
 + * belong to init_user_ns
 + */
 +extern struct user_namespace init_user_ns;
 +#define inode_userns(inode) (&init_user_ns)
 +extern bool inode_owner_or_capable(const struct inode *inode);
  
  /* not quite ready to be deprecated, but... */
  extern void lock_super(struct super_block *);
@@@ -1625,8 -1623,6 +1627,8 @@@ struct super_operations 
        void (*umount_begin) (struct super_block *);
  
        int (*show_options)(struct seq_file *, struct vfsmount *);
 +      int (*show_devname)(struct seq_file *, struct vfsmount *);
 +      int (*show_path)(struct seq_file *, struct vfsmount *);
        int (*show_stats)(struct seq_file *, struct vfsmount *);
  #ifdef CONFIG_QUOTA
        ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
  };
  
  /*
 - * Inode state bits.  Protected by inode_lock.
 + * Inode state bits.  Protected by inode->i_lock
   *
   * Three bits determine the dirty state of the inode, I_DIRTY_SYNC,
   * I_DIRTY_DATASYNC and I_DIRTY_PAGES.
@@@ -1790,6 -1786,8 +1792,6 @@@ int sync_inode_metadata(struct inode *i
  struct file_system_type {
        const char *name;
        int fs_flags;
 -      int (*get_sb) (struct file_system_type *, int,
 -                     const char *, void *, struct vfsmount *);
        struct dentry *(*mount) (struct file_system_type *, int,
                       const char *, void *);
        void (*kill_sb) (struct super_block *);
@@@ -1812,12 -1810,24 +1814,12 @@@ extern struct dentry *mount_ns(struct f
  extern struct dentry *mount_bdev(struct file_system_type *fs_type,
        int flags, const char *dev_name, void *data,
        int (*fill_super)(struct super_block *, void *, int));
 -extern int get_sb_bdev(struct file_system_type *fs_type,
 -      int flags, const char *dev_name, void *data,
 -      int (*fill_super)(struct super_block *, void *, int),
 -      struct vfsmount *mnt);
  extern struct dentry *mount_single(struct file_system_type *fs_type,
        int flags, void *data,
        int (*fill_super)(struct super_block *, void *, int));
 -extern int get_sb_single(struct file_system_type *fs_type,
 -      int flags, void *data,
 -      int (*fill_super)(struct super_block *, void *, int),
 -      struct vfsmount *mnt);
  extern struct dentry *mount_nodev(struct file_system_type *fs_type,
        int flags, void *data,
        int (*fill_super)(struct super_block *, void *, int));
 -extern int get_sb_nodev(struct file_system_type *fs_type,
 -      int flags, void *data,
 -      int (*fill_super)(struct super_block *, void *, int),
 -      struct vfsmount *mnt);
  void generic_shutdown_super(struct super_block *sb);
  void kill_block_super(struct super_block *sb);
  void kill_anon_super(struct super_block *sb);
@@@ -1833,6 -1843,7 +1835,6 @@@ extern struct dentry *mount_pseudo(stru
        const struct super_operations *ops,
        const struct dentry_operations *dops,
        unsigned long);
 -extern void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb);
  
  static inline void sb_mark_dirty(struct super_block *sb)
  {
@@@ -1865,8 -1876,6 +1867,8 @@@ extern void drop_collected_mounts(struc
  extern int iterate_mounts(int (*)(struct vfsmount *, void *), void *,
                          struct vfsmount *);
  extern int vfs_statfs(struct path *, struct kstatfs *);
 +extern int user_statfs(const char __user *, struct kstatfs *);
 +extern int fd_statfs(int, struct kstatfs *);
  extern int statfs_by_dentry(struct dentry *, struct kstatfs *);
  extern int freeze_super(struct super_block *super);
  extern int thaw_super(struct super_block *super);
@@@ -1983,8 -1992,6 +1985,8 @@@ extern int do_fallocate(struct file *fi
  extern long do_sys_open(int dfd, const char __user *filename, int flags,
                        int mode);
  extern struct file *filp_open(const char *, int, int);
 +extern struct file *file_open_root(struct dentry *, struct vfsmount *,
 +                                 const char *, int);
  extern struct file * dentry_open(struct dentry *, struct vfsmount *, int,
                                 const struct cred *);
  extern int filp_close(struct file *, fl_owner_t id);
@@@ -2195,31 -2202,15 +2197,31 @@@ static inline void allow_write_access(s
        if (file)
                atomic_inc(&file->f_path.dentry->d_inode->i_writecount);
  }
 +#ifdef CONFIG_IMA
 +static inline void i_readcount_dec(struct inode *inode)
 +{
 +      BUG_ON(!atomic_read(&inode->i_readcount));
 +      atomic_dec(&inode->i_readcount);
 +}
 +static inline void i_readcount_inc(struct inode *inode)
 +{
 +      atomic_inc(&inode->i_readcount);
 +}
 +#else
 +static inline void i_readcount_dec(struct inode *inode)
 +{
 +      return;
 +}
 +static inline void i_readcount_inc(struct inode *inode)
 +{
 +      return;
 +}
 +#endif
  extern int do_pipe_flags(int *, int);
  extern struct file *create_read_pipe(struct file *f, int flags);
  extern struct file *create_write_pipe(int flags);
  extern void free_write_pipe(struct file *);
  
 -extern struct file *do_filp_open(int dfd, const char *pathname,
 -              int open_flag, int mode, int acc_mode);
 -extern int may_open(struct path *, int, int);
 -
  extern int kernel_read(struct file *, loff_t, char *, unsigned long);
  extern struct file * open_exec(const char *);