Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux...
author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 18 Dec 2012 17:42:05 +0000 (09:42 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 18 Dec 2012 17:42:05 +0000 (09:42 -0800)
Pull btrfs update from Chris Mason:
 "A big set of fixes and features.

  In terms of line count, most of the code comes from Stefan, who added
  the ability to replace a single drive in place.  This is different
  from how btrfs normally replaces drives, and is much, much faster (a
  minimal ioctl sketch follows this message).

  Josef is plowing through our synchronous write performance.  This pull
  request does not include the DIO_OWN_WAITING patch that was discussed
  on the list, but it has a number of other improvements to cut down our
  latencies and CPU time during fsync/O_DIRECT writes.

  Miao Xie has a big series of fixes and is spreading out ordered
  operations over more CPUs.  This improves performance and reduces
  contention.

  I've put in fixes for error handling around hash collisions.  These
  are going back to individual stable kernels as I test against them.

  Otherwise we have a lot of fixes and cleanups, thanks everyone!
  raid5/6 is being rebased against the device replacement code.  I'll
  have it posted this Friday along with a nice series of benchmarks."
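
A minimal userspace sketch of starting such an in-place replace through the
new ioctl added by this series. The struct and command names are those of
the kernel's dev-replace interface (exported in <linux/btrfs.h> on current
kernels); the mount point and device paths are illustrative, and in practice
btrfs-progs' "btrfs replace start" wraps this call:

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/btrfs.h>

    int main(void)
    {
            struct btrfs_ioctl_dev_replace_args args;
            int fd = open("/mnt/btrfs", O_RDONLY);  /* any fd on the fs */

            if (fd < 0)
                    return 1;
            memset(&args, 0, sizeof(args));
            args.cmd = BTRFS_IOCTL_DEV_REPLACE_CMD_START;
            strcpy((char *)args.start.srcdev_name, "/dev/sdb");
            strcpy((char *)args.start.tgtdev_name, "/dev/sdc");
            /* copies all data from sdb onto sdc, in place, while mounted */
            if (ioctl(fd, BTRFS_IOC_DEV_REPLACE, &args) < 0)
                    perror("BTRFS_IOC_DEV_REPLACE");
            else
                    printf("replace started, result=%llu\n",
                           (unsigned long long)args.result);
            return 0;
    }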

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (115 commits)
  Btrfs: fix a bug of per-file nocow
  Btrfs: fix hash overflow handling
  Btrfs: don't take inode delalloc mutex if we're a free space inode
  Btrfs: fix autodefrag and umount lockup
  Btrfs: fix permissions of empty files not affected by umask
  Btrfs: put raid properties into global table
  Btrfs: fix BUG() in scrub when first superblock reading gives EIO
  Btrfs: do not call file_update_time in aio_write
  Btrfs: only unlock and relock if we have to
  Btrfs: use tokens where we can in the tree log
  Btrfs: optimize leaf_space_used
  Btrfs: don't memset new tokens
  Btrfs: only clear dirty on the buffer if it is marked as dirty
  Btrfs: move checks in set_page_dirty under DEBUG
  Btrfs: log changed inodes based on the extent map tree
  Btrfs: add path->really_keep_locks
  Btrfs: do not mark ems as prealloc if we are writing to them
  Btrfs: keep track of the extents original block length
  Btrfs: inline csums if we're fsyncing
  Btrfs: don't bother copying if we're only logging the inode
  ...

fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/extent_map.c
fs/btrfs/file.c
fs/btrfs/ioctl.c
fs/btrfs/ordered-data.h

diff --combined fs/btrfs/disk-io.c
@@@ -45,6 -45,7 +45,7 @@@
  #include "inode-map.h"
  #include "check-integrity.h"
  #include "rcu-string.h"
+ #include "dev-replace.h"
  
  #ifdef CONFIG_X86
  #include <asm/cpufeature.h>
@@@ -387,7 -388,7 +388,7 @@@ static int btree_read_extent_buffer_pag
                if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
                        break;
  
-               num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
+               num_copies = btrfs_num_copies(root->fs_info,
                                              eb->start, eb->len);
                if (num_copies == 1)
                        break;
@@@ -852,11 -853,16 +853,16 @@@ static int __btree_submit_bio_done(stru
                                 int mirror_num, unsigned long bio_flags,
                                 u64 bio_offset)
  {
+       int ret;
        /*
         * when we're called for a write, we're already in the async
         * submission context.  Just jump into btrfs_map_bio
         */
-       return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
+       ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
+       if (ret)
+               bio_endio(bio, ret);
+       return ret;
  }
  
  static int check_async_write(struct inode *inode, unsigned long bio_flags)
@@@ -878,7 -884,6 +884,6 @@@ static int btree_submit_bio_hook(struc
        int ret;
  
        if (!(rw & REQ_WRITE)) {
                /*
                 * called for a read, do the setup so that checksum validation
                 * can happen in the async kernel threads
                 */
                ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
                                          bio, 1);
                if (ret)
-                       return ret;
-               return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
-                                    mirror_num, 0);
+                       goto out_w_error;
+               ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
+                                   mirror_num, 0);
        } else if (!async) {
                ret = btree_csum_one_bio(bio);
                if (ret)
-                       return ret;
-               return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
-                                    mirror_num, 0);
+                       goto out_w_error;
+               ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
+                                   mirror_num, 0);
+       } else {
+               /*
+                * kthread helpers are used to submit writes so that
+                * checksumming can happen in parallel across all CPUs
+                */
+               ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
+                                         inode, rw, bio, mirror_num, 0,
+                                         bio_offset,
+                                         __btree_submit_bio_start,
+                                         __btree_submit_bio_done);
        }
  
-       /*
-        * kthread helpers are used to submit writes so that checksumming
-        * can happen in parallel across all CPUs
-        */
-       return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
-                                  inode, rw, bio, mirror_num, 0,
-                                  bio_offset,
-                                  __btree_submit_bio_start,
-                                  __btree_submit_bio_done);
+       if (ret) {
+ out_w_error:
+               bio_endio(bio, ret);
+       }
+       return ret;
  }
  
  #ifdef CONFIG_MIGRATION
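
The restructured btree_submit_bio_hook() above now completes the bio itself
when submission fails, instead of only returning an error that the async
path may never see. A minimal sketch of the pattern (the wrapper name is
hypothetical; btrfs_map_bio() and the two-argument bio_endio() of this era
are real):

    static int my_submit_hook(struct btrfs_root *root, int rw,
                              struct bio *bio, int mirror_num)
    {
            int ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);

            /*
             * nobody else will complete this bio once mapping has
             * failed, so end it here with the error instead of
             * leaking a stuck bio
             */
            if (ret)
                    bio_endio(bio, ret);
            return ret;
    }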
@@@ -990,6 -1001,7 +1001,7 @@@ static void btree_invalidatepage(struc
  
  static int btree_set_page_dirty(struct page *page)
  {
+ #ifdef DEBUG
        struct extent_buffer *eb;
  
        BUG_ON(!PagePrivate(page));
        eb = (struct extent_buffer *)page->private;
        BUG_ON(!eb);
        BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
        BUG_ON(!atomic_read(&eb->refs));
        btrfs_assert_tree_locked(eb);
+ #endif
        return __set_page_dirty_nobuffers(page);
  }
  
@@@ -1129,11 -1142,11 +1142,11 @@@ void clean_tree_block(struct btrfs_tran
                                          root->fs_info->dirty_metadata_bytes);
                        }
                        spin_unlock(&root->fs_info->delalloc_lock);
-               }
  
-               /* ugh, clear_extent_buffer_dirty needs to lock the page */
-               btrfs_set_lock_blocking(buf);
-               clear_extent_buffer_dirty(buf);
+                       /* ugh, clear_extent_buffer_dirty needs to lock the page */
+                       btrfs_set_lock_blocking(buf);
+                       clear_extent_buffer_dirty(buf);
+               }
        }
  }
  
@@@ -1193,7 -1206,7 +1206,7 @@@ static void __setup_root(u32 nodesize, 
        root->root_key.objectid = objectid;
        root->anon_dev = 0;
  
-       spin_lock_init(&root->root_times_lock);
+       spin_lock_init(&root->root_item_lock);
  }
  
  static int __must_check find_and_setup_root(struct btrfs_root *tree_root,
@@@ -2131,6 -2144,11 +2144,11 @@@ int open_ctree(struct super_block *sb
        init_rwsem(&fs_info->extent_commit_sem);
        init_rwsem(&fs_info->cleanup_work_sem);
        init_rwsem(&fs_info->subvol_sem);
+       fs_info->dev_replace.lock_owner = 0;
+       atomic_set(&fs_info->dev_replace.nesting_level, 0);
+       mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
+       mutex_init(&fs_info->dev_replace.lock_management_lock);
+       mutex_init(&fs_info->dev_replace.lock);
  
        spin_lock_init(&fs_info->qgroup_lock);
        fs_info->qgroup_tree = RB_ROOT;
                           fs_info->thread_pool_size,
                           &fs_info->generic_worker);
  
+       btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc",
+                          fs_info->thread_pool_size,
+                          &fs_info->generic_worker);
        btrfs_init_workers(&fs_info->submit_workers, "submit",
                           min_t(u64, fs_devices->num_devices,
                           fs_info->thread_pool_size),
        ret |= btrfs_start_workers(&fs_info->delayed_workers);
        ret |= btrfs_start_workers(&fs_info->caching_workers);
        ret |= btrfs_start_workers(&fs_info->readahead_workers);
+       ret |= btrfs_start_workers(&fs_info->flush_workers);
        if (ret) {
                err = -ENOMEM;
                goto fail_sb_buffer;
                goto fail_tree_roots;
        }
  
-       btrfs_close_extra_devices(fs_devices);
+       /*
+        * keep the device that is marked to be the target device for the
+        * dev_replace procedure
+        */
+       btrfs_close_extra_devices(fs_info, fs_devices, 0);
  
        if (!fs_devices->latest_bdev) {
                printk(KERN_CRIT "btrfs: failed to read devices on %s\n",
@@@ -2490,6 -2517,14 +2517,14 @@@ retry_root_backup
                goto fail_block_groups;
        }
  
+       ret = btrfs_init_dev_replace(fs_info);
+       if (ret) {
+               pr_err("btrfs: failed to init dev_replace: %d\n", ret);
+               goto fail_block_groups;
+       }
+       btrfs_close_extra_devices(fs_info, fs_devices, 1);
        ret = btrfs_init_space_info(fs_info);
        if (ret) {
                printk(KERN_ERR "Failed to initial space info: %d\n", ret);
        }
        fs_info->num_tolerated_disk_barrier_failures =
                btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
+       if (fs_info->fs_devices->missing_devices >
+            fs_info->num_tolerated_disk_barrier_failures &&
+           !(sb->s_flags & MS_RDONLY)) {
+               printk(KERN_WARNING
+                      "Btrfs: too many missing devices, writeable mount is not allowed\n");
+               goto fail_block_groups;
+       }
  
        fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
                                               "btrfs-cleaner");
                return ret;
        }
  
+       ret = btrfs_resume_dev_replace_async(fs_info);
+       if (ret) {
+               pr_warn("btrfs: failed to resume dev_replace\n");
+               close_ctree(tree_root);
+               return ret;
+       }
        return 0;
  
  fail_qgroup:
@@@ -2667,6 -2716,7 +2716,7 @@@ fail_sb_buffer
        btrfs_stop_workers(&fs_info->submit_workers);
        btrfs_stop_workers(&fs_info->delayed_workers);
        btrfs_stop_workers(&fs_info->caching_workers);
+       btrfs_stop_workers(&fs_info->flush_workers);
  fail_alloc:
  fail_iput:
        btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@@ -3270,16 -3320,18 +3320,18 @@@ int close_ctree(struct btrfs_root *root
        smp_mb();
  
        /* pause restriper - we want to resume on mount */
-       btrfs_pause_balance(root->fs_info);
+       btrfs_pause_balance(fs_info);
  
-       btrfs_scrub_cancel(root);
+       btrfs_dev_replace_suspend_for_unmount(fs_info);
+       btrfs_scrub_cancel(fs_info);
  
        /* wait for any defraggers to finish */
        wait_event(fs_info->transaction_wait,
                   (atomic_read(&fs_info->defrag_running) == 0));
  
        /* clear out the rbtree of defraggable inodes */
-       btrfs_run_defrag_inodes(fs_info);
+       btrfs_cleanup_defrag_inodes(fs_info);
  
        if (!(fs_info->sb->s_flags & MS_RDONLY)) {
                ret = btrfs_commit_super(root);
        btrfs_stop_workers(&fs_info->delayed_workers);
        btrfs_stop_workers(&fs_info->caching_workers);
        btrfs_stop_workers(&fs_info->readahead_workers);
+       btrfs_stop_workers(&fs_info->flush_workers);
  
  #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
        if (btrfs_test_opt(root, CHECK_INTEGRITY))
@@@ -3383,14 -3436,12 +3436,12 @@@ void btrfs_mark_buffer_dirty(struct ext
        int was_dirty;
  
        btrfs_assert_tree_locked(buf);
-       if (transid != root->fs_info->generation) {
-               printk(KERN_CRIT "btrfs transid mismatch buffer %llu, "
+       if (transid != root->fs_info->generation)
+               WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, "
                       "found %llu running %llu\n",
                        (unsigned long long)buf->start,
                        (unsigned long long)transid,
                        (unsigned long long)root->fs_info->generation);
-               WARN_ON(1);
-       }
        was_dirty = set_extent_buffer_dirty(buf);
        if (!was_dirty) {
                spin_lock(&root->fs_info->delalloc_lock);
        }
  }
  
- void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
+ static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
+                                       int flush_delayed)
  {
        /*
         * looks as though older kernels can get into trouble with
        if (current->flags & PF_MEMALLOC)
                return;
  
-       btrfs_balance_delayed_items(root);
+       if (flush_delayed)
+               btrfs_balance_delayed_items(root);
  
        num_dirty = root->fs_info->dirty_metadata_bytes;
  
        if (num_dirty > thresh) {
 -              balance_dirty_pages_ratelimited_nr(
 -                                 root->fs_info->btree_inode->i_mapping, 1);
 +              balance_dirty_pages_ratelimited(
 +                                 root->fs_info->btree_inode->i_mapping);
        }
        return;
  }
  
- void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
+ void btrfs_btree_balance_dirty(struct btrfs_root *root)
  {
-       /*
-        * looks as though older kernels can get into trouble with
-        * this code, they end up stuck in balance_dirty_pages forever
-        */
-       u64 num_dirty;
-       unsigned long thresh = 32 * 1024 * 1024;
-       if (current->flags & PF_MEMALLOC)
-               return;
-       num_dirty = root->fs_info->dirty_metadata_bytes;
+       __btrfs_btree_balance_dirty(root, 1);
+ }
  
-       if (num_dirty > thresh) {
-               balance_dirty_pages_ratelimited(
-                                  root->fs_info->btree_inode->i_mapping);
-       }
-       return;
+ void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root)
+ {
+       __btrfs_btree_balance_dirty(root, 0);
  }
  
  int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
diff --combined fs/btrfs/extent-tree.c
@@@ -33,6 -33,7 +33,7 @@@
  #include "volumes.h"
  #include "locking.h"
  #include "free-space-cache.h"
+ #include "math.h"
  
  #undef SCRAMBLE_DELAYED_REFS
  
@@@ -649,24 -650,6 +650,6 @@@ void btrfs_clear_space_info_full(struc
        rcu_read_unlock();
  }
  
- static u64 div_factor(u64 num, int factor)
- {
-       if (factor == 10)
-               return num;
-       num *= factor;
-       do_div(num, 10);
-       return num;
- }
- static u64 div_factor_fine(u64 num, int factor)
- {
-       if (factor == 100)
-               return num;
-       num *= factor;
-       do_div(num, 100);
-       return num;
- }
  u64 btrfs_find_block_group(struct btrfs_root *root,
                           u64 search_start, u64 search_hint, int owner)
  {
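
The div_factor helpers removed above move into the new "math.h" header
included at the top of this file. A sketch of that header, reconstructed
from the removed bodies (the guard name and exact layout are assumptions):

    /* fs/btrfs/math.h */
    #ifndef __BTRFS_MATH_H
    #define __BTRFS_MATH_H

    #include <asm/div64.h>

    static inline u64 div_factor(u64 num, int factor)
    {
            if (factor == 10)
                    return num;
            num *= factor;
            do_div(num, 10);        /* scale by factor/10 */
            return num;
    }

    static inline u64 div_factor_fine(u64 num, int factor)
    {
            if (factor == 100)
                    return num;
            num *= factor;
            do_div(num, 100);       /* scale by factor/100 */
            return num;
    }

    #endif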
@@@ -1835,7 -1818,7 +1818,7 @@@ static int btrfs_discard_extent(struct 
  
  
        /* Tell the block device(s) that the sectors can be discarded */
-       ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
+       ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
                              bytenr, &num_bytes, &bbio, 0);
        /* Error condition is -ENOMEM */
        if (!ret) {
@@@ -2314,6 -2297,9 +2297,9 @@@ static noinline int run_clustered_refs(
                                kfree(extent_op);
  
                                if (ret) {
+                                       list_del_init(&locked_ref->cluster);
+                                       mutex_unlock(&locked_ref->mutex);
                                        printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret);
                                        spin_lock(&delayed_refs->lock);
                                        return ret;
                count++;
  
                if (ret) {
+                       if (locked_ref) {
+                               list_del_init(&locked_ref->cluster);
+                               mutex_unlock(&locked_ref->mutex);
+                       }
                        printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret);
                        spin_lock(&delayed_refs->lock);
                        return ret;
@@@ -3661,7 -3651,7 +3651,7 @@@ out
  
  static int can_overcommit(struct btrfs_root *root,
                          struct btrfs_space_info *space_info, u64 bytes,
-                         int flush)
+                         enum btrfs_reserve_flush_enum flush)
  {
        u64 profile = btrfs_get_alloc_profile(root, 0);
        u64 avail;
                avail >>= 1;
  
        /*
-        * If we aren't flushing don't let us overcommit too much, say
-        * 1/8th of the space.  If we can flush, let it overcommit up to
-        * 1/2 of the space.
+        * If we aren't flushing all things, let us overcommit up to
+        * 1/2 of the space. If we can flush, don't let us overcommit
+        * too much, let it overcommit up to 1/8 of the space.
         */
-       if (flush)
+       if (flush == BTRFS_RESERVE_FLUSH_ALL)
                avail >>= 3;
        else
                avail >>= 1;
        return 0;
  }
  
+ static int writeback_inodes_sb_nr_if_idle_safe(struct super_block *sb,
+                                              unsigned long nr_pages,
+                                              enum wb_reason reason)
+ {
+       if (!writeback_in_progress(sb->s_bdi) &&
+           down_read_trylock(&sb->s_umount)) {
+               writeback_inodes_sb_nr(sb, nr_pages, reason);
+               up_read(&sb->s_umount);
+               return 1;
+       }
+       return 0;
+ }
  /*
   * shrink metadata reservation for delalloc
   */
@@@ -3713,6 -3717,7 +3717,7 @@@ static void shrink_delalloc(struct btrf
        long time_left;
        unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
        int loops = 0;
+       enum btrfs_reserve_flush_enum flush;
  
        trans = (struct btrfs_trans_handle *)current->journal_info;
        block_rsv = &root->fs_info->delalloc_block_rsv;
        while (delalloc_bytes && loops < 3) {
                max_reclaim = min(delalloc_bytes, to_reclaim);
                nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
-               writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
-                                              WB_REASON_FS_FREE_SPACE);
+               writeback_inodes_sb_nr_if_idle_safe(root->fs_info->sb,
+                                                   nr_pages,
+                                                   WB_REASON_FS_FREE_SPACE);
  
                /*
                 * We need to wait for the async pages to actually start before
                wait_event(root->fs_info->async_submit_wait,
                           !atomic_read(&root->fs_info->async_delalloc_pages));
  
+               if (!trans)
+                       flush = BTRFS_RESERVE_FLUSH_ALL;
+               else
+                       flush = BTRFS_RESERVE_NO_FLUSH;
                spin_lock(&space_info->lock);
-               if (can_overcommit(root, space_info, orig, !trans)) {
+               if (can_overcommit(root, space_info, orig, flush)) {
                        spin_unlock(&space_info->lock);
                        break;
                }
@@@ -3888,7 -3898,7 +3898,7 @@@ static int flush_space(struct btrfs_roo
   * @root - the root we're allocating for
   * @block_rsv - the block_rsv we're allocating for
   * @orig_bytes - the number of bytes we want
 - * @flush - wether or not we can flush to make our reservation
 + * @flush - whether or not we can flush to make our reservation
   *
   * This will reserve orig_bytes number of bytes from the space info associated
   * with the block_rsv.  If there is not enough space it will make an attempt to
   */
  static int reserve_metadata_bytes(struct btrfs_root *root,
                                  struct btrfs_block_rsv *block_rsv,
-                                 u64 orig_bytes, int flush)
+                                 u64 orig_bytes,
+                                 enum btrfs_reserve_flush_enum flush)
  {
        struct btrfs_space_info *space_info = block_rsv->space_info;
        u64 used;
@@@ -3912,10 -3923,11 +3923,11 @@@ again
        ret = 0;
        spin_lock(&space_info->lock);
        /*
-        * We only want to wait if somebody other than us is flushing and we are
-        * actually alloed to flush.
+        * We only want to wait if somebody other than us is flushing and we
+        * are actually allowed to flush all things.
         */
-       while (flush && !flushing && space_info->flush) {
+       while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing &&
+              space_info->flush) {
                spin_unlock(&space_info->lock);
                /*
                 * If we have a trans handle we can't wait because the flusher
         * Couldn't make our reservation, save our place so while we're trying
         * to reclaim space we can actually use it instead of somebody else
         * stealing it from us.
+        *
+        * We make the other tasks wait for the flush only when we can flush
+        * all things.
         */
-       if (ret && flush) {
+       if (ret && flush == BTRFS_RESERVE_FLUSH_ALL) {
                flushing = true;
                space_info->flush = 1;
        }
  
        spin_unlock(&space_info->lock);
  
-       if (!ret || !flush)
+       if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
                goto out;
  
        ret = flush_space(root, space_info, num_bytes, orig_bytes,
                          flush_state);
        flush_state++;
+       /*
+        * If we are FLUSH_LIMIT, we cannot flush delalloc, or a deadlock
+        * could happen, so skip the delalloc flush.
+        */
+       if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
+           (flush_state == FLUSH_DELALLOC ||
+            flush_state == FLUSH_DELALLOC_WAIT))
+               flush_state = ALLOC_CHUNK;
        if (!ret)
                goto again;
-       else if (flush_state <= COMMIT_TRANS)
+       else if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
+                flush_state < COMMIT_TRANS)
+               goto again;
+       else if (flush == BTRFS_RESERVE_FLUSH_ALL &&
+                flush_state <= COMMIT_TRANS)
                goto again;
  
  out:
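
The three flush levels threaded through this hunk come from a new enum; a
sketch of the declaration as the code above assumes it (the comments and
exact ctree.h placement are assumptions):

    enum btrfs_reserve_flush_enum {
            /* don't flush anything to satisfy the reservation */
            BTRFS_RESERVE_NO_FLUSH,
            /* flush, but skip the delalloc states that could deadlock */
            BTRFS_RESERVE_FLUSH_LIMIT,
            /* allow every flush state up to and including COMMIT_TRANS */
            BTRFS_RESERVE_FLUSH_ALL,
    };

As a worked example of the can_overcommit() change earlier in this file:
with avail = 8GiB of unallocated space, BTRFS_RESERVE_FLUSH_ALL permits
overcommitting by avail >> 3 = 1GiB, while the weaker levels permit
avail >> 1 = 4GiB, on the theory that a caller who cannot flush needs more
slack up front.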
@@@ -4148,9 -4177,9 +4177,9 @@@ void btrfs_free_block_rsv(struct btrfs_
        kfree(rsv);
  }
  
- static inline int __block_rsv_add(struct btrfs_root *root,
-                                 struct btrfs_block_rsv *block_rsv,
-                                 u64 num_bytes, int flush)
+ int btrfs_block_rsv_add(struct btrfs_root *root,
+                       struct btrfs_block_rsv *block_rsv, u64 num_bytes,
+                       enum btrfs_reserve_flush_enum flush)
  {
        int ret;
  
        return ret;
  }
  
- int btrfs_block_rsv_add(struct btrfs_root *root,
-                       struct btrfs_block_rsv *block_rsv,
-                       u64 num_bytes)
- {
-       return __block_rsv_add(root, block_rsv, num_bytes, 1);
- }
- int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
-                               struct btrfs_block_rsv *block_rsv,
-                               u64 num_bytes)
- {
-       return __block_rsv_add(root, block_rsv, num_bytes, 0);
- }
  int btrfs_block_rsv_check(struct btrfs_root *root,
                          struct btrfs_block_rsv *block_rsv, int min_factor)
  {
        return ret;
  }
  
- static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
-                                          struct btrfs_block_rsv *block_rsv,
-                                          u64 min_reserved, int flush)
+ int btrfs_block_rsv_refill(struct btrfs_root *root,
+                          struct btrfs_block_rsv *block_rsv, u64 min_reserved,
+                          enum btrfs_reserve_flush_enum flush)
  {
        u64 num_bytes = 0;
        int ret = -ENOSPC;
        return ret;
  }
  
- int btrfs_block_rsv_refill(struct btrfs_root *root,
-                          struct btrfs_block_rsv *block_rsv,
-                          u64 min_reserved)
- {
-       return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1);
- }
- int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
-                                  struct btrfs_block_rsv *block_rsv,
-                                  u64 min_reserved)
- {
-       return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0);
- }
  int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
                            struct btrfs_block_rsv *dst_rsv,
                            u64 num_bytes)
@@@ -4532,17 -4533,27 +4533,27 @@@ int btrfs_delalloc_reserve_metadata(str
        u64 csum_bytes;
        unsigned nr_extents = 0;
        int extra_reserve = 0;
-       int flush = 1;
+       enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
        int ret;
+       bool delalloc_lock = true;
  
-       /* Need to be holding the i_mutex here if we aren't free space cache */
-       if (btrfs_is_free_space_inode(inode))
-               flush = 0;
+       /* If we are a free space inode we need to not flush since we will be in
+        * the middle of a transaction commit.  We also don't need the delalloc
+        * mutex since we won't race with anybody.  We need this mostly to make
+        * lockdep shut its filthy mouth.
+        */
+       if (btrfs_is_free_space_inode(inode)) {
+               flush = BTRFS_RESERVE_NO_FLUSH;
+               delalloc_lock = false;
+       }
  
-       if (flush && btrfs_transaction_in_commit(root->fs_info))
+       if (flush != BTRFS_RESERVE_NO_FLUSH &&
+           btrfs_transaction_in_commit(root->fs_info))
                schedule_timeout(1);
  
-       mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
+       if (delalloc_lock)
+               mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
        num_bytes = ALIGN(num_bytes, root->sectorsize);
  
        spin_lock(&BTRFS_I(inode)->lock);
                ret = btrfs_qgroup_reserve(root, num_bytes +
                                           nr_extents * root->leafsize);
                if (ret) {
-                       mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
+                       spin_lock(&BTRFS_I(inode)->lock);
+                       calc_csum_metadata_size(inode, num_bytes, 0);
+                       spin_unlock(&BTRFS_I(inode)->lock);
+                       if (delalloc_lock)
+                               mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
                        return ret;
                }
        }
                                                      btrfs_ino(inode),
                                                      to_free, 0);
                }
-               mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
+               if (root->fs_info->quota_enabled) {
+                       btrfs_qgroup_free(root, num_bytes +
+                                               nr_extents * root->leafsize);
+               }
+               if (delalloc_lock)
+                       mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
                return ret;
        }
  
        }
        BTRFS_I(inode)->reserved_extents += nr_extents;
        spin_unlock(&BTRFS_I(inode)->lock);
-       mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
+       if (delalloc_lock)
+               mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
  
        if (to_reserve)
                trace_btrfs_space_reservation(root->fs_info,"delalloc",
@@@ -4969,9 -4991,13 +4991,13 @@@ static int unpin_extent_range(struct bt
  {
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_block_group_cache *cache = NULL;
+       struct btrfs_space_info *space_info;
+       struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
        u64 len;
+       bool readonly;
  
        while (start <= end) {
+               readonly = false;
                if (!cache ||
                    start >= cache->key.objectid + cache->key.offset) {
                        if (cache)
                }
  
                start += len;
+               space_info = cache->space_info;
  
-               spin_lock(&cache->space_info->lock);
+               spin_lock(&space_info->lock);
                spin_lock(&cache->lock);
                cache->pinned -= len;
-               cache->space_info->bytes_pinned -= len;
-               if (cache->ro)
-                       cache->space_info->bytes_readonly += len;
+               space_info->bytes_pinned -= len;
+               if (cache->ro) {
+                       space_info->bytes_readonly += len;
+                       readonly = true;
+               }
                spin_unlock(&cache->lock);
-               spin_unlock(&cache->space_info->lock);
+               if (!readonly && global_rsv->space_info == space_info) {
+                       spin_lock(&global_rsv->lock);
+                       if (!global_rsv->full) {
+                               len = min(len, global_rsv->size -
+                                         global_rsv->reserved);
+                               global_rsv->reserved += len;
+                               space_info->bytes_may_use += len;
+                               if (global_rsv->reserved >= global_rsv->size)
+                                       global_rsv->full = 1;
+                       }
+                       spin_unlock(&global_rsv->lock);
+               }
+               spin_unlock(&space_info->lock);
        }
  
        if (cache)
@@@ -5466,7 -5507,7 +5507,7 @@@ wait_block_group_cache_done(struct btrf
        return 0;
  }
  
- static int __get_block_group_index(u64 flags)
+ int __get_raid_index(u64 flags)
  {
        int index;
  
  
  static int get_block_group_index(struct btrfs_block_group_cache *cache)
  {
-       return __get_block_group_index(cache->flags);
+       return __get_raid_index(cache->flags);
  }
  
  enum btrfs_loop_type {
@@@ -6269,7 -6310,8 +6310,8 @@@ use_block_rsv(struct btrfs_trans_handl
        block_rsv = get_block_rsv(trans, root);
  
        if (block_rsv->size == 0) {
-               ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
+               ret = reserve_metadata_bytes(root, block_rsv, blocksize,
+                                            BTRFS_RESERVE_NO_FLUSH);
                /*
                 * If we couldn't reserve metadata bytes try and use some from
                 * the global reserve.
                static DEFINE_RATELIMIT_STATE(_rs,
                                DEFAULT_RATELIMIT_INTERVAL,
                                /*DEFAULT_RATELIMIT_BURST*/ 2);
-               if (__ratelimit(&_rs)) {
-                       printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret);
-                       WARN_ON(1);
-               }
-               ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
+               if (__ratelimit(&_rs))
+                       WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n",
+                            ret);
+               ret = reserve_metadata_bytes(root, block_rsv, blocksize,
+                                            BTRFS_RESERVE_NO_FLUSH);
                if (!ret) {
                        return block_rsv;
                } else if (ret && block_rsv != global_rsv) {
@@@ -7427,7 -7469,7 +7469,7 @@@ int btrfs_can_relocate(struct btrfs_roo
         */
        target = get_restripe_target(root->fs_info, block_group->flags);
        if (target) {
-               index = __get_block_group_index(extended_to_chunk(target));
+               index = __get_raid_index(extended_to_chunk(target));
        } else {
                /*
                 * this is just a balance, so if we were marked as full
                 * check to make sure we can actually find a chunk with enough
                 * space to fit our block group in.
                 */
-               if (device->total_bytes > device->bytes_used + min_free) {
+               if (device->total_bytes > device->bytes_used + min_free &&
+                   !device->is_tgtdev_for_dev_replace) {
                        ret = find_free_dev_extent(device, min_free,
                                                   &dev_offset, NULL);
                        if (!ret)
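
The global-reserve refill added to unpin_extent_range() earlier in this file
can be sanity-checked with a small standalone computation (sizes
hypothetical, in MiB):

    #include <stdio.h>

    int main(void)
    {
            unsigned long long size = 512;      /* global_rsv->size */
            unsigned long long reserved = 400;  /* global_rsv->reserved */
            unsigned long long len = 200;       /* extent just unpinned */

            if (reserved < size) {
                    /* mirrors: len = min(len, global_rsv->size -
                     *                    global_rsv->reserved);       */
                    unsigned long long take =
                            len < size - reserved ? len : size - reserved;
                    reserved += take;   /* 512: the reserve is now full */
                    /* prints "refilled 112 MiB, 88 MiB become free space" */
                    printf("refilled %llu MiB, %llu MiB become free space\n",
                           take, len - take);
            }
            return 0;
    }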
diff --combined fs/btrfs/extent_map.c
@@@ -49,7 -49,7 +49,7 @@@ void extent_map_tree_init(struct extent
  struct extent_map *alloc_extent_map(void)
  {
        struct extent_map *em;
-       em = kmem_cache_alloc(extent_map_cache, GFP_NOFS);
+       em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);
        if (!em)
                return NULL;
        em->in_tree = 0;
@@@ -198,16 -198,15 +198,15 @@@ static void try_merge_map(struct extent
                        merge = rb_entry(rb, struct extent_map, rb_node);
                if (rb && mergable_maps(merge, em)) {
                        em->start = merge->start;
+                       em->orig_start = merge->orig_start;
                        em->len += merge->len;
                        em->block_len += merge->block_len;
                        em->block_start = merge->block_start;
                        merge->in_tree = 0;
-                       if (merge->generation > em->generation) {
-                               em->mod_start = em->start;
-                               em->mod_len = em->len;
-                               em->generation = merge->generation;
-                               list_move(&em->list, &tree->modified_extents);
-                       }
+                       em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
+                       em->mod_start = merge->mod_start;
+                       em->generation = max(em->generation, merge->generation);
+                       list_move(&em->list, &tree->modified_extents);
  
                        list_del_init(&merge->list);
                        rb_erase(&merge->rb_node, &tree->map);
                em->block_len += merge->len;
                rb_erase(&merge->rb_node, &tree->map);
                merge->in_tree = 0;
-               if (merge->generation > em->generation) {
-                       em->mod_len = em->len;
-                       em->generation = merge->generation;
-                       list_move(&em->list, &tree->modified_extents);
-               }
+               em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
+               em->generation = max(em->generation, merge->generation);
                list_del_init(&merge->list);
                free_extent_map(merge);
        }
  }
  
  /**
 - * unpint_extent_cache - unpin an extent from the cache
 + * unpin_extent_cache - unpin an extent from the cache
   * @tree:     tree to unpin the extent in
   * @start:    logical offset in the file
   * @len:      length of the extent
   * @gen:      generation that this extent has been modified in
 - * @prealloc: if this is set we need to clear the prealloc flag
   *
   * Called after an extent has been written to disk properly.  Set the generation
   * to the generation that actually added the file item to the inode so we know
@@@ -265,9 -262,9 +261,9 @@@ int unpin_extent_cache(struct extent_ma
        em->mod_start = em->start;
        em->mod_len = em->len;
  
-       if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+       if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) {
                prealloc = true;
-               clear_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+               clear_bit(EXTENT_FLAG_FILLING, &em->flags);
        }
  
        try_merge_map(tree, em);
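
The mod_start/mod_len bookkeeping that replaces the generation check in
try_merge_map() above can be checked with a small standalone program
(offsets hypothetical):

    #include <stdio.h>

    /* stand-in for the extent map's mod_start/mod_len pair */
    struct range { unsigned long long start, len; };

    /* mirrors: em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
     *          em->mod_start = merge->mod_start;                            */
    static struct range fold_front(struct range em, struct range merge)
    {
            struct range out;

            out.len = (em.len + em.start) - merge.start;
            out.start = merge.start;
            return out;
    }

    int main(void)
    {
            struct range em = { 8192, 4096 };       /* [8192, 12288) */
            struct range merge = { 4096, 4096 };    /* [4096,  8192) */
            struct range out = fold_front(em, merge);

            /* prints "merged: [4096, 12288)" - the union of both ranges */
            printf("merged: [%llu, %llu)\n", out.start, out.start + out.len);
            return 0;
    }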
diff --combined fs/btrfs/file.c
@@@ -41,6 -41,7 +41,7 @@@
  #include "compat.h"
  #include "volumes.h"
  
+ static struct kmem_cache *btrfs_inode_defrag_cachep;
  /*
   * when auto defrag is enabled we
   * queue up these defrag structs to remember which
@@@ -90,7 -91,7 +91,7 @@@ static int __compare_inode_defrag(struc
   * If an existing record is found the defrag item you
   * pass in is freed
   */
- static void __btrfs_add_inode_defrag(struct inode *inode,
+ static int __btrfs_add_inode_defrag(struct inode *inode,
                                    struct inode_defrag *defrag)
  {
        struct btrfs_root *root = BTRFS_I(inode)->root;
                                entry->transid = defrag->transid;
                        if (defrag->last_offset > entry->last_offset)
                                entry->last_offset = defrag->last_offset;
-                       goto exists;
+                       return -EEXIST;
                }
        }
        set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
        rb_link_node(&defrag->rb_node, parent, p);
        rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
-       return;
+       return 0;
+ }
  
- exists:
-       kfree(defrag);
-       return;
+ static inline int __need_auto_defrag(struct btrfs_root *root)
+ {
+       if (!btrfs_test_opt(root, AUTO_DEFRAG))
+               return 0;
+       if (btrfs_fs_closing(root->fs_info))
+               return 0;
  
+       return 1;
  }
  
  /*
@@@ -142,11 -149,9 +149,9 @@@ int btrfs_add_inode_defrag(struct btrfs
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct inode_defrag *defrag;
        u64 transid;
+       int ret;
  
-       if (!btrfs_test_opt(root, AUTO_DEFRAG))
-               return 0;
-       if (btrfs_fs_closing(root->fs_info))
+       if (!__need_auto_defrag(root))
                return 0;
  
        if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
        else
                transid = BTRFS_I(inode)->root->last_trans;
  
-       defrag = kzalloc(sizeof(*defrag), GFP_NOFS);
+       defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
        if (!defrag)
                return -ENOMEM;
  
        defrag->root = root->root_key.objectid;
  
        spin_lock(&root->fs_info->defrag_inodes_lock);
-       if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
-               __btrfs_add_inode_defrag(inode, defrag);
-       else
-               kfree(defrag);
+       if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) {
+               /*
+                * If we set IN_DEFRAG flag and evict the inode from memory,
+                * and then re-read this inode, this new inode doesn't have
+                * IN_DEFRAG flag. In that case, we may find the existing defrag.
+                */
+               ret = __btrfs_add_inode_defrag(inode, defrag);
+               if (ret)
+                       kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+       } else {
+               kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+       }
        spin_unlock(&root->fs_info->defrag_inodes_lock);
        return 0;
  }
  
  /*
-  * must be called with the defrag_inodes lock held
+  * Requeue the defrag object. If there is a defrag object that points to
+  * the same inode in the tree, we will merge them together (by
+  * __btrfs_add_inode_defrag()) and free the one that we want to requeue.
   */
- struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
-                                            u64 root, u64 ino,
-                                            struct rb_node **next)
+ void btrfs_requeue_inode_defrag(struct inode *inode,
+                               struct inode_defrag *defrag)
+ {
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       int ret;
+       if (!__need_auto_defrag(root))
+               goto out;
+       /*
+        * Here we don't check the IN_DEFRAG flag, because we need merge
+        * them together.
+        */
+       spin_lock(&root->fs_info->defrag_inodes_lock);
+       ret = __btrfs_add_inode_defrag(inode, defrag);
+       spin_unlock(&root->fs_info->defrag_inodes_lock);
+       if (ret)
+               goto out;
+       return;
+ out:
+       kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+ }
+ /*
+  * pick the defraggable inode that we want; if it doesn't exist, we will get
+  * the next one.
+  */
+ static struct inode_defrag *
+ btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino)
  {
        struct inode_defrag *entry = NULL;
        struct inode_defrag tmp;
        tmp.ino = ino;
        tmp.root = root;
  
-       p = info->defrag_inodes.rb_node;
+       spin_lock(&fs_info->defrag_inodes_lock);
+       p = fs_info->defrag_inodes.rb_node;
        while (p) {
                parent = p;
                entry = rb_entry(parent, struct inode_defrag, rb_node);
                else if (ret > 0)
                        p = parent->rb_right;
                else
-                       return entry;
+                       goto out;
        }
  
-       if (next) {
-               while (parent && __compare_inode_defrag(&tmp, entry) > 0) {
-                       parent = rb_next(parent);
+       if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
+               parent = rb_next(parent);
+               if (parent)
                        entry = rb_entry(parent, struct inode_defrag, rb_node);
-               }
-               *next = parent;
+               else
+                       entry = NULL;
        }
-       return NULL;
+ out:
+       if (entry)
+               rb_erase(parent, &fs_info->defrag_inodes);
+       spin_unlock(&fs_info->defrag_inodes_lock);
+       return entry;
  }
  
- /*
-  * run through the list of inodes in the FS that need
-  * defragging
-  */
- int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
+ void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
  {
        struct inode_defrag *defrag;
+       struct rb_node *node;
+       spin_lock(&fs_info->defrag_inodes_lock);
+       node = rb_first(&fs_info->defrag_inodes);
+       while (node) {
+               rb_erase(node, &fs_info->defrag_inodes);
+               defrag = rb_entry(node, struct inode_defrag, rb_node);
+               kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+               if (need_resched()) {
+                       spin_unlock(&fs_info->defrag_inodes_lock);
+                       cond_resched();
+                       spin_lock(&fs_info->defrag_inodes_lock);
+               }
+               node = rb_first(&fs_info->defrag_inodes);
+       }
+       spin_unlock(&fs_info->defrag_inodes_lock);
+ }
+ #define BTRFS_DEFRAG_BATCH    1024
+ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
+                                   struct inode_defrag *defrag)
+ {
        struct btrfs_root *inode_root;
        struct inode *inode;
-       struct rb_node *n;
        struct btrfs_key key;
        struct btrfs_ioctl_defrag_range_args range;
-       u64 first_ino = 0;
-       u64 root_objectid = 0;
        int num_defrag;
-       int defrag_batch = 1024;
  
+       /* get the inode */
+       key.objectid = defrag->root;
+       btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+       key.offset = (u64)-1;
+       inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
+       if (IS_ERR(inode_root)) {
+               kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+               return PTR_ERR(inode_root);
+       }
+       key.objectid = defrag->ino;
+       btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+       key.offset = 0;
+       inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
+       if (IS_ERR(inode)) {
+               kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+               return PTR_ERR(inode);
+       }
+       /* do a chunk of defrag */
+       clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
        memset(&range, 0, sizeof(range));
        range.len = (u64)-1;
+       range.start = defrag->last_offset;
+       sb_start_write(fs_info->sb);
+       num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
+                                      BTRFS_DEFRAG_BATCH);
+       sb_end_write(fs_info->sb);
+       /*
+        * if we filled the whole defrag batch, there
+        * must be more work to do.  Queue this defrag
+        * again
+        */
+       if (num_defrag == BTRFS_DEFRAG_BATCH) {
+               defrag->last_offset = range.start;
+               btrfs_requeue_inode_defrag(inode, defrag);
+       } else if (defrag->last_offset && !defrag->cycled) {
+               /*
+                * we didn't fill our defrag batch, but
+                * we didn't start at zero.  Make sure we loop
+                * around to the start of the file.
+                */
+               defrag->last_offset = 0;
+               defrag->cycled = 1;
+               btrfs_requeue_inode_defrag(inode, defrag);
+       } else {
+               kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+       }
+       iput(inode);
+       return 0;
+ }
+ /*
+  * run through the list of inodes in the FS that need
+  * defragging
+  */
+ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
+ {
+       struct inode_defrag *defrag;
+       u64 first_ino = 0;
+       u64 root_objectid = 0;
  
        atomic_inc(&fs_info->defrag_running);
-       spin_lock(&fs_info->defrag_inodes_lock);
        while(1) {
-               n = NULL;
+               if (!__need_auto_defrag(fs_info->tree_root))
+                       break;
  
                /* find an inode to defrag */
-               defrag = btrfs_find_defrag_inode(fs_info, root_objectid,
-                                                first_ino, &n);
+               defrag = btrfs_pick_defrag_inode(fs_info, root_objectid,
+                                                first_ino);
                if (!defrag) {
-                       if (n) {
-                               defrag = rb_entry(n, struct inode_defrag,
-                                                 rb_node);
-                       } else if (root_objectid || first_ino) {
+                       if (root_objectid || first_ino) {
                                root_objectid = 0;
                                first_ino = 0;
                                continue;
                        }
                }
  
-               /* remove it from the rbtree */
                first_ino = defrag->ino + 1;
                root_objectid = defrag->root;
-               rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);
-               if (btrfs_fs_closing(fs_info))
-                       goto next_free;
-               spin_unlock(&fs_info->defrag_inodes_lock);
-               /* get the inode */
-               key.objectid = defrag->root;
-               btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
-               key.offset = (u64)-1;
-               inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
-               if (IS_ERR(inode_root))
-                       goto next;
-               key.objectid = defrag->ino;
-               btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
-               key.offset = 0;
-               inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
-               if (IS_ERR(inode))
-                       goto next;
  
-               /* do a chunk of defrag */
-               clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
-               range.start = defrag->last_offset;
-               num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
-                                              defrag_batch);
-               /*
-                * if we filled the whole defrag batch, there
-                * must be more work to do.  Queue this defrag
-                * again
-                */
-               if (num_defrag == defrag_batch) {
-                       defrag->last_offset = range.start;
-                       __btrfs_add_inode_defrag(inode, defrag);
-                       /*
-                        * we don't want to kfree defrag, we added it back to
-                        * the rbtree
-                        */
-                       defrag = NULL;
-               } else if (defrag->last_offset && !defrag->cycled) {
-                       /*
-                        * we didn't fill our defrag batch, but
-                        * we didn't start at zero.  Make sure we loop
-                        * around to the start of the file.
-                        */
-                       defrag->last_offset = 0;
-                       defrag->cycled = 1;
-                       __btrfs_add_inode_defrag(inode, defrag);
-                       defrag = NULL;
-               }
-               iput(inode);
- next:
-               spin_lock(&fs_info->defrag_inodes_lock);
- next_free:
-               kfree(defrag);
+               __btrfs_run_defrag_inode(fs_info, defrag);
        }
-       spin_unlock(&fs_info->defrag_inodes_lock);
        atomic_dec(&fs_info->defrag_running);
  
        /*
@@@ -526,6 -588,8 +588,8 @@@ void btrfs_drop_extent_cache(struct ino
                                split->block_len = em->block_len;
                        else
                                split->block_len = split->len;
+                       split->orig_block_len = max(split->block_len,
+                                                   em->orig_block_len);
                        split->generation = gen;
                        split->bdev = em->bdev;
                        split->flags = flags;
                        split->flags = flags;
                        split->compress_type = em->compress_type;
                        split->generation = gen;
+                       split->orig_block_len = max(em->block_len,
+                                                   em->orig_block_len);
  
                        if (compressed) {
                                split->block_len = em->block_len;
                        } else {
                                split->block_len = split->len;
                                split->block_start = em->block_start + diff;
-                               split->orig_start = split->start;
+                               split->orig_start = em->orig_start;
                        }
  
                        ret = add_extent_mapping(em_tree, split);
@@@ -1346,9 -1412,10 +1412,9 @@@ static noinline ssize_t __btrfs_buffere
  
                cond_resched();
  
 -              balance_dirty_pages_ratelimited_nr(inode->i_mapping,
 -                                                 dirty_pages);
 +              balance_dirty_pages_ratelimited(inode->i_mapping);
                if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
-                       btrfs_btree_balance_dirty(root, 1);
+                       btrfs_btree_balance_dirty(root);
  
                pos += copied;
                num_written += copied;
        return written ? written : err;
  }
  
+ static void update_time_for_write(struct inode *inode)
+ {
+       struct timespec now;
+       if (IS_NOCMTIME(inode))
+               return;
+       now = current_fs_time(inode->i_sb);
+       if (!timespec_equal(&inode->i_mtime, &now))
+               inode->i_mtime = now;
+       if (!timespec_equal(&inode->i_ctime, &now))
+               inode->i_ctime = now;
+       if (IS_I_VERSION(inode))
+               inode_inc_iversion(inode);
+ }
  static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
                                    const struct iovec *iov,
                                    unsigned long nr_segs, loff_t pos)
        ssize_t num_written = 0;
        ssize_t err = 0;
        size_t count, ocount;
+       bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
  
        sb_start_write(inode->i_sb);
  
                goto out;
        }
  
-       err = file_update_time(file);
-       if (err) {
-               mutex_unlock(&inode->i_mutex);
-               goto out;
-       }
+       /*
+        * We reserve space for updating the inode when we reserve space for the
+        * extent we are going to write, so we will enospc out there.  We don't
+        * need to start yet another transaction to update the inode as we will
+        * update the inode when we finish writing whatever data we write.
+        */
+       update_time_for_write(inode);
  
        start_pos = round_down(pos, root->sectorsize);
        if (start_pos > i_size_read(inode)) {
                }
        }
  
+       if (sync)
+               atomic_inc(&BTRFS_I(inode)->sync_writers);
        if (unlikely(file->f_flags & O_DIRECT)) {
                num_written = __btrfs_direct_write(iocb, iov, nr_segs,
                                                   pos, ppos, count, ocount);
         * this will either be one more than the running transaction
         * or the generation used for the next transaction if there isn't
         * one running right now.
+        *
+        * We also have to set last_sub_trans to the current log transid,
+        * otherwise subsequent syncs to a file that's been synced in this
+        * transaction will appear to have already occurred.
         */
        BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
+       BTRFS_I(inode)->last_sub_trans = root->log_transid;
        if (num_written > 0 || num_written == -EIOCBQUEUED) {
                err = generic_write_sync(file, pos, num_written);
                if (err < 0 && num_written > 0)
                        num_written = err;
        }
  out:
+       if (sync)
+               atomic_dec(&BTRFS_I(inode)->sync_writers);
        sb_end_write(inode->i_sb);
        current->backing_dev_info = NULL;
        return num_written ? num_written : err;
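
The sync_writers counter incremented around the write path above lets fsync
tell whether other synchronous writers are in flight (it feeds the "inline
csums if we're fsyncing" commit in the shortlog). A minimal sketch of the
pattern, with hypothetical names standing in for
BTRFS_I(inode)->sync_writers:

    #include <linux/atomic.h>
    #include <linux/types.h>

    struct my_inode_info {
            atomic_t sync_writers;  /* writers in O_DSYNC/IS_SYNC paths */
    };

    static void my_sync_write(struct my_inode_info *ci)
    {
            atomic_inc(&ci->sync_writers);
            /* ... submit data and checksum work ... */
            atomic_dec(&ci->sync_writers);
    }

    /* fsync-side check: with sync writers active, csum inline rather
     * than deferring to the async helper threads */
    static bool my_should_inline_csums(struct my_inode_info *ci)
    {
            return atomic_read(&ci->sync_writers) > 0;
    }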
@@@ -1550,7 -1648,9 +1647,9 @@@ int btrfs_sync_file(struct file *file, 
         * out of the ->i_mutex. If so, we can flush the dirty pages by
         * multi-task, and make the performance up.
         */
+       atomic_inc(&BTRFS_I(inode)->sync_writers);
        ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+       atomic_dec(&BTRFS_I(inode)->sync_writers);
        if (ret)
                return ret;
  
         * range being left.
         */
        atomic_inc(&root->log_batch);
-       btrfs_wait_ordered_range(inode, start, end);
+       btrfs_wait_ordered_range(inode, start, end - start + 1);
        atomic_inc(&root->log_batch);
  
        /*
@@@ -1767,6 -1867,7 +1866,7 @@@ out
  
                hole_em->block_start = EXTENT_MAP_HOLE;
                hole_em->block_len = 0;
+               hole_em->orig_block_len = 0;
                hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
                hole_em->compress_type = BTRFS_COMPRESS_NONE;
                hole_em->generation = trans->transid;
@@@ -1796,48 -1897,51 +1896,51 @@@ static int btrfs_punch_hole(struct inod
        struct btrfs_path *path;
        struct btrfs_block_rsv *rsv;
        struct btrfs_trans_handle *trans;
-       u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
-       u64 lockstart = (offset + mask) & ~mask;
-       u64 lockend = ((offset + len) & ~mask) - 1;
+       u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
+       u64 lockend = round_down(offset + len,
+                                BTRFS_I(inode)->root->sectorsize) - 1;
        u64 cur_offset = lockstart;
        u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
        u64 drop_end;
-       unsigned long nr;
        int ret = 0;
        int err = 0;
-       bool same_page = (offset >> PAGE_CACHE_SHIFT) ==
-               ((offset + len) >> PAGE_CACHE_SHIFT);
+       bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
+                         ((offset + len - 1) >> PAGE_CACHE_SHIFT));
  
        btrfs_wait_ordered_range(inode, offset, len);
  
        mutex_lock(&inode->i_mutex);
-       if (offset >= inode->i_size) {
-               mutex_unlock(&inode->i_mutex);
-               return 0;
-       }
+       /*
+        * We needn't truncate any page which is beyond the end of the file
+        * because we are sure there is no data there.
+        */
        /*
         * Only do this if we are in the same page and we aren't doing the
         * entire page.
         */
        if (same_page && len < PAGE_CACHE_SIZE) {
-               ret = btrfs_truncate_page(inode, offset, len, 0);
+               if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE))
+                       ret = btrfs_truncate_page(inode, offset, len, 0);
                mutex_unlock(&inode->i_mutex);
                return ret;
        }
  
        /* zero back part of the first page */
-       ret = btrfs_truncate_page(inode, offset, 0, 0);
-       if (ret) {
-               mutex_unlock(&inode->i_mutex);
-               return ret;
+       if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
+               ret = btrfs_truncate_page(inode, offset, 0, 0);
+               if (ret) {
+                       mutex_unlock(&inode->i_mutex);
+                       return ret;
+               }
        }
  
        /* zero the front end of the last page */
-       ret = btrfs_truncate_page(inode, offset + len, 0, 1);
-       if (ret) {
-               mutex_unlock(&inode->i_mutex);
-               return ret;
+       if (offset + len < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
+               ret = btrfs_truncate_page(inode, offset + len, 0, 1);
+               if (ret) {
+                       mutex_unlock(&inode->i_mutex);
+                       return ret;
+               }
        }
  
        if (lockend < lockstart) {
                        break;
                }
  
-               nr = trans->blocks_used;
                btrfs_end_transaction(trans, root);
-               btrfs_btree_balance_dirty(root, nr);
+               btrfs_btree_balance_dirty(root);
  
                trans = btrfs_start_transaction(root, 3);
                if (IS_ERR(trans)) {
@@@ -1963,11 -2066,13 +2065,13 @@@ out_trans
        if (!trans)
                goto out_free;
  
+       inode_inc_iversion(inode);
+       inode->i_mtime = inode->i_ctime = CURRENT_TIME;
        trans->block_rsv = &root->fs_info->trans_block_rsv;
        ret = btrfs_update_inode(trans, root, inode);
-       nr = trans->blocks_used;
        btrfs_end_transaction(trans, root);
-       btrfs_btree_balance_dirty(root, nr);
+       btrfs_btree_balance_dirty(root);
  out_free:
        btrfs_free_path(path);
        btrfs_free_block_rsv(root, rsv);
@@@ -1991,12 -2096,12 +2095,12 @@@ static long btrfs_fallocate(struct fil
        u64 alloc_end;
        u64 alloc_hint = 0;
        u64 locked_end;
-       u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
        struct extent_map *em;
+       int blocksize = BTRFS_I(inode)->root->sectorsize;
        int ret;
  
-       alloc_start = offset & ~mask;
-       alloc_end =  (offset + len + mask) & ~mask;
+       alloc_start = round_down(offset, blocksize);
+       alloc_end = round_up(offset + len, blocksize);
  
        /* Make sure we aren't being given some crap mode */
        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
         * Make sure we have enough space before we do the
         * allocation.
         */
-       ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start + 1);
+       ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
        if (ret)
                return ret;
  
                }
                last_byte = min(extent_map_end(em), alloc_end);
                actual_end = min_t(u64, extent_map_end(em), offset + len);
-               last_byte = (last_byte + mask) & ~mask;
+               last_byte = ALIGN(last_byte, blocksize);
  
                if (em->block_start == EXTENT_MAP_HOLE ||
                    (cur_offset >= inode->i_size &&
  out:
        mutex_unlock(&inode->i_mutex);
        /* Let go of our reservation. */
-       btrfs_free_reserved_data_space(inode, alloc_end - alloc_start + 1);
+       btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
        return ret;
  }
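
The hunks above replace open-coded mask arithmetic with round_down()/round_up()/ALIGN(). For a power-of-two sectorsize the two forms are equivalent; a standalone sketch (macros modeled on, not copied from, the kernel's) that checks this:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define ROUND_DOWN(x, bs)	((x) & ~((uint64_t)(bs) - 1))
#define ROUND_UP(x, bs)		ROUND_DOWN((x) + (bs) - 1, (bs))

int main(void)
{
	const uint64_t blocksize = 4096, mask = blocksize - 1;
	uint64_t offset = 6000, len = 10000;

	/* old style: (offset + mask) & ~mask */
	assert(((offset + mask) & ~mask) == ROUND_UP(offset, blocksize));
	/* old style: ((offset + len) & ~mask) */
	assert(((offset + len) & ~mask) == ROUND_DOWN(offset + len, blocksize));

	printf("lockstart=%llu lockend=%llu\n",
	       (unsigned long long)ROUND_UP(offset, blocksize),
	       (unsigned long long)(ROUND_DOWN(offset + len, blocksize) - 1));
	return 0;
}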
  
 -static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)
 +static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
  {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct extent_map *em;
         * before the position we want in case there is outstanding delalloc
         * going on here.
         */
 -      if (origin == SEEK_HOLE && start != 0) {
 +      if (whence == SEEK_HOLE && start != 0) {
                if (start <= root->sectorsize)
                        em = btrfs_get_extent_fiemap(inode, NULL, 0, 0,
                                                     root->sectorsize, 0);
                                }
                        }
  
 -                      if (origin == SEEK_HOLE) {
 +                      if (whence == SEEK_HOLE) {
                                *offset = start;
                                free_extent_map(em);
                                break;
                        }
                } else {
 -                      if (origin == SEEK_DATA) {
 +                      if (whence == SEEK_DATA) {
                                if (em->block_start == EXTENT_MAP_DELALLOC) {
                                        if (start >= inode->i_size) {
                                                free_extent_map(em);
        return ret;
  }
  
 -static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin)
 +static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
  {
        struct inode *inode = file->f_mapping->host;
        int ret;
  
        mutex_lock(&inode->i_mutex);
 -      switch (origin) {
 +      switch (whence) {
        case SEEK_END:
        case SEEK_CUR:
 -              offset = generic_file_llseek(file, offset, origin);
 +              offset = generic_file_llseek(file, offset, whence);
                goto out;
        case SEEK_DATA:
        case SEEK_HOLE:
                        return -ENXIO;
                }
  
 -              ret = find_desired_extent(inode, &offset, origin);
 +              ret = find_desired_extent(inode, &offset, whence);
                if (ret) {
                        mutex_unlock(&inode->i_mutex);
                        return ret;
@@@ -2292,3 -2397,21 +2396,21 @@@ const struct file_operations btrfs_file
        .compat_ioctl   = btrfs_ioctl,
  #endif
  };
+ void btrfs_auto_defrag_exit(void)
+ {
+       if (btrfs_inode_defrag_cachep)
+               kmem_cache_destroy(btrfs_inode_defrag_cachep);
+ }
+ int btrfs_auto_defrag_init(void)
+ {
+       btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
+                                       sizeof(struct inode_defrag), 0,
+                                       SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+                                       NULL);
+       if (!btrfs_inode_defrag_cachep)
+               return -ENOMEM;
+       return 0;
+ }
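
btrfs_auto_defrag_init()/exit() follow the usual pairing: create the named slab cache once at init, return -ENOMEM on failure, and make teardown safe even if init never ran. A userspace sketch of that lifecycle, with a stand-in cache type instead of the slab allocator:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct demo_cache {
	char name[32];
	size_t object_size;
};

static struct demo_cache *inode_defrag_cachep;

static int demo_auto_defrag_init(void)
{
	inode_defrag_cachep = malloc(sizeof(*inode_defrag_cachep));
	if (!inode_defrag_cachep)
		return -ENOMEM;
	snprintf(inode_defrag_cachep->name,
		 sizeof(inode_defrag_cachep->name), "btrfs_inode_defrag");
	inode_defrag_cachep->object_size = 64;	/* stand-in size */
	return 0;
}

static void demo_auto_defrag_exit(void)
{
	/* safe to call even if init failed or never ran */
	free(inode_defrag_cachep);
	inode_defrag_cachep = NULL;
}

int main(void)
{
	if (demo_auto_defrag_init())
		return 1;
	printf("cache %s ready\n", inode_defrag_cachep->name);
	demo_auto_defrag_exit();
	demo_auto_defrag_exit();	/* idempotent, like the NULL check above */
	return 0;
}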
diff --combined fs/btrfs/ioctl.c
@@@ -55,6 -55,7 +55,7 @@@
  #include "backref.h"
  #include "rcu-string.h"
  #include "send.h"
+ #include "dev-replace.h"
  
  /* Mask out flags that are inappropriate for the given type of inode. */
  static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@@ -140,8 -141,11 +141,11 @@@ void btrfs_inherit_iflags(struct inode 
                BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
        }
  
-       if (flags & BTRFS_INODE_NODATACOW)
+       if (flags & BTRFS_INODE_NODATACOW) {
                BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
+               if (S_ISREG(inode->i_mode))
+                       BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
+       }
  
        btrfs_update_iflags(inode);
  }
@@@ -571,8 -575,12 +575,12 @@@ static int create_snapshot(struct btrfs
                ret = btrfs_commit_transaction(trans,
                                               root->fs_info->extent_root);
        }
-       if (ret)
+       if (ret) {
+               /* cleanup_transaction has freed this for us */
+               if (trans->aborted)
+                       pending_snapshot = NULL;
                goto fail;
+       }
  
        ret = pending_snapshot->error;
        if (ret)
@@@ -705,6 -713,16 +713,16 @@@ static noinline int btrfs_mksubvol(stru
        if (error)
                goto out_dput;
  
+       /*
+        * even if this name doesn't exist, we may get hash collisions.
+        * check for them now when we can safely fail
+        */
+       error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root,
+                                              dir->i_ino, name,
+                                              namelen);
+       if (error)
+               goto out_dput;
        down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
  
        if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
@@@ -1225,7 -1243,7 +1243,7 @@@ int btrfs_defrag_file(struct inode *ino
                }
  
                defrag_count += ret;
 -              balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret);
 +              balance_dirty_pages_ratelimited(inode->i_mapping);
                mutex_unlock(&inode->i_mutex);
  
                if (newer_than) {
@@@ -1293,12 -1311,13 +1311,13 @@@ out_ra
        return ret;
  }
  
- static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
+ static noinline int btrfs_ioctl_resize(struct file *file,
                                        void __user *arg)
  {
        u64 new_size;
        u64 old_size;
        u64 devid = 1;
+       struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
        struct btrfs_ioctl_vol_args *vol_args;
        struct btrfs_trans_handle *trans;
        struct btrfs_device *device = NULL;
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
  
-       mutex_lock(&root->fs_info->volume_mutex);
-       if (root->fs_info->balance_ctl) {
-               printk(KERN_INFO "btrfs: balance in progress\n");
-               ret = -EINVAL;
-               goto out;
+       ret = mnt_want_write_file(file);
+       if (ret)
+               return ret;
+       if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+                       1)) {
+               pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
+               return -EINPROGRESS;
        }
  
+       mutex_lock(&root->fs_info->volume_mutex);
        vol_args = memdup_user(arg, sizeof(*vol_args));
        if (IS_ERR(vol_args)) {
                ret = PTR_ERR(vol_args);
                printk(KERN_INFO "btrfs: resizing devid %llu\n",
                       (unsigned long long)devid);
        }
-       device = btrfs_find_device(root, devid, NULL, NULL);
+       device = btrfs_find_device(root->fs_info, devid, NULL, NULL);
        if (!device) {
                printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
                       (unsigned long long)devid);
                }
        }
  
+       if (device->is_tgtdev_for_dev_replace) {
+               ret = -EINVAL;
+               goto out_free;
+       }
        old_size = device->total_bytes;
  
        if (mod < 0) {
                btrfs_commit_transaction(trans, root);
        } else if (new_size < old_size) {
                ret = btrfs_shrink_device(device, new_size);
-       }
+       } /* equal, nothing to do */
  
  out_free:
        kfree(vol_args);
  out:
        mutex_unlock(&root->fs_info->volume_mutex);
+       mnt_drop_write_file(file);
+       atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
        return ret;
  }
  
@@@ -2156,9 -2186,17 +2186,17 @@@ static int btrfs_ioctl_defrag(struct fi
        if (btrfs_root_readonly(root))
                return -EROFS;
  
+       if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+                       1)) {
+               pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
+               return -EINPROGRESS;
+       }
        ret = mnt_want_write_file(file);
-       if (ret)
+       if (ret) {
+               atomic_set(&root->fs_info->mutually_exclusive_operation_running,
+                          0);
                return ret;
+       }
  
        switch (inode->i_mode & S_IFMT) {
        case S_IFDIR:
        }
  out:
        mnt_drop_write_file(file);
+       atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
        return ret;
  }
  
@@@ -2221,13 -2260,13 +2260,13 @@@ static long btrfs_ioctl_add_dev(struct 
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
  
-       mutex_lock(&root->fs_info->volume_mutex);
-       if (root->fs_info->balance_ctl) {
-               printk(KERN_INFO "btrfs: balance in progress\n");
-               ret = -EINVAL;
-               goto out;
+       if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+                       1)) {
+               pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
+               return -EINPROGRESS;
        }
  
+       mutex_lock(&root->fs_info->volume_mutex);
        vol_args = memdup_user(arg, sizeof(*vol_args));
        if (IS_ERR(vol_args)) {
                ret = PTR_ERR(vol_args);
        kfree(vol_args);
  out:
        mutex_unlock(&root->fs_info->volume_mutex);
+       atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
        return ret;
  }
  
- static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
+ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
  {
+       struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
        struct btrfs_ioctl_vol_args *vol_args;
        int ret;
  
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
  
-       if (root->fs_info->sb->s_flags & MS_RDONLY)
-               return -EROFS;
+       ret = mnt_want_write_file(file);
+       if (ret)
+               return ret;
  
-       mutex_lock(&root->fs_info->volume_mutex);
-       if (root->fs_info->balance_ctl) {
-               printk(KERN_INFO "btrfs: balance in progress\n");
-               ret = -EINVAL;
-               goto out;
+       if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+                       1)) {
+               pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
+               mnt_drop_write_file(file);
+               return -EINPROGRESS;
        }
  
+       mutex_lock(&root->fs_info->volume_mutex);
        vol_args = memdup_user(arg, sizeof(*vol_args));
        if (IS_ERR(vol_args)) {
                ret = PTR_ERR(vol_args);
        kfree(vol_args);
  out:
        mutex_unlock(&root->fs_info->volume_mutex);
+       mnt_drop_write_file(file);
+       atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
        return ret;
  }
  
@@@ -2328,7 -2373,7 +2373,7 @@@ static long btrfs_ioctl_dev_info(struc
                s_uuid = di_args->uuid;
  
        mutex_lock(&fs_devices->device_list_mutex);
-       dev = btrfs_find_device(root, di_args->devid, s_uuid, NULL);
+       dev = btrfs_find_device(root->fs_info, di_args->devid, s_uuid, NULL);
        mutex_unlock(&fs_devices->device_list_mutex);
  
        if (!dev) {
@@@ -2821,12 -2866,19 +2866,19 @@@ static long btrfs_ioctl_default_subvol(
        struct btrfs_disk_key disk_key;
        u64 objectid = 0;
        u64 dir_id;
+       int ret;
  
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
  
-       if (copy_from_user(&objectid, argp, sizeof(objectid)))
-               return -EFAULT;
+       ret = mnt_want_write_file(file);
+       if (ret)
+               return ret;
+       if (copy_from_user(&objectid, argp, sizeof(objectid))) {
+               ret = -EFAULT;
+               goto out;
+       }
  
        if (!objectid)
                objectid = root->root_key.objectid;
        location.offset = (u64)-1;
  
        new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
-       if (IS_ERR(new_root))
-               return PTR_ERR(new_root);
+       if (IS_ERR(new_root)) {
+               ret = PTR_ERR(new_root);
+               goto out;
+       }
  
-       if (btrfs_root_refs(&new_root->root_item) == 0)
-               return -ENOENT;
+       if (btrfs_root_refs(&new_root->root_item) == 0) {
+               ret = -ENOENT;
+               goto out;
+       }
  
        path = btrfs_alloc_path();
-       if (!path)
-               return -ENOMEM;
+       if (!path) {
+               ret = -ENOMEM;
+               goto out;
+       }
        path->leave_spinning = 1;
  
        trans = btrfs_start_transaction(root, 1);
        if (IS_ERR(trans)) {
                btrfs_free_path(path);
-               return PTR_ERR(trans);
+               ret = PTR_ERR(trans);
+               goto out;
        }
  
        dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
                btrfs_end_transaction(trans, root);
                printk(KERN_ERR "Umm, you don't have the default dir item, "
                       "this isn't going to work\n");
-               return -ENOENT;
+               ret = -ENOENT;
+               goto out;
        }
  
        btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
  
        btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL);
        btrfs_end_transaction(trans, root);
-       return 0;
+ out:
+       mnt_drop_write_file(file);
+       return ret;
  }
  
  void btrfs_get_block_group_info(struct list_head *groups_list,
@@@ -3036,32 -3097,38 +3097,38 @@@ long btrfs_ioctl_trans_end(struct file 
        return 0;
  }
  
- static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp)
+ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
+                                           void __user *argp)
  {
-       struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
        struct btrfs_trans_handle *trans;
        u64 transid;
        int ret;
  
-       trans = btrfs_start_transaction(root, 0);
-       if (IS_ERR(trans))
-               return PTR_ERR(trans);
+       trans = btrfs_attach_transaction(root);
+       if (IS_ERR(trans)) {
+               if (PTR_ERR(trans) != -ENOENT)
+                       return PTR_ERR(trans);
+               /* No running transaction, don't bother */
+               transid = root->fs_info->last_trans_committed;
+               goto out;
+       }
        transid = trans->transid;
        ret = btrfs_commit_transaction_async(trans, root, 0);
        if (ret) {
                btrfs_end_transaction(trans, root);
                return ret;
        }
+ out:
        if (argp)
                if (copy_to_user(argp, &transid, sizeof(transid)))
                        return -EFAULT;
        return 0;
  }
  
- static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp)
+ static noinline long btrfs_ioctl_wait_sync(struct btrfs_root *root,
+                                          void __user *argp)
  {
-       struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
        u64 transid;
  
        if (argp) {
        return btrfs_wait_for_commit(root, transid);
  }
  
- static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg)
+ static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
  {
-       int ret;
+       struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
        struct btrfs_ioctl_scrub_args *sa;
+       int ret;
  
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
        if (IS_ERR(sa))
                return PTR_ERR(sa);
  
-       ret = btrfs_scrub_dev(root, sa->devid, sa->start, sa->end,
-                             &sa->progress, sa->flags & BTRFS_SCRUB_READONLY);
+       if (!(sa->flags & BTRFS_SCRUB_READONLY)) {
+               ret = mnt_want_write_file(file);
+               if (ret)
+                       goto out;
+       }
+       ret = btrfs_scrub_dev(root->fs_info, sa->devid, sa->start, sa->end,
+                             &sa->progress, sa->flags & BTRFS_SCRUB_READONLY,
+                             0);
  
        if (copy_to_user(arg, sa, sizeof(*sa)))
                ret = -EFAULT;
  
+       if (!(sa->flags & BTRFS_SCRUB_READONLY))
+               mnt_drop_write_file(file);
+ out:
        kfree(sa);
        return ret;
  }
@@@ -3100,7 -3178,7 +3178,7 @@@ static long btrfs_ioctl_scrub_cancel(st
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
  
-       return btrfs_scrub_cancel(root);
+       return btrfs_scrub_cancel(root->fs_info);
  }
  
  static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
@@@ -3149,6 -3227,51 +3227,51 @@@ static long btrfs_ioctl_get_dev_stats(s
        return ret;
  }
  
+ static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg)
+ {
+       struct btrfs_ioctl_dev_replace_args *p;
+       int ret;
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+       p = memdup_user(arg, sizeof(*p));
+       if (IS_ERR(p))
+               return PTR_ERR(p);
+       switch (p->cmd) {
+       case BTRFS_IOCTL_DEV_REPLACE_CMD_START:
+               if (atomic_xchg(
+                       &root->fs_info->mutually_exclusive_operation_running,
+                       1)) {
+                       pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
+                       ret = -EINPROGRESS;
+               } else {
+                       ret = btrfs_dev_replace_start(root, p);
+                       atomic_set(
+                        &root->fs_info->mutually_exclusive_operation_running,
+                        0);
+               }
+               break;
+       case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
+               btrfs_dev_replace_status(root->fs_info, p);
+               ret = 0;
+               break;
+       case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL:
+               ret = btrfs_dev_replace_cancel(root->fs_info, p);
+               break;
+       default:
+               ret = -EINVAL;
+               break;
+       }
+       if (copy_to_user(arg, p, sizeof(*p)))
+               ret = -EFAULT;
+       kfree(p);
+       return ret;
+ }
  static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
  {
        int ret = 0;
@@@ -3315,6 -3438,7 +3438,7 @@@ static long btrfs_ioctl_balance(struct 
        struct btrfs_ioctl_balance_args *bargs;
        struct btrfs_balance_control *bctl;
        int ret;
+       int need_to_clear_lock = 0;
  
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
                bargs = NULL;
        }
  
-       if (fs_info->balance_ctl) {
+       if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+                       1)) {
+               pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
                ret = -EINPROGRESS;
                goto out_bargs;
        }
+       need_to_clear_lock = 1;
  
        bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
        if (!bctl) {
@@@ -3387,6 -3514,9 +3514,9 @@@ do_balance
  out_bargs:
        kfree(bargs);
  out:
+       if (need_to_clear_lock)
+               atomic_set(&root->fs_info->mutually_exclusive_operation_running,
+                          0);
        mutex_unlock(&fs_info->balance_mutex);
        mutex_unlock(&fs_info->volume_mutex);
        mnt_drop_write_file(file);
@@@ -3441,8 -3571,9 +3571,9 @@@ out
        return ret;
  }
  
- static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg)
+ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
  {
+       struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
        struct btrfs_ioctl_quota_ctl_args *sa;
        struct btrfs_trans_handle *trans = NULL;
        int ret;
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
  
-       if (root->fs_info->sb->s_flags & MS_RDONLY)
-               return -EROFS;
+       ret = mnt_want_write_file(file);
+       if (ret)
+               return ret;
  
        sa = memdup_user(arg, sizeof(*sa));
-       if (IS_ERR(sa))
-               return PTR_ERR(sa);
+       if (IS_ERR(sa)) {
+               ret = PTR_ERR(sa);
+               goto drop_write;
+       }
  
        if (sa->cmd != BTRFS_QUOTA_CTL_RESCAN) {
                trans = btrfs_start_transaction(root, 2);
                if (err && !ret)
                        ret = err;
        }
  out:
        kfree(sa);
+ drop_write:
+       mnt_drop_write_file(file);
        return ret;
  }
  
- static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg)
+ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
  {
+       struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
        struct btrfs_ioctl_qgroup_assign_args *sa;
        struct btrfs_trans_handle *trans;
        int ret;
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
  
-       if (root->fs_info->sb->s_flags & MS_RDONLY)
-               return -EROFS;
+       ret = mnt_want_write_file(file);
+       if (ret)
+               return ret;
  
        sa = memdup_user(arg, sizeof(*sa));
-       if (IS_ERR(sa))
-               return PTR_ERR(sa);
+       if (IS_ERR(sa)) {
+               ret = PTR_ERR(sa);
+               goto drop_write;
+       }
  
        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans)) {
  
  out:
        kfree(sa);
+ drop_write:
+       mnt_drop_write_file(file);
        return ret;
  }
  
- static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg)
+ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
  {
+       struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
        struct btrfs_ioctl_qgroup_create_args *sa;
        struct btrfs_trans_handle *trans;
        int ret;
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
  
-       if (root->fs_info->sb->s_flags & MS_RDONLY)
-               return -EROFS;
+       ret = mnt_want_write_file(file);
+       if (ret)
+               return ret;
  
        sa = memdup_user(arg, sizeof(*sa));
-       if (IS_ERR(sa))
-               return PTR_ERR(sa);
+       if (IS_ERR(sa)) {
+               ret = PTR_ERR(sa);
+               goto drop_write;
+       }
  
        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans)) {
  
  out:
        kfree(sa);
+ drop_write:
+       mnt_drop_write_file(file);
        return ret;
  }
  
- static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg)
+ static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
  {
+       struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
        struct btrfs_ioctl_qgroup_limit_args *sa;
        struct btrfs_trans_handle *trans;
        int ret;
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
  
-       if (root->fs_info->sb->s_flags & MS_RDONLY)
-               return -EROFS;
+       ret = mnt_want_write_file(file);
+       if (ret)
+               return ret;
  
        sa = memdup_user(arg, sizeof(*sa));
-       if (IS_ERR(sa))
-               return PTR_ERR(sa);
+       if (IS_ERR(sa)) {
+               ret = PTR_ERR(sa);
+               goto drop_write;
+       }
  
        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans)) {
  
  out:
        kfree(sa);
+ drop_write:
+       mnt_drop_write_file(file);
        return ret;
  }
  
@@@ -3735,11 -3888,11 +3888,11 @@@ long btrfs_ioctl(struct file *file, uns
        case BTRFS_IOC_DEFRAG_RANGE:
                return btrfs_ioctl_defrag(file, argp);
        case BTRFS_IOC_RESIZE:
-               return btrfs_ioctl_resize(root, argp);
+               return btrfs_ioctl_resize(file, argp);
        case BTRFS_IOC_ADD_DEV:
                return btrfs_ioctl_add_dev(root, argp);
        case BTRFS_IOC_RM_DEV:
-               return btrfs_ioctl_rm_dev(root, argp);
+               return btrfs_ioctl_rm_dev(file, argp);
        case BTRFS_IOC_FS_INFO:
                return btrfs_ioctl_fs_info(root, argp);
        case BTRFS_IOC_DEV_INFO:
                btrfs_sync_fs(file->f_dentry->d_sb, 1);
                return 0;
        case BTRFS_IOC_START_SYNC:
-               return btrfs_ioctl_start_sync(file, argp);
+               return btrfs_ioctl_start_sync(root, argp);
        case BTRFS_IOC_WAIT_SYNC:
-               return btrfs_ioctl_wait_sync(file, argp);
+               return btrfs_ioctl_wait_sync(root, argp);
        case BTRFS_IOC_SCRUB:
-               return btrfs_ioctl_scrub(root, argp);
+               return btrfs_ioctl_scrub(file, argp);
        case BTRFS_IOC_SCRUB_CANCEL:
                return btrfs_ioctl_scrub_cancel(root, argp);
        case BTRFS_IOC_SCRUB_PROGRESS:
        case BTRFS_IOC_GET_DEV_STATS:
                return btrfs_ioctl_get_dev_stats(root, argp);
        case BTRFS_IOC_QUOTA_CTL:
-               return btrfs_ioctl_quota_ctl(root, argp);
+               return btrfs_ioctl_quota_ctl(file, argp);
        case BTRFS_IOC_QGROUP_ASSIGN:
-               return btrfs_ioctl_qgroup_assign(root, argp);
+               return btrfs_ioctl_qgroup_assign(file, argp);
        case BTRFS_IOC_QGROUP_CREATE:
-               return btrfs_ioctl_qgroup_create(root, argp);
+               return btrfs_ioctl_qgroup_create(file, argp);
        case BTRFS_IOC_QGROUP_LIMIT:
-               return btrfs_ioctl_qgroup_limit(root, argp);
+               return btrfs_ioctl_qgroup_limit(file, argp);
+       case BTRFS_IOC_DEV_REPLACE:
+               return btrfs_ioctl_dev_replace(root, argp);
        }
  
        return -ENOTTY;
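
The recurring atomic_xchg() on mutually_exclusive_operation_running above is a trylock: exactly one of dev add/delete/balance/replace/resize may run at a time, and later callers fail fast with -EINPROGRESS rather than blocking. A userspace model of the pattern, assuming illustrative names:

#include <errno.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int exclusive_op_running;

static int demo_op(const char *name)
{
	/* returns the previous value: nonzero means someone beat us */
	if (atomic_exchange(&exclusive_op_running, 1)) {
		fprintf(stderr, "%s: operation in progress\n", name);
		return -EINPROGRESS;
	}
	printf("%s: running\n", name);
	/* ... do the work ... */
	atomic_store(&exclusive_op_running, 0);
	return 0;
}

int main(void)
{
	demo_op("resize");	/* acquires and releases the guard */
	atomic_exchange(&exclusive_op_running, 1);	/* simulate a running balance */
	demo_op("dev add");	/* fails fast with -EINPROGRESS */
	return 0;
}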
diff --combined fs/btrfs/ordered-data.h
@@@ -76,7 -76,7 +76,7 @@@ struct btrfs_ordered_sum 
  
  #define BTRFS_ORDERED_IOERR 6 /* We had an io error when writing this out */
  
 -#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates wether this ordered extent
 +#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent
                                       * has done its due diligence in updating
                                       * the isize. */
  
@@@ -128,8 -128,11 +128,11 @@@ struct btrfs_ordered_extent 
        struct list_head root_extent_list;
  
        struct btrfs_work work;
- };
  
+       struct completion completion;
+       struct btrfs_work flush_work;
+       struct list_head work_list;
+ };
  
  /*
   * calculates the total size you need to allocate for an ordered sum
@@@ -186,7 -189,7 +189,7 @@@ struct btrfs_ordered_extent *btrfs_look
  int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
                                struct btrfs_ordered_extent *ordered);
  int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
 -void btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
 +int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
  void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 struct inode *inode);
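
The completion and flush_work fields added to struct btrfs_ordered_extent let the flusher queue one work item per extent and then wait on each extent's completion, so the waits overlap instead of running back to back. A pthreads sketch of the same queue-then-wait shape (pthreads stand in for the kernel workqueue and struct completion):

#include <pthread.h>
#include <stdio.h>

struct demo_ordered_extent {
	pthread_t flush_work;		/* stands in for btrfs_work */
	pthread_mutex_t lock;
	pthread_cond_t done_cond;
	int done;			/* stands in for struct completion */
};

static void *flush_fn(void *arg)
{
	struct demo_ordered_extent *oe = arg;

	/* ... write back and finish the ordered extent ... */
	pthread_mutex_lock(&oe->lock);
	oe->done = 1;			/* complete(&oe->completion) */
	pthread_cond_signal(&oe->done_cond);
	pthread_mutex_unlock(&oe->lock);
	return NULL;
}

int main(void)
{
	struct demo_ordered_extent oe = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.done_cond = PTHREAD_COND_INITIALIZER,
		.done = 0,
	};

	pthread_create(&oe.flush_work, NULL, flush_fn, &oe);	/* queue work */

	pthread_mutex_lock(&oe.lock);	/* wait_for_completion() */
	while (!oe.done)
		pthread_cond_wait(&oe.done_cond, &oe.lock);
	pthread_mutex_unlock(&oe.lock);

	pthread_join(oe.flush_work, NULL);
	printf("ordered extent flushed\n");
	return 0;
}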