Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux...
author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 18 Dec 2012 17:42:05 +0000 (09:42 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 18 Dec 2012 17:42:05 +0000 (09:42 -0800)
Pull btrfs update from Chris Mason:
 "A big set of fixes and features.

  In terms of line count, most of the code comes from Stefan, who added
  the ability to replace a single drive in place.  This is different
  from how btrfs normally replaces drives, and is much, much faster (a
  minimal ioctl sketch follows this message).

  Josef is plowing through our synchronous write performance.  This pull
  request does not include the DIO_OWN_WAITING patch that was discussed
  on the list, but it has a number of other improvements to cut down our
  latencies and CPU time during fsync/O_DIRECT writes.

  Miao Xie has a big series of fixes and is spreading out ordered
  operations over more CPUs.  This improves performance and reduces
  contention.

  I've put in fixes for error handling around hash collisions.  These
  are going back to individual stable kernels as I test against them.

  Otherwise we have a lot of fixes and cleanups, thanks everyone!
  raid5/6 is being rebased against the device replacement code.  I'll
  have it posted this Friday along with a nice series of benchmarks."
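
A minimal userspace sketch of starting such an in-place replace through the
new ioctl added by this series. The struct and command names are those of
the kernel's dev-replace interface (exported in <linux/btrfs.h> on current
kernels); the mount point and device paths are illustrative, and in practice
btrfs-progs' "btrfs replace start" wraps this call:

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/btrfs.h>

    int main(void)
    {
            struct btrfs_ioctl_dev_replace_args args;
            int fd = open("/mnt/btrfs", O_RDONLY);  /* any fd on the fs */

            if (fd < 0)
                    return 1;
            memset(&args, 0, sizeof(args));
            args.cmd = BTRFS_IOCTL_DEV_REPLACE_CMD_START;
            strcpy((char *)args.start.srcdev_name, "/dev/sdb");
            strcpy((char *)args.start.tgtdev_name, "/dev/sdc");
            /* copies all data from sdb onto sdc, in place, while mounted */
            if (ioctl(fd, BTRFS_IOC_DEV_REPLACE, &args) < 0)
                    perror("BTRFS_IOC_DEV_REPLACE");
            else
                    printf("replace started, result=%llu\n",
                           (unsigned long long)args.result);
            return 0;
    }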

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (115 commits)
  Btrfs: fix a bug of per-file nocow
  Btrfs: fix hash overflow handling
  Btrfs: don't take inode delalloc mutex if we're a free space inode
  Btrfs: fix autodefrag and umount lockup
  Btrfs: fix permissions of empty files not affected by umask
  Btrfs: put raid properties into global table
  Btrfs: fix BUG() in scrub when first superblock reading gives EIO
  Btrfs: do not call file_update_time in aio_write
  Btrfs: only unlock and relock if we have to
  Btrfs: use tokens where we can in the tree log
  Btrfs: optimize leaf_space_used
  Btrfs: don't memset new tokens
  Btrfs: only clear dirty on the buffer if it is marked as dirty
  Btrfs: move checks in set_page_dirty under DEBUG
  Btrfs: log changed inodes based on the extent map tree
  Btrfs: add path->really_keep_locks
  Btrfs: do not mark ems as prealloc if we are writing to them
  Btrfs: keep track of the extents original block length
  Btrfs: inline csums if we're fsyncing
  Btrfs: don't bother copying if we're only logging the inode
  ...

fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/extent_map.c
fs/btrfs/file.c
fs/btrfs/ioctl.c
fs/btrfs/ordered-data.h

diff --combined fs/btrfs/disk-io.c
@@@ -45,6 -45,7 +45,7 @@@
  #include "inode-map.h"
  #include "check-integrity.h"
  #include "rcu-string.h"
+ #include "dev-replace.h"
  
  #ifdef CONFIG_X86
  #include <asm/cpufeature.h>
@@@ -387,7 -388,7 +388,7 @@@ static int btree_read_extent_buffer_pag
                if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
                        break;
  
-               num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
+               num_copies = btrfs_num_copies(root->fs_info,
                                              eb->start, eb->len);
                if (num_copies == 1)
                        break;
@@@ -852,11 -853,16 +853,16 @@@ static int __btree_submit_bio_done(stru
                                 int mirror_num, unsigned long bio_flags,
                                 u64 bio_offset)
  {
+       int ret;
        /*
         * when we're called for a write, we're already in the async
         * submission context.  Just jump into btrfs_map_bio
         */
-       return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
+       ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
+       if (ret)
+               bio_endio(bio, ret);
+       return ret;
  }
  
  static int check_async_write(struct inode *inode, unsigned long bio_flags)
@@@ -878,7 -884,6 +884,6 @@@ static int btree_submit_bio_hook(struc
        int ret;
  
        if (!(rw & REQ_WRITE)) {
                /*
                 * called for a read, do the setup so that checksum validation
                 * can happen in the async kernel threads
                 */
                ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
                                          bio, 1);
                if (ret)
-                       return ret;
-               return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
-                                    mirror_num, 0);
+                       goto out_w_error;
+               ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
+                                   mirror_num, 0);
        } else if (!async) {
                ret = btree_csum_one_bio(bio);
                if (ret)
-                       return ret;
-               return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
-                                    mirror_num, 0);
+                       goto out_w_error;
+               ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
+                                   mirror_num, 0);
+       } else {
+               /*
+                * kthread helpers are used to submit writes so that
+                * checksumming can happen in parallel across all CPUs
+                */
+               ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
+                                         inode, rw, bio, mirror_num, 0,
+                                         bio_offset,
+                                         __btree_submit_bio_start,
+                                         __btree_submit_bio_done);
        }
  
-       /*
-        * kthread helpers are used to submit writes so that checksumming
-        * can happen in parallel across all CPUs
-        */
-       return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
-                                  inode, rw, bio, mirror_num, 0,
-                                  bio_offset,
-                                  __btree_submit_bio_start,
-                                  __btree_submit_bio_done);
+       if (ret) {
+ out_w_error:
+               bio_endio(bio, ret);
+       }
+       return ret;
  }
  
  #ifdef CONFIG_MIGRATION
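
The restructured btree_submit_bio_hook() above now completes the bio itself
when submission fails, instead of only returning an error that the async
path may never see. A minimal sketch of the pattern (the wrapper name is
hypothetical; btrfs_map_bio() and the two-argument bio_endio() of this era
are real):

    static int my_submit_hook(struct btrfs_root *root, int rw,
                              struct bio *bio, int mirror_num)
    {
            int ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);

            /*
             * nobody else will complete this bio once mapping has
             * failed, so end it here with the error instead of
             * leaking a stuck bio
             */
            if (ret)
                    bio_endio(bio, ret);
            return ret;
    }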
@@@ -990,6 -1001,7 +1001,7 @@@ static void btree_invalidatepage(struc
  
  static int btree_set_page_dirty(struct page *page)
  {
+ #ifdef DEBUG
        struct extent_buffer *eb;
  
        BUG_ON(!PagePrivate(page));
        eb = (struct extent_buffer *)page->private;
        BUG_ON(!eb);
        BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
        BUG_ON(!atomic_read(&eb->refs));
        btrfs_assert_tree_locked(eb);
+ #endif
        return __set_page_dirty_nobuffers(page);
  }
  
@@@ -1129,11 -1142,11 +1142,11 @@@ void clean_tree_block(struct btrfs_tran
                                          root->fs_info->dirty_metadata_bytes);
                        }
                        spin_unlock(&root->fs_info->delalloc_lock);
-               }
  
-               /* ugh, clear_extent_buffer_dirty needs to lock the page */
-               btrfs_set_lock_blocking(buf);
-               clear_extent_buffer_dirty(buf);
+                       /* ugh, clear_extent_buffer_dirty needs to lock the page */
+                       btrfs_set_lock_blocking(buf);
+                       clear_extent_buffer_dirty(buf);
+               }
        }
  }
  
@@@ -1193,7 -1206,7 +1206,7 @@@ static void __setup_root(u32 nodesize, 
        root->root_key.objectid = objectid;
        root->anon_dev = 0;
  
-       spin_lock_init(&root->root_times_lock);
+       spin_lock_init(&root->root_item_lock);
  }
  
  static int __must_check find_and_setup_root(struct btrfs_root *tree_root,
@@@ -2131,6 -2144,11 +2144,11 @@@ int open_ctree(struct super_block *sb
        init_rwsem(&fs_info->extent_commit_sem);
        init_rwsem(&fs_info->cleanup_work_sem);
        init_rwsem(&fs_info->subvol_sem);
+       fs_info->dev_replace.lock_owner = 0;
+       atomic_set(&fs_info->dev_replace.nesting_level, 0);
+       mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
+       mutex_init(&fs_info->dev_replace.lock_management_lock);
+       mutex_init(&fs_info->dev_replace.lock);
  
        spin_lock_init(&fs_info->qgroup_lock);
        fs_info->qgroup_tree = RB_ROOT;
                           fs_info->thread_pool_size,
                           &fs_info->generic_worker);
  
+       btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc",
+                          fs_info->thread_pool_size,
+                          &fs_info->generic_worker);
        btrfs_init_workers(&fs_info->submit_workers, "submit",
                           min_t(u64, fs_devices->num_devices,
                           fs_info->thread_pool_size),
        ret |= btrfs_start_workers(&fs_info->delayed_workers);
        ret |= btrfs_start_workers(&fs_info->caching_workers);
        ret |= btrfs_start_workers(&fs_info->readahead_workers);
+       ret |= btrfs_start_workers(&fs_info->flush_workers);
        if (ret) {
                err = -ENOMEM;
                goto fail_sb_buffer;
                goto fail_tree_roots;
        }
  
-       btrfs_close_extra_devices(fs_devices);
+       /*
+        * keep the device that is marked to be the target device for the
+        * dev_replace procedure
+        */
+       btrfs_close_extra_devices(fs_info, fs_devices, 0);
  
        if (!fs_devices->latest_bdev) {
                printk(KERN_CRIT "btrfs: failed to read devices on %s\n",
@@@ -2490,6 -2517,14 +2517,14 @@@ retry_root_backup
                goto fail_block_groups;
        }
  
+       ret = btrfs_init_dev_replace(fs_info);
+       if (ret) {
+               pr_err("btrfs: failed to init dev_replace: %d\n", ret);
+               goto fail_block_groups;
+       }
+       btrfs_close_extra_devices(fs_info, fs_devices, 1);
        ret = btrfs_init_space_info(fs_info);
        if (ret) {
                printk(KERN_ERR "Failed to initial space info: %d\n", ret);
        }
        fs_info->num_tolerated_disk_barrier_failures =
                btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
+       if (fs_info->fs_devices->missing_devices >
+            fs_info->num_tolerated_disk_barrier_failures &&
+           !(sb->s_flags & MS_RDONLY)) {
+               printk(KERN_WARNING
+                      "Btrfs: too many missing devices, writeable mount is not allowed\n");
+               goto fail_block_groups;
+       }
  
        fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
                                               "btrfs-cleaner");
                return ret;
        }
  
+       ret = btrfs_resume_dev_replace_async(fs_info);
+       if (ret) {
+               pr_warn("btrfs: failed to resume dev_replace\n");
+               close_ctree(tree_root);
+               return ret;
+       }
        return 0;
  
  fail_qgroup:
@@@ -2667,6 -2716,7 +2716,7 @@@ fail_sb_buffer
        btrfs_stop_workers(&fs_info->submit_workers);
        btrfs_stop_workers(&fs_info->delayed_workers);
        btrfs_stop_workers(&fs_info->caching_workers);
+       btrfs_stop_workers(&fs_info->flush_workers);
  fail_alloc:
  fail_iput:
        btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@@ -3270,16 -3320,18 +3320,18 @@@ int close_ctree(struct btrfs_root *root
        smp_mb();
  
        /* pause restriper - we want to resume on mount */
-       btrfs_pause_balance(root->fs_info);
+       btrfs_pause_balance(fs_info);
  
-       btrfs_scrub_cancel(root);
+       btrfs_dev_replace_suspend_for_unmount(fs_info);
+       btrfs_scrub_cancel(fs_info);
  
        /* wait for any defraggers to finish */
        wait_event(fs_info->transaction_wait,
                   (atomic_read(&fs_info->defrag_running) == 0));
  
        /* clear out the rbtree of defraggable inodes */
-       btrfs_run_defrag_inodes(fs_info);
+       btrfs_cleanup_defrag_inodes(fs_info);
  
        if (!(fs_info->sb->s_flags & MS_RDONLY)) {
                ret = btrfs_commit_super(root);
        btrfs_stop_workers(&fs_info->delayed_workers);
        btrfs_stop_workers(&fs_info->caching_workers);
        btrfs_stop_workers(&fs_info->readahead_workers);
+       btrfs_stop_workers(&fs_info->flush_workers);
  
  #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
        if (btrfs_test_opt(root, CHECK_INTEGRITY))
@@@ -3383,14 -3436,12 +3436,12 @@@ void btrfs_mark_buffer_dirty(struct ext
        int was_dirty;
  
        btrfs_assert_tree_locked(buf);
-       if (transid != root->fs_info->generation) {
-               printk(KERN_CRIT "btrfs transid mismatch buffer %llu, "
+       if (transid != root->fs_info->generation)
+               WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, "
                       "found %llu running %llu\n",
                        (unsigned long long)buf->start,
                        (unsigned long long)transid,
                        (unsigned long long)root->fs_info->generation);
-               WARN_ON(1);
-       }
        was_dirty = set_extent_buffer_dirty(buf);
        if (!was_dirty) {
                spin_lock(&root->fs_info->delalloc_lock);
        }
  }
  
- void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
+ static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
+                                       int flush_delayed)
  {
        /*
         * looks as though older kernels can get into trouble with
        if (current->flags & PF_MEMALLOC)
                return;
  
-       btrfs_balance_delayed_items(root);
+       if (flush_delayed)
+               btrfs_balance_delayed_items(root);
  
        num_dirty = root->fs_info->dirty_metadata_bytes;
  
        if (num_dirty > thresh) {
 -              balance_dirty_pages_ratelimited_nr(
 -                                 root->fs_info->btree_inode->i_mapping, 1);
 +              balance_dirty_pages_ratelimited(
 +                                 root->fs_info->btree_inode->i_mapping);
        }
        return;
  }
  
- void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
+ void btrfs_btree_balance_dirty(struct btrfs_root *root)
  {
-       /*
-        * looks as though older kernels can get into trouble with
-        * this code, they end up stuck in balance_dirty_pages forever
-        */
-       u64 num_dirty;
-       unsigned long thresh = 32 * 1024 * 1024;
-       if (current->flags & PF_MEMALLOC)
-               return;
-       num_dirty = root->fs_info->dirty_metadata_bytes;
+       __btrfs_btree_balance_dirty(root, 1);
+ }
  
-       if (num_dirty > thresh) {
-               balance_dirty_pages_ratelimited(
-                                  root->fs_info->btree_inode->i_mapping);
-       }
-       return;
+ void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root)
+ {
+       __btrfs_btree_balance_dirty(root, 0);
  }
  
  int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
diff --combined fs/btrfs/extent-tree.c
@@@ -33,6 -33,7 +33,7 @@@
  #include "volumes.h"
  #include "locking.h"
  #include "free-space-cache.h"
+ #include "math.h"
  
  #undef SCRAMBLE_DELAYED_REFS
  
@@@ -649,24 -650,6 +650,6 @@@ void btrfs_clear_space_info_full(struc
        rcu_read_unlock();
  }
  
- static u64 div_factor(u64 num, int factor)
- {
-       if (factor == 10)
-               return num;
-       num *= factor;
-       do_div(num, 10);
-       return num;
- }
- static u64 div_factor_fine(u64 num, int factor)
- {
-       if (factor == 100)
-               return num;
-       num *= factor;
-       do_div(num, 100);
-       return num;
- }
  u64 btrfs_find_block_group(struct btrfs_root *root,
                           u64 search_start, u64 search_hint, int owner)
  {
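
The div_factor helpers removed above move into the new "math.h" header
included at the top of this file. A sketch of that header, reconstructed
from the removed bodies (the guard name and exact layout are assumptions):

    /* fs/btrfs/math.h */
    #ifndef __BTRFS_MATH_H
    #define __BTRFS_MATH_H

    #include <asm/div64.h>

    static inline u64 div_factor(u64 num, int factor)
    {
            if (factor == 10)
                    return num;
            num *= factor;
            do_div(num, 10);        /* scale by factor/10 */
            return num;
    }

    static inline u64 div_factor_fine(u64 num, int factor)
    {
            if (factor == 100)
                    return num;
            num *= factor;
            do_div(num, 100);       /* scale by factor/100 */
            return num;
    }

    #endif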
@@@ -1835,7 -1818,7 +1818,7 @@@ static int btrfs_discard_extent(struct 
  
  
        /* Tell the block device(s) that the sectors can be discarded */
-       ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
+       ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
                              bytenr, &num_bytes, &bbio, 0);
        /* Error condition is -ENOMEM */
        if (!ret) {
@@@ -2314,6 -2297,9 +2297,9 @@@ static noinline int run_clustered_refs(
                                kfree(extent_op);
  
                                if (ret) {
+                                       list_del_init(&locked_ref->cluster);
+                                       mutex_unlock(&locked_ref->mutex);
                                        printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret);
                                        spin_lock(&delayed_refs->lock);
                                        return ret;
                count++;
  
                if (ret) {
+                       if (locked_ref) {
+                               list_del_init(&locked_ref->cluster);
+                               mutex_unlock(&locked_ref->mutex);
+                       }
                        printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret);
                        spin_lock(&delayed_refs->lock);
                        return ret;
@@@ -3661,7 -3651,7 +3651,7 @@@ out
  
  static int can_overcommit(struct btrfs_root *root,
                          struct btrfs_space_info *space_info, u64 bytes,
-                         int flush)
+                         enum btrfs_reserve_flush_enum flush)
  {
        u64 profile = btrfs_get_alloc_profile(root, 0);
        u64 avail;
                avail >>= 1;
  
        /*
-        * If we aren't flushing don't let us overcommit too much, say
-        * 1/8th of the space.  If we can flush, let it overcommit up to
-        * 1/2 of the space.
+        * If we aren't flushing all things, let us overcommit up to
+        * 1/2 of the space. If we can flush, don't let us overcommit
+        * too much, let it overcommit up to 1/8 of the space.
         */
-       if (flush)
+       if (flush == BTRFS_RESERVE_FLUSH_ALL)
                avail >>= 3;
        else
                avail >>= 1;
        return 0;
  }
  
+ static int writeback_inodes_sb_nr_if_idle_safe(struct super_block *sb,
+                                              unsigned long nr_pages,
+                                              enum wb_reason reason)
+ {
+       if (!writeback_in_progress(sb->s_bdi) &&
+           down_read_trylock(&sb->s_umount)) {
+               writeback_inodes_sb_nr(sb, nr_pages, reason);
+               up_read(&sb->s_umount);
+               return 1;
+       }
+       return 0;
+ }
  /*
   * shrink metadata reservation for delalloc
   */
@@@ -3713,6 -3717,7 +3717,7 @@@ static void shrink_delalloc(struct btrf
        long time_left;
        unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
        int loops = 0;
+       enum btrfs_reserve_flush_enum flush;
  
        trans = (struct btrfs_trans_handle *)current->journal_info;
        block_rsv = &root->fs_info->delalloc_block_rsv;
        while (delalloc_bytes && loops < 3) {
                max_reclaim = min(delalloc_bytes, to_reclaim);
                nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
-               writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
-                                              WB_REASON_FS_FREE_SPACE);
+               writeback_inodes_sb_nr_if_idle_safe(root->fs_info->sb,
+                                                   nr_pages,
+                                                   WB_REASON_FS_FREE_SPACE);
  
                /*
                 * We need to wait for the async pages to actually start before
                wait_event(root->fs_info->async_submit_wait,
                           !atomic_read(&root->fs_info->async_delalloc_pages));
  
+               if (!trans)
+                       flush = BTRFS_RESERVE_FLUSH_ALL;
+               else
+                       flush = BTRFS_RESERVE_NO_FLUSH;
                spin_lock(&space_info->lock);
-               if (can_overcommit(root, space_info, orig, !trans)) {
+               if (can_overcommit(root, space_info, orig, flush)) {
                        spin_unlock(&space_info->lock);
                        break;
                }
@@@ -3888,7 -3898,7 +3898,7 @@@ static int flush_space(struct btrfs_roo
   * @root - the root we're allocating for
   * @block_rsv - the block_rsv we're allocating for
   * @orig_bytes - the number of bytes we want
 - * @flush - wether or not we can flush to make our reservation
 + * @flush - whether or not we can flush to make our reservation
   *
   * This will reserve orig_bytes number of bytes from the space info associated
   * with the block_rsv.  If there is not enough space it will make an attempt to
   */
  static int reserve_metadata_bytes(struct btrfs_root *root,
                                  struct btrfs_block_rsv *block_rsv,
-                                 u64 orig_bytes, int flush)
+                                 u64 orig_bytes,
+                                 enum btrfs_reserve_flush_enum flush)
  {
        struct btrfs_space_info *space_info = block_rsv->space_info;
        u64 used;
@@@ -3912,10 -3923,11 +3923,11 @@@ again
        ret = 0;
        spin_lock(&space_info->lock);
        /*
-        * We only want to wait if somebody other than us is flushing and we are
-        * actually alloed to flush.
+        * We only want to wait if somebody other than us is flushing and we
+        * are actually allowed to flush all things.
         */
-       while (flush && !flushing && space_info->flush) {
+       while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing &&
+              space_info->flush) {
                spin_unlock(&space_info->lock);
                /*
                 * If we have a trans handle we can't wait because the flusher
         * Couldn't make our reservation, save our place so while we're trying
         * to reclaim space we can actually use it instead of somebody else
         * stealing it from us.
+        *
+        * We make the other tasks wait for the flush only when we can flush
+        * all things.
         */
-       if (ret && flush) {
+       if (ret && flush == BTRFS_RESERVE_FLUSH_ALL) {
                flushing = true;
                space_info->flush = 1;
        }
  
        spin_unlock(&space_info->lock);
  
-       if (!ret || !flush)
+       if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
                goto out;
  
        ret = flush_space(root, space_info, num_bytes, orig_bytes,
                          flush_state);
        flush_state++;
+       /*
+        * If we are FLUSH_LIMIT, we cannot flush delalloc, or a deadlock
+        * could happen, so skip the delalloc flush.
+        */
+       if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
+           (flush_state == FLUSH_DELALLOC ||
+            flush_state == FLUSH_DELALLOC_WAIT))
+               flush_state = ALLOC_CHUNK;
        if (!ret)
                goto again;
-       else if (flush_state <= COMMIT_TRANS)
+       else if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
+                flush_state < COMMIT_TRANS)
+               goto again;
+       else if (flush == BTRFS_RESERVE_FLUSH_ALL &&
+                flush_state <= COMMIT_TRANS)
                goto again;
  
  out:
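
The three flush levels threaded through this hunk come from a new enum; a
sketch of the declaration as the code above assumes it (the comments and
exact ctree.h placement are assumptions):

    enum btrfs_reserve_flush_enum {
            /* don't flush anything to satisfy the reservation */
            BTRFS_RESERVE_NO_FLUSH,
            /* flush, but skip the delalloc states that could deadlock */
            BTRFS_RESERVE_FLUSH_LIMIT,
            /* allow every flush state up to and including COMMIT_TRANS */
            BTRFS_RESERVE_FLUSH_ALL,
    };

As a worked example of the can_overcommit() change earlier in this file:
with avail = 8GiB of unallocated space, BTRFS_RESERVE_FLUSH_ALL permits
overcommitting by avail >> 3 = 1GiB, while the weaker levels permit
avail >> 1 = 4GiB, on the theory that a caller who cannot flush needs more
slack up front.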
@@@ -4148,9 -4177,9 +4177,9 @@@ void btrfs_free_block_rsv(struct btrfs_
        kfree(rsv);
  }
  
- static inline int __block_rsv_add(struct btrfs_root *root,
-                                 struct btrfs_block_rsv *block_rsv,
-                                 u64 num_bytes, int flush)
+ int btrfs_block_rsv_add(struct btrfs_root *root,
+                       struct btrfs_block_rsv *block_rsv, u64 num_bytes,
+                       enum btrfs_reserve_flush_enum flush)
  {
        int ret;
  
        return ret;
  }
  
- int btrfs_block_rsv_add(struct btrfs_root *root,
-                       struct btrfs_block_rsv *block_rsv,
-                       u64 num_bytes)
- {
-       return __block_rsv_add(root, block_rsv, num_bytes, 1);
- }
- int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
-                               struct btrfs_block_rsv *block_rsv,
-                               u64 num_bytes)
- {
-       return __block_rsv_add(root, block_rsv, num_bytes, 0);
- }
  int btrfs_block_rsv_check(struct btrfs_root *root,
                          struct btrfs_block_rsv *block_rsv, int min_factor)
  {
        return ret;
  }
  
- static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
-                                          struct btrfs_block_rsv *block_rsv,
-                                          u64 min_reserved, int flush)
+ int btrfs_block_rsv_refill(struct btrfs_root *root,
+                          struct btrfs_block_rsv *block_rsv, u64 min_reserved,
+                          enum btrfs_reserve_flush_enum flush)
  {
        u64 num_bytes = 0;
        int ret = -ENOSPC;
        return ret;
  }
  
- int btrfs_block_rsv_refill(struct btrfs_root *root,
-                          struct btrfs_block_rsv *block_rsv,
-                          u64 min_reserved)
- {
-       return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1);
- }
- int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
-                                  struct btrfs_block_rsv *block_rsv,
-                                  u64 min_reserved)
- {
-       return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0);
- }
  int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
                            struct btrfs_block_rsv *dst_rsv,
                            u64 num_bytes)
@@@ -4532,17 -4533,27 +4533,27 @@@ int btrfs_delalloc_reserve_metadata(str
        u64 csum_bytes;
        unsigned nr_extents = 0;
        int extra_reserve = 0;
-       int flush = 1;
+       enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
        int ret;
+       bool delalloc_lock = true;
  
-       /* Need to be holding the i_mutex here if we aren't free space cache */
-       if (btrfs_is_free_space_inode(inode))
-               flush = 0;
+       /* If we are a free space inode we need to not flush since we will be in
+        * the middle of a transaction commit.  We also don't need the delalloc
+        * mutex since we won't race with anybody.  We need this mostly to make
+        * lockdep shut its filthy mouth.
+        */
+       if (btrfs_is_free_space_inode(inode)) {
+               flush = BTRFS_RESERVE_NO_FLUSH;
+               delalloc_lock = false;
+       }
  
-       if (flush && btrfs_transaction_in_commit(root->fs_info))
+       if (flush != BTRFS_RESERVE_NO_FLUSH &&
+           btrfs_transaction_in_commit(root->fs_info))
                schedule_timeout(1);
  
-       mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
+       if (delalloc_lock)
+               mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
        num_bytes = ALIGN(num_bytes, root->sectorsize);
  
        spin_lock(&BTRFS_I(inode)->lock);
                ret = btrfs_qgroup_reserve(root, num_bytes +
                                           nr_extents * root->leafsize);
                if (ret) {
-                       mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
+                       spin_lock(&BTRFS_I(inode)->lock);
+                       calc_csum_metadata_size(inode, num_bytes, 0);
+                       spin_unlock(&BTRFS_I(inode)->lock);
+                       if (delalloc_lock)
+                               mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
                        return ret;
                }
        }
                                                      btrfs_ino(inode),
                                                      to_free, 0);
                }
-               mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
+               if (root->fs_info->quota_enabled) {
+                       btrfs_qgroup_free(root, num_bytes +
+                                               nr_extents * root->leafsize);
+               }
+               if (delalloc_lock)
+                       mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
                return ret;
        }
  
        }
        BTRFS_I(inode)->reserved_extents += nr_extents;
        spin_unlock(&BTRFS_I(inode)->lock);
-       mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
+       if (delalloc_lock)
+               mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
  
        if (to_reserve)
                trace_btrfs_space_reservation(root->fs_info,"delalloc",
@@@ -4969,9 -4991,13 +4991,13 @@@ static int unpin_extent_range(struct bt
  {
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_block_group_cache *cache = NULL;
+       struct btrfs_space_info *space_info;
+       struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
        u64 len;
+       bool readonly;
  
        while (start <= end) {
+               readonly = false;
                if (!cache ||
                    start >= cache->key.objectid + cache->key.offset) {
                        if (cache)
                }
  
                start += len;
+               space_info = cache->space_info;
  
-               spin_lock(&cache->space_info->lock);
+               spin_lock(&space_info->lock);
                spin_lock(&cache->lock);
                cache->pinned -= len;
-               cache->space_info->bytes_pinned -= len;
-               if (cache->ro)
-                       cache->space_info->bytes_readonly += len;
+               space_info->bytes_pinned -= len;
+               if (cache->ro) {
+                       space_info->bytes_readonly += len;
+                       readonly = true;
+               }
                spin_unlock(&cache->lock);
-               spin_unlock(&cache->space_info->lock);
+               if (!readonly && global_rsv->space_info == space_info) {
+                       spin_lock(&global_rsv->lock);
+                       if (!global_rsv->full) {
+                               len = min(len, global_rsv->size -
+                                         global_rsv->reserved);
+                               global_rsv->reserved += len;
+                               space_info->bytes_may_use += len;
+                               if (global_rsv->reserved >= global_rsv->size)
+                                       global_rsv->full = 1;
+                       }
+                       spin_unlock(&global_rsv->lock);
+               }
+               spin_unlock(&space_info->lock);
        }
  
        if (cache)
@@@ -5466,7 -5507,7 +5507,7 @@@ wait_block_group_cache_done(struct btrf
        return 0;
  }
  
- static int __get_block_group_index(u64 flags)
+ int __get_raid_index(u64 flags)
  {
        int index;
  
  
  static int get_block_group_index(struct btrfs_block_group_cache *cache)
  {
-       return __get_block_group_index(cache->flags);
+       return __get_raid_index(cache->flags);
  }
  
  enum btrfs_loop_type {
@@@ -6269,7 -6310,8 +6310,8 @@@ use_block_rsv(struct btrfs_trans_handl
        block_rsv = get_block_rsv(trans, root);
  
        if (block_rsv->size == 0) {
-               ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
+               ret = reserve_metadata_bytes(root, block_rsv, blocksize,
+                                            BTRFS_RESERVE_NO_FLUSH);
                /*
                 * If we couldn't reserve metadata bytes try and use some from
                 * the global reserve.
                static DEFINE_RATELIMIT_STATE(_rs,
                                DEFAULT_RATELIMIT_INTERVAL,
                                /*DEFAULT_RATELIMIT_BURST*/ 2);
-               if (__ratelimit(&_rs)) {
-                       printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret);
-                       WARN_ON(1);
-               }
-               ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
+               if (__ratelimit(&_rs))
+                       WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n",
+                            ret);
+               ret = reserve_metadata_bytes(root, block_rsv, blocksize,
+                                            BTRFS_RESERVE_NO_FLUSH);
                if (!ret) {
                        return block_rsv;
                } else if (ret && block_rsv != global_rsv) {
@@@ -7427,7 -7469,7 +7469,7 @@@ int btrfs_can_relocate(struct btrfs_roo
         */
        target = get_restripe_target(root->fs_info, block_group->flags);
        if (target) {
-               index = __get_block_group_index(extended_to_chunk(target));
+               index = __get_raid_index(extended_to_chunk(target));
        } else {
                /*
                 * this is just a balance, so if we were marked as full
                 * check to make sure we can actually find a chunk with enough
                 * space to fit our block group in.
                 */
-               if (device->total_bytes > device->bytes_used + min_free) {
+               if (device->total_bytes > device->bytes_used + min_free &&
+                   !device->is_tgtdev_for_dev_replace) {
                        ret = find_free_dev_extent(device, min_free,
                                                   &dev_offset, NULL);
                        if (!ret)
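
The global-reserve refill added to unpin_extent_range() earlier in this file
can be sanity-checked with a small standalone computation (sizes
hypothetical, in MiB):

    #include <stdio.h>

    int main(void)
    {
            unsigned long long size = 512;      /* global_rsv->size */
            unsigned long long reserved = 400;  /* global_rsv->reserved */
            unsigned long long len = 200;       /* extent just unpinned */

            if (reserved < size) {
                    /* mirrors: len = min(len, global_rsv->size -
                     *                    global_rsv->reserved);       */
                    unsigned long long take =
                            len < size - reserved ? len : size - reserved;
                    reserved += take;   /* 512: the reserve is now full */
                    /* prints "refilled 112 MiB, 88 MiB become free space" */
                    printf("refilled %llu MiB, %llu MiB become free space\n",
                           take, len - take);
            }
            return 0;
    }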
diff --combined fs/btrfs/extent_map.c
@@@ -49,7 -49,7 +49,7 @@@ void extent_map_tree_init(struct extent
  struct extent_map *alloc_extent_map(void)
  {
        struct extent_map *em;
-       em = kmem_cache_alloc(extent_map_cache, GFP_NOFS);
+       em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);
        if (!em)
                return NULL;
        em->in_tree = 0;
@@@ -198,16 -198,15 +198,15 @@@ static void try_merge_map(struct extent
                        merge = rb_entry(rb, struct extent_map, rb_node);
                if (rb && mergable_maps(merge, em)) {
                        em->start = merge->start;
+                       em->orig_start = merge->orig_start;
                        em->len += merge->len;
                        em->block_len += merge->block_len;
                        em->block_start = merge->block_start;
                        merge->in_tree = 0;
-                       if (merge->generation > em->generation) {
-                               em->mod_start = em->start;
-                               em->mod_len = em->len;
-                               em->generation = merge->generation;
-                               list_move(&em->list, &tree->modified_extents);
-                       }
+                       em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
+                       em->mod_start = merge->mod_start;
+                       em->generation = max(em->generation, merge->generation);
+                       list_move(&em->list, &tree->modified_extents);
  
                        list_del_init(&merge->list);
                        rb_erase(&merge->rb_node, &tree->map);
                em->block_len += merge->len;
                rb_erase(&merge->rb_node, &tree->map);
                merge->in_tree = 0;
-               if (merge->generation > em->generation) {
-                       em->mod_len = em->len;
-                       em->generation = merge->generation;
-                       list_move(&em->list, &tree->modified_extents);
-               }
+               em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
+               em->generation = max(em->generation, merge->generation);
                list_del_init(&merge->list);
                free_extent_map(merge);
        }
  }
  
  /**
 - * unpint_extent_cache - unpin an extent from the cache
 + * unpin_extent_cache - unpin an extent from the cache
   * @tree:     tree to unpin the extent in
   * @start:    logical offset in the file
   * @len:      length of the extent
   * @gen:      generation that this extent has been modified in
 - * @prealloc: if this is set we need to clear the prealloc flag
   *
   * Called after an extent has been written to disk properly.  Set the generation
   * to the generation that actually added the file item to the inode so we know
@@@ -265,9 -262,9 +261,9 @@@ int unpin_extent_cache(struct extent_ma
        em->mod_start = em->start;
        em->mod_len = em->len;
  
-       if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+       if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) {
                prealloc = true;
-               clear_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+               clear_bit(EXTENT_FLAG_FILLING, &em->flags);
        }
  
        try_merge_map(tree, em);
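
The mod_start/mod_len bookkeeping that replaces the generation check in
try_merge_map() above can be checked with a small standalone program
(offsets hypothetical):

    #include <stdio.h>

    /* stand-in for the extent map's mod_start/mod_len pair */
    struct range { unsigned long long start, len; };

    /* mirrors: em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
     *          em->mod_start = merge->mod_start;                            */
    static struct range fold_front(struct range em, struct range merge)
    {
            struct range out;

            out.len = (em.len + em.start) - merge.start;
            out.start = merge.start;
            return out;
    }

    int main(void)
    {
            struct range em = { 8192, 4096 };       /* [8192, 12288) */
            struct range merge = { 4096, 4096 };    /* [4096,  8192) */
            struct range out = fold_front(em, merge);

            /* prints "merged: [4096, 12288)" - the union of both ranges */
            printf("merged: [%llu, %llu)\n", out.start, out.start + out.len);
            return 0;
    }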
diff --combined fs/btrfs/file.c
@@@ -41,6 -41,7 +41,7 @@@
  #include "compat.h"
  #include "volumes.h"
  
+ static struct kmem_cache *btrfs_inode_defrag_cachep;
  /*
   * when auto defrag is enabled we
   * queue up these defrag structs to remember which
@@@ -90,7 -91,7 +91,7 @@@ static int __compare_inode_defrag(struc
   * If an existing record is found the defrag item you
   * pass in is freed
   */
- static void __btrfs_add_inode_defrag(struct inode *inode,
+ static int __btrfs_add_inode_defrag(struct inode *inode,
                                    struct inode_defrag *defrag)
  {
        struct btrfs_root *root = BTRFS_I(inode)->root;
                                entry->transid = defrag->transid;
                        if (defrag->last_offset > entry->last_offset)
                                entry->last_offset = defrag->last_offset;
-                       goto exists;
+                       return -EEXIST;
                }
        }
        set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
        rb_link_node(&defrag->rb_node, parent, p);
        rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
-       return;
+       return 0;
+ }
  
- exists:
-       kfree(defrag);
-       return;
+ static inline int __need_auto_defrag(struct btrfs_root *root)
+ {
+       if (!btrfs_test_opt(root, AUTO_DEFRAG))
+               return 0;
+       if (btrfs_fs_closing(root->fs_info))
+               return 0;
  
+       return 1;
  }
  
  /*
@@@ -142,11 -149,9 +149,9 @@@ int btrfs_add_inode_defrag(struct btrfs
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct inode_defrag *defrag;
        u64 transid;
+       int ret;
  
-       if (!btrfs_test_opt(root, AUTO_DEFRAG))
-               return 0;
-       if (btrfs_fs_closing(root->fs_info))
+       if (!__need_auto_defrag(root))
                return 0;
  
        if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
        else
                transid = BTRFS_I(inode)->root->last_trans;
  
-       defrag = kzalloc(sizeof(*defrag), GFP_NOFS);
+       defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
        if (!defrag)
                return -ENOMEM;
  
        defrag->root = root->root_key.objectid;
  
        spin_lock(&root->fs_info->defrag_inodes_lock);
-       if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
-               __btrfs_add_inode_defrag(inode, defrag);
-       else
-               kfree(defrag);
+       if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) {
+               /*
+                * If we set IN_DEFRAG flag and evict the inode from memory,
+                * and then re-read this inode, this new inode doesn't have
+                * IN_DEFRAG flag. In that case, we may find the existing defrag.
+                */
+               ret = __btrfs_add_inode_defrag(inode, defrag);
+               if (ret)
+                       kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+       } else {
+               kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+       }
        spin_unlock(&root->fs_info->defrag_inodes_lock);
        return 0;
  }
  
  /*
-  * must be called with the defrag_inodes lock held
+  * Requeue the defrag object. If there is a defrag object that points to
+  * the same inode in the tree, we will merge them together (by
+  * __btrfs_add_inode_defrag()) and free the one that we want to requeue.
   */
- struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
-                                            u64 root, u64 ino,
-                                            struct rb_node **next)
+ void btrfs_requeue_inode_defrag(struct inode *inode,
+                               struct inode_defrag *defrag)
+ {
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       int ret;
+       if (!__need_auto_defrag(root))
+               goto out;
+       /*
+        * Here we don't check the IN_DEFRAG flag, because we need merge
+        * them together.
+        */
+       spin_lock(&root->fs_info->defrag_inodes_lock);
+       ret = __btrfs_add_inode_defrag(inode, defrag);
+       spin_unlock(&root->fs_info->defrag_inodes_lock);
+       if (ret)
+               goto out;
+       return;
+ out:
+       kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+ }
+ /*
+  * pick the defraggable inode that we want; if it doesn't exist, we will get
+  * the next one.
+  */
+ static struct inode_defrag *
+ btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino)
  {
        struct inode_defrag *entry = NULL;
        struct inode_defrag tmp;
        tmp.ino = ino;
        tmp.root = root;
  
-       p = info->defrag_inodes.rb_node;
+       spin_lock(&fs_info->defrag_inodes_lock);
+       p = fs_info->defrag_inodes.rb_node;
        while (p) {
                parent = p;
                entry = rb_entry(parent, struct inode_defrag, rb_node);
                else if (ret > 0)
                        p = parent->rb_right;
                else
-                       return entry;
+                       goto out;
        }
  
-       if (next) {
-               while (parent && __compare_inode_defrag(&tmp, entry) > 0) {
-                       parent = rb_next(parent);
+       if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
+               parent = rb_next(parent);
+               if (parent)
                        entry = rb_entry(parent, struct inode_defrag, rb_node);
-               }
-               *next = parent;
+               else
+                       entry = NULL;
        }
-       return NULL;
+ out:
+       if (entry)
+               rb_erase(parent, &fs_info->defrag_inodes);
+       spin_unlock(&fs_info->defrag_inodes_lock);
+       return entry;
  }
  
- /*
-  * run through the list of inodes in the FS that need
-  * defragging
-  */
- int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
+ void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
  {
        struct inode_defrag *defrag;
+       struct rb_node *node;
+       spin_lock(&fs_info->defrag_inodes_lock);
+       node = rb_first(&fs_info->defrag_inodes);
+       while (node) {
+               rb_erase(node, &fs_info->defrag_inodes);
+               defrag = rb_entry(node, struct inode_defrag, rb_node);
+               kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+               if (need_resched()) {
+                       spin_unlock(&fs_info->defrag_inodes_lock);
+                       cond_resched();
+                       spin_lock(&fs_info->defrag_inodes_lock);
+               }
+               node = rb_first(&fs_info->defrag_inodes);
+       }
+       spin_unlock(&fs_info->defrag_inodes_lock);
+ }
+ #define BTRFS_DEFRAG_BATCH    1024
+ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
+                                   struct inode_defrag *defrag)
+ {
        struct btrfs_root *inode_root;
        struct inode *inode;
-       struct rb_node *n;
        struct btrfs_key key;
        struct btrfs_ioctl_defrag_range_args range;
-       u64 first_ino = 0;
-       u64 root_objectid = 0;
        int num_defrag;
-       int defrag_batch = 1024;
  
+       /* get the inode */
+       key.objectid = defrag->root;
+       btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+       key.offset = (u64)-1;
+       inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
+       if (IS_ERR(inode_root)) {
+               kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+               return PTR_ERR(inode_root);
+       }
+       key.objectid = defrag->ino;
+       btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+       key.offset = 0;
+       inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
+       if (IS_ERR(inode)) {
+               kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+               return PTR_ERR(inode);
+       }
+       /* do a chunk of defrag */
+       clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
        memset(&range, 0, sizeof(range));
        range.len = (u64)-1;
+       range.start = defrag->last_offset;
+       sb_start_write(fs_info->sb);
+       num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
+                                      BTRFS_DEFRAG_BATCH);
+       sb_end_write(fs_info->sb);
+       /*
+        * if we filled the whole defrag batch, there
+        * must be more work to do.  Queue this defrag
+        * again
+        */
+       if (num_defrag == BTRFS_DEFRAG_BATCH) {
+               defrag->last_offset = range.start;
+               btrfs_requeue_inode_defrag(inode, defrag);
+       } else if (defrag->last_offset && !defrag->cycled) {
+               /*
+                * we didn't fill our defrag batch, but
+                * we didn't start at zero.  Make sure we loop
+                * around to the start of the file.
+                */
+               defrag->last_offset = 0;
+               defrag->cycled = 1;
+               btrfs_requeue_inode_defrag(inode, defrag);
+       } else {
+               kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+       }
+       iput(inode);
+       return 0;
+ }
+ /*
+  * run through the list of inodes in the FS that need
+  * defragging
+  */
+ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
+ {
+       struct inode_defrag *defrag;
+       u64 first_ino = 0;
+       u64 root_objectid = 0;
  
        atomic_inc(&fs_info->defrag_running);
-       spin_lock(&fs_info->defrag_inodes_lock);
        while(1) {
-               n = NULL;
+               if (!__need_auto_defrag(fs_info->tree_root))
+                       break;
  
                /* find an inode to defrag */
-               defrag = btrfs_find_defrag_inode(fs_info, root_objectid,
-                                                first_ino, &n);
+               defrag = btrfs_pick_defrag_inode(fs_info, root_objectid,
+                                                first_ino);
                if (!defrag) {
-                       if (n) {
-                               defrag = rb_entry(n, struct inode_defrag,
-                                                 rb_node);
-                       } else if (root_objectid || first_ino) {
+                       if (root_objectid || first_ino) {
                                root_objectid = 0;
                                first_ino = 0;
                                continue;
                        }
                }
  
-               /* remove it from the rbtree */
                first_ino = defrag->ino + 1;
                root_objectid = defrag->root;
-               rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);
-               if (btrfs_fs_closing(fs_info))
-                       goto next_free;
-               spin_unlock(&fs_info->defrag_inodes_lock);
-               /* get the inode */
-               key.objectid = defrag->root;
-               btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
-               key.offset = (u64)-1;
-               inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
-               if (IS_ERR(inode_root))
-                       goto next;
-               key.objectid = defrag->ino;
-               btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
-               key.offset = 0;
-               inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
-               if (IS_ERR(inode))
-                       goto next;
  
-               /* do a chunk of defrag */
-               clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
-               range.start = defrag->last_offset;
-               num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
-                                              defrag_batch);
-               /*
-                * if we filled the whole defrag batch, there
-                * must be more work to do.  Queue this defrag
-                * again
-                */
-               if (num_defrag == defrag_batch) {
-                       defrag->last_offset = range.start;
-                       __btrfs_add_inode_defrag(inode, defrag);
-                       /*
-                        * we don't want to kfree defrag, we added it back to
-                        * the rbtree
-                        */
-                       defrag = NULL;
-               } else if (defrag->last_offset && !defrag->cycled) {
-                       /*
-                        * we didn't fill our defrag batch, but
-                        * we didn't start at zero.  Make sure we loop
-                        * around to the start of the file.
-                        */
-                       defrag->last_offset = 0;
-                       defrag->cycled = 1;
-                       __btrfs_add_inode_defrag(inode, defrag);
-                       defrag = NULL;
-               }
-               iput(inode);
- next:
-               spin_lock(&fs_info->defrag_inodes_lock);
- next_free:
-               kfree(defrag);
+               __btrfs_run_defrag_inode(fs_info, defrag);
        }
-       spin_unlock(&fs_info->defrag_inodes_lock);
        atomic_dec(&fs_info->defrag_running);
  
        /*
@@@ -526,6 -588,8 +588,8 @@@ void btrfs_drop_extent_cache(struct ino
                                split->block_len = em->block_len;
                        else
                                split->block_len = split->len;
+                       split->orig_block_len = max(split->block_len,
+                                                   em->orig_block_len);
                        split->generation = gen;
                        split->bdev = em->bdev;
                        split->flags = flags;
                        split->flags = flags;
                        split->compress_type = em->compress_type;
                        split->generation = gen;
+                       split->orig_block_len = max(em->block_len,
+                                                   em->orig_block_len);
  
                        if (compressed) {
                                split->block_len = em->block_len;
                        } else {
                                split->block_len = split->len;
                                split->block_start = em->block_start + diff;
-                               split->orig_start = split->start;
+                               split->orig_start = em->orig_start;
                        }
  
                        ret = add_extent_mapping(em_tree, split);
@@@ -1346,9 -1412,10 +1412,9 @@@ static noinline ssize_t __btrfs_buffere
  
                cond_resched();
  
 -              balance_dirty_pages_ratelimited_nr(inode->i_mapping,
 -                                                 dirty_pages);
 +              balance_dirty_pages_ratelimited(inode->i_mapping);
                if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
-                       btrfs_btree_balance_dirty(root, 1);
+                       btrfs_btree_balance_dirty(root);
  
                pos += copied;
                num_written += copied;
        return written ? written : err;
  }
  
+ static void update_time_for_write(struct inode *inode)
+ {
+       struct timespec now;
+       if (IS_NOCMTIME(inode))
+               return;
+       now = current_fs_time(inode->i_sb);
+       if (!timespec_equal(&inode->i_mtime, &now))
+               inode->i_mtime = now;
+       if (!timespec_equal(&inode->i_ctime, &now))
+               inode->i_ctime = now;
+       if (IS_I_VERSION(inode))
+               inode_inc_iversion(inode);
+ }
  static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
                                    const struct iovec *iov,
                                    unsigned long nr_segs, loff_t pos)
        ssize_t num_written = 0;
        ssize_t err = 0;
        size_t count, ocount;
+       bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
  
        sb_start_write(inode->i_sb);
  
                goto out;
        }
  
-       err = file_update_time(file);
-       if (err) {
-               mutex_unlock(&inode->i_mutex);
-               goto out;
-       }
+       /*
+        * We reserve space for updating the inode when we reserve space for the
+        * extent we are going to write, so we will enospc out there.  We don't
+        * need to start yet another transaction to update the inode as we will
+        * update the inode when we finish writing whatever data we write.
+        */
+       update_time_for_write(inode);
  
        start_pos = round_down(pos, root->sectorsize);
        if (start_pos > i_size_read(inode)) {
                }
        }
  
+       if (sync)
+               atomic_inc(&BTRFS_I(inode)->sync_writers);
        if (unlikely(file->f_flags & O_DIRECT)) {
                num_written = __btrfs_direct_write(iocb, iov, nr_segs,
                                                   pos, ppos, count, ocount);
         * this will either be one more than the running transaction
         * or the generation used for the next transaction if there isn't
         * one running right now.
+        *
+        * We also have to set last_sub_trans to the current log transid,
+        * otherwise subsequent syncs to a file that's been synced in this
+        * transaction will appear to have already occurred.
         */
        BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
+       BTRFS_I(inode)->last_sub_trans = root->log_transid;
        if (num_written > 0 || num_written == -EIOCBQUEUED) {
                err = generic_write_sync(file, pos, num_written);
                if (err < 0 && num_written > 0)
                        num_written = err;
        }
  out:
+       if (sync)
+               atomic_dec(&BTRFS_I(inode)->sync_writers);
        sb_end_write(inode->i_sb);
        current->backing_dev_info = NULL;
        return num_written ? num_written : err;
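
The sync_writers counter incremented around the write path above lets fsync
tell whether other synchronous writers are in flight (it feeds the "inline
csums if we're fsyncing" commit in the shortlog). A minimal sketch of the
pattern, with hypothetical names standing in for
BTRFS_I(inode)->sync_writers:

    #include <linux/atomic.h>
    #include <linux/types.h>

    struct my_inode_info {
            atomic_t sync_writers;  /* writers in O_DSYNC/IS_SYNC paths */
    };

    static void my_sync_write(struct my_inode_info *ci)
    {
            atomic_inc(&ci->sync_writers);
            /* ... submit data and checksum work ... */
            atomic_dec(&ci->sync_writers);
    }

    /* fsync-side check: with sync writers active, csum inline rather
     * than deferring to the async helper threads */
    static bool my_should_inline_csums(struct my_inode_info *ci)
    {
            return atomic_read(&ci->sync_writers) > 0;
    }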
@@@ -1550,7 -1648,9 +1647,9 @@@ int btrfs_sync_file(struct file *file, 
         * out of the ->i_mutex. If so, we can flush the dirty pages by
         * multi-task, and make the performance up.
         */
+       atomic_inc(&BTRFS_I(inode)->sync_writers);
        ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+       atomic_dec(&BTRFS_I(inode)->sync_writers);
        if (ret)
                return ret;
  
         * range being left.
         */
        atomic_inc(&root->log_batch);
-       btrfs_wait_ordered_range(inode, start, end);
+       btrfs_wait_ordered_range(inode, start, end - start + 1);
        atomic_inc(&root->log_batch);
  
        /*
@@@ -1767,6 -1867,7 +1866,7 @@@ out
  
                hole_em->block_start = EXTENT_MAP_HOLE;
                hole_em->block_len = 0;
+               hole_em->orig_block_len = 0;
                hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
                hole_em->compress_type = BTRFS_COMPRESS_NONE;
                hole_em->generation = trans->transid;
@@@ -1796,48 -1897,51 +1896,51 @@@ static int btrfs_punch_hole(struct inod
        struct btrfs_path *path;
        struct btrfs_block_rsv *rsv;
        struct btrfs_trans_handle *trans;
-       u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
-       u64 lockstart = (offset + mask) & ~mask;
-       u64 lockend = ((offset + len) & ~mask) - 1;
+       u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
+       u64 lockend = round_down(offset + len,
+                                BTRFS_I(inode)->root->sectorsize) - 1;
        u64 cur_offset = lockstart;
        u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
        u64 drop_end;
-       unsigned long nr;
        int ret = 0;
        int err = 0;
-       bool same_page = (offset >> PAGE_CACHE_SHIFT) ==
-               ((offset + len) >> PAGE_CACHE_SHIFT);
+       bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
+                         ((offset + len - 1) >> PAGE_CACHE_SHIFT));
  
        btrfs_wait_ordered_range(inode, offset, len);
  
        mutex_lock(&inode->i_mutex);
-       if (offset >= inode->i_size) {
-               mutex_unlock(&inode->i_mutex);
-               return 0;
-       }
+       /*
+        * We needn't truncate any page which is beyond the end of the file
+        * because we are sure there is no data there.
+        */
        /*
         * Only do this if we are in the same page and we aren't doing the
         * entire page.
         */
        if (same_page && len < PAGE_CACHE_SIZE) {
-               ret = btrfs_truncate_page(inode, offset, len, 0);
+               if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE))
+                       ret = btrfs_truncate_page(inode, offset, len, 0);
                mutex_unlock(&inode->i_mutex);
                return ret;
        }
  
        /* zero back part of the first page */
-       ret = btrfs_truncate_page(inode, offset, 0, 0);
-       if (ret) {
-               mutex_unlock(&inode->i_mutex);
-               return ret;
+       if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
+               ret = btrfs_truncate_page(inode, offset, 0, 0);
+               if (ret) {
+                       mutex_unlock(&inode->i_mutex);
+                       return ret;
+               }
        }
  
        /* zero the front end of the last page */
-       ret = btrfs_truncate_page(inode, offset + len, 0, 1);
-       if (ret) {
-               mutex_unlock(&inode->i_mutex);
-               return ret;
+       if (offset + len < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
+               ret = btrfs_truncate_page(inode, offset + len, 0, 1);
+               if (ret) {
+                       mutex_unlock(&inode->i_mutex);
+                       return ret;
+               }
        }
  
        if (lockend < lockstart) {
                        break;
                }
  
-               nr = trans->blocks_used;
                btrfs_end_transaction(trans, root);
-               btrfs_btree_balance_dirty(root, nr);
+               btrfs_btree_balance_dirty(root);
  
                trans = btrfs_start_transaction(root, 3);
                if (IS_ERR(trans)) {
@@@ -1963,11 -2066,13 +2065,13 @@@ out_trans
        if (!trans)
                goto out_free;
  
+       inode_inc_iversion(inode);
+       inode->i_mtime = inode->i_ctime = CURRENT_TIME;
        trans->block_rsv = &root->fs_info->trans_block_rsv;
        ret = btrfs_update_inode(trans, root, inode);
-       nr = trans->blocks_used;
        btrfs_end_transaction(trans, root);
-       btrfs_btree_balance_dirty(root, nr);
+       btrfs_btree_balance_dirty(root);
  out_free:
        btrfs_free_path(path);
        btrfs_free_block_rsv(root, rsv);
@@@ -1991,12 -2096,12 +2095,12 @@@ static long btrfs_fallocate(struct fil
        u64 alloc_end;
        u64 alloc_hint = 0;
        u64 locked_end;
-       u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
        struct extent_map *em;
+       int blocksize = BTRFS_I(inode)->root->sectorsize;
        int ret;
  
-       alloc_start = offset & ~mask;
-       alloc_end =  (offset + len + mask) & ~mask;
+       alloc_start = round_down(offset, blocksize);
+       alloc_end = round_up(offset + len, blocksize);
  
        /* Make sure we aren't being given some crap mode */
        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
         * Make sure we have enough space before we do the
         * allocation.
         */
-       ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start + 1);
+       ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
        if (ret)
                return ret;
  
                }
                last_byte = min(extent_map_end(em), alloc_end);
                actual_end = min_t(u64, extent_map_end(em), offset + len);
-               last_byte = (last_byte + mask) & ~mask;
+               last_byte = ALIGN(last_byte, blocksize);
  
                if (em->block_start == EXTENT_MAP_HOLE ||
                    (cur_offset >= inode->i_size &&
  out:
        mutex_unlock(&inode->i_mutex);
        /* Let go of our reservation. */
-       btrfs_free_reserved_data_space(inode, alloc_end - alloc_start + 1);
+       btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
        return ret;
  }
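
The hunks above replace open-coded mask arithmetic with round_down()/round_up()/ALIGN(). For a power-of-two sectorsize the two forms are equivalent; a standalone sketch (macros modeled on, not copied from, the kernel's) that checks this:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define ROUND_DOWN(x, bs)	((x) & ~((uint64_t)(bs) - 1))
#define ROUND_UP(x, bs)		ROUND_DOWN((x) + (bs) - 1, (bs))

int main(void)
{
	const uint64_t blocksize = 4096, mask = blocksize - 1;
	uint64_t offset = 6000, len = 10000;

	/* old style: (offset + mask) & ~mask */
	assert(((offset + mask) & ~mask) == ROUND_UP(offset, blocksize));
	/* old style: ((offset + len) & ~mask) */
	assert(((offset + len) & ~mask) == ROUND_DOWN(offset + len, blocksize));

	printf("lockstart=%llu lockend=%llu\n",
	       (unsigned long long)ROUND_UP(offset, blocksize),
	       (unsigned long long)(ROUND_DOWN(offset + len, blocksize) - 1));
	return 0;
}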
  
 -static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)
 +static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
  {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct extent_map *em;
         * before the position we want in case there is outstanding delalloc
         * going on here.
         */
 -      if (origin == SEEK_HOLE && start != 0) {
 +      if (whence == SEEK_HOLE && start != 0) {
                if (start <= root->sectorsize)
                        em = btrfs_get_extent_fiemap(inode, NULL, 0, 0,
                                                     root->sectorsize, 0);
                                }
                        }
  
 -                      if (origin == SEEK_HOLE) {
 +                      if (whence == SEEK_HOLE) {
                                *offset = start;
                                free_extent_map(em);
                                break;
                        }
                } else {
 -                      if (origin == SEEK_DATA) {
 +                      if (whence == SEEK_DATA) {
                                if (em->block_start == EXTENT_MAP_DELALLOC) {
                                        if (start >= inode->i_size) {
                                                free_extent_map(em);
        return ret;
  }
  
 -static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin)
 +static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
  {
        struct inode *inode = file->f_mapping->host;
        int ret;
  
        mutex_lock(&inode->i_mutex);
 -      switch (origin) {
 +      switch (whence) {
        case SEEK_END:
        case SEEK_CUR:
 -              offset = generic_file_llseek(file, offset, origin);
 +              offset = generic_file_llseek(file, offset, whence);
                goto out;
        case SEEK_DATA:
        case SEEK_HOLE:
                        return -ENXIO;
                }
  
 -              ret = find_desired_extent(inode, &offset, origin);
 +              ret = find_desired_extent(inode, &offset, whence);
                if (ret) {
                        mutex_unlock(&inode->i_mutex);
                        return ret;
@@@ -2292,3 -2397,21 +2396,21 @@@ const struct file_operations btrfs_file
        .compat_ioctl   = btrfs_ioctl,
  #endif
  };
+ void btrfs_auto_defrag_exit(void)
+ {
+       if (btrfs_inode_defrag_cachep)
+               kmem_cache_destroy(btrfs_inode_defrag_cachep);
+ }
+ int btrfs_auto_defrag_init(void)
+ {
+       btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
+                                       sizeof(struct inode_defrag), 0,
+                                       SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+                                       NULL);
+       if (!btrfs_inode_defrag_cachep)
+               return -ENOMEM;
+       return 0;
+ }
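
btrfs_auto_defrag_init()/exit() follow the usual pairing: create the named slab cache once at init, return -ENOMEM on failure, and make teardown safe even if init never ran. A userspace sketch of that lifecycle, with a stand-in cache type instead of the slab allocator:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct demo_cache {
	char name[32];
	size_t object_size;
};

static struct demo_cache *inode_defrag_cachep;

static int demo_auto_defrag_init(void)
{
	inode_defrag_cachep = malloc(sizeof(*inode_defrag_cachep));
	if (!inode_defrag_cachep)
		return -ENOMEM;
	snprintf(inode_defrag_cachep->name,
		 sizeof(inode_defrag_cachep->name), "btrfs_inode_defrag");
	inode_defrag_cachep->object_size = 64;	/* stand-in size */
	return 0;
}

static void demo_auto_defrag_exit(void)
{
	/* safe to call even if init failed or never ran */
	free(inode_defrag_cachep);
	inode_defrag_cachep = NULL;
}

int main(void)
{
	if (demo_auto_defrag_init())
		return 1;
	printf("cache %s ready\n", inode_defrag_cachep->name);
	demo_auto_defrag_exit();
	demo_auto_defrag_exit();	/* idempotent, like the NULL check above */
	return 0;
}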
diff --combined fs/btrfs/ioctl.c
@@@ -55,6 -55,7 +55,7 @@@
  #include "backref.h"
  #include "rcu-string.h"
  #include "send.h"
+ #include "dev-replace.h"
  
  /* Mask out flags that are inappropriate for the given type of inode. */
  static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@@ -140,8 -141,11 +141,11 @@@ void btrfs_inherit_iflags(struct inode 
                BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
        }
  
-       if (flags & BTRFS_INODE_NODATACOW)
+       if (flags & BTRFS_INODE_NODATACOW) {
                BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
+               if (S_ISREG(inode->i_mode))
+                       BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
+       }
  
        btrfs_update_iflags(inode);
  }
@@@ -571,8 -575,12 +575,12 @@@ static int create_snapshot(struct btrfs
                ret = btrfs_commit_transaction(trans,
                                               root->fs_info->extent_root);
        }
-       if (ret)
+       if (ret) {
+               /* cleanup_transaction has freed this for us */
+               if (trans->aborted)
+                       pending_snapshot = NULL;
                goto fail;
+       }
  
        ret = pending_snapshot->error;
        if (ret)
@@@ -705,6 -713,16 +713,16 @@@ static noinline int btrfs_mksubvol(stru
        if (error)
                goto out_dput;
  
+       /*
+        * even if this name doesn't exist, we may get hash collisions.
+        * check for them now when we can safely fail
+        */
+       error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root,
+                                              dir->i_ino, name,
+                                              namelen);
+       if (error)
+               goto out_dput;
        down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
  
        if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
@@@ -1225,7 -1243,7 +1243,7 @@@ int btrfs_defrag_file(struct inode *ino
                }
  
                defrag_count += ret;
 -              balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret);
 +              balance_dirty_pages_ratelimited(inode->i_mapping);
                mutex_unlock(&inode->i_mutex);
  
                if (newer_than) {
@@@ -1293,12 -1311,13 +1311,13 @@@ out_ra
        return ret;
  }
  
- static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
+ static noinline int btrfs_ioctl_resize(struct file *file,
                                        void __user *arg)
  {
        u64 new_size;
        u64 old_size;
        u64 devid = 1;
+       struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
        struct btrfs_ioctl_vol_args *vol_args;
        struct btrfs_trans_handle *trans;
        struct btrfs_device *device = NULL;
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
  
-       mutex_lock(&root->fs_info->volume_mutex);
-       if (root->fs_info->balance_ctl) {
-               printk(KERN_INFO "btrfs: balance in progress\n");
-               ret = -EINVAL;
-               goto out;
+       ret = mnt_want_write_file(file);
+       if (ret)
+               return ret;
+       if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+                       1)) {
+               pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
+               return -EINPROGRESS;
        }
  
+       mutex_lock(&root->fs_info->volume_mutex);
        vol_args = memdup_user(arg, sizeof(*vol_args));
        if (IS_ERR(vol_args)) {
                ret = PTR_ERR(vol_args);
                printk(KERN_INFO "btrfs: resizing devid %llu\n",
                       (unsigned long long)devid);
        }
-       device = btrfs_find_device(root, devid, NULL, NULL);
+       device = btrfs_find_device(root->fs_info, devid, NULL, NULL);
        if (!device) {
                printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
                       (unsigned long long)devid);
                }
        }
  
+       if (device->is_tgtdev_for_dev_replace) {
+               ret = -EINVAL;
+               goto out_free;
+       }
        old_size = device->total_bytes;
  
        if (mod < 0) {
                btrfs_commit_transaction(trans, root);
        } else if (new_size < old_size) {
                ret = btrfs_shrink_device(device, new_size);
-       }
+       } /* equal, nothing to do */
  
  out_free:
        kfree(vol_args);
  out:
        mutex_unlock(&root->fs_info->volume_mutex);
+       mnt_drop_write_file(file);
+       atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
        return ret;
  }
  
@@@ -2156,9 -2186,17 +2186,17 @@@ static int btrfs_ioctl_defrag(struct fi
        if (btrfs_root_readonly(root))
                return -EROFS;
  
+       if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+                       1)) {
+               pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
+               return -EINPROGRESS;
+       }
        ret = mnt_want_write_file(file);
-       if (ret)
+       if (ret) {
+               atomic_set(&root->fs_info->mutually_exclusive_operation_running,
+                          0);
                return ret;
+       }
  
        switch (inode->i_mode & S_IFMT) {
        case S_IFDIR:
        }
  out:
        mnt_drop_write_file(file);
+       atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
        return ret;
  }
  
@@@ -2221,13 -2260,13 +2260,13 @@@ static long btrfs_ioctl_add_dev(struct 
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
  
-       mutex_lock(&root->fs_info->volume_mutex);
-       if (root->fs_info->balance_ctl) {
-               printk(KERN_INFO "btrfs: balance in progress\n");
-               ret = -EINVAL;
-               goto out;
+       if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+                       1)) {
+               pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
+               return -EINPROGRESS;
        }
  
+       mutex_lock(&root->fs_info->volume_mutex);
        vol_args = memdup_user(arg, sizeof(*vol_args));
        if (IS_ERR(vol_args)) {
                ret = PTR_ERR(vol_args);
        kfree(vol_args);
  out:
        mutex_unlock(&root->fs_info->volume_mutex);
+       atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
        return ret;
  }
  
- static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
+ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
  {
+       struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
        struct btrfs_ioctl_vol_args *vol_args;
        int ret;
  
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
  
-       if (root->fs_info->sb->s_flags & MS_RDONLY)
-               return -EROFS;
+       ret = mnt_want_write_file(file);
+       if (ret)
+               return ret;
  
-       mutex_lock(&root->fs_info->volume_mutex);
-       if (root->fs_info->balance_ctl) {
-               printk(KERN_INFO "btrfs: balance in progress\n");
-               ret = -EINVAL;
-               goto out;
+       if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+                       1)) {
+               pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
+               mnt_drop_write_file(file);
+               return -EINPROGRESS;
        }
  
+       mutex_lock(&root->fs_info->volume_mutex);
        vol_args = memdup_user(arg, sizeof(*vol_args));
        if (IS_ERR(vol_args)) {
                ret = PTR_ERR(vol_args);
        kfree(vol_args);
  out:
        mutex_unlock(&root->fs_info->volume_mutex);
+       mnt_drop_write_file(file);
+       atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
        return ret;
  }
  
@@@ -2328,7 -2373,7 +2373,7 @@@ static long btrfs_ioctl_dev_info(struc
                s_uuid = di_args->uuid;
  
        mutex_lock(&fs_devices->device_list_mutex);
-       dev = btrfs_find_device(root, di_args->devid, s_uuid, NULL);
+       dev = btrfs_find_device(root->fs_info, di_args->devid, s_uuid, NULL);
        mutex_unlock(&fs_devices->device_list_mutex);
  
        if (!dev) {
@@@ -2821,12 -2866,19 +2866,19 @@@ static long btrfs_ioctl_default_subvol(
        struct btrfs_disk_key disk_key;
        u64 objectid = 0;
        u64 dir_id;
+       int ret;
  
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
  
-       if (copy_from_user(&objectid, argp, sizeof(objectid)))
-               return -EFAULT;
+       ret = mnt_want_write_file(file);
+       if (ret)
+               return ret;
+       if (copy_from_user(&objectid, argp, sizeof(objectid))) {
+               ret = -EFAULT;
+               goto out;
+       }
  
        if (!objectid)
                objectid = root->root_key.objectid;
        location.offset = (u64)-1;
  
        new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
-       if (IS_ERR(new_root))
-               return PTR_ERR(new_root);
+       if (IS_ERR(new_root)) {
+               ret = PTR_ERR(new_root);
+               goto out;
+       }
  
-       if (btrfs_root_refs(&new_root->root_item) == 0)
-               return -ENOENT;
+       if (btrfs_root_refs(&new_root->root_item) == 0) {
+               ret = -ENOENT;
+               goto out;
+       }
  
        path = btrfs_alloc_path();
-       if (!path)
-               return -ENOMEM;
+       if (!path) {
+               ret = -ENOMEM;
+               goto out;
+       }
        path->leave_spinning = 1;
  
        trans = btrfs_start_transaction(root, 1);
        if (IS_ERR(trans)) {
                btrfs_free_path(path);
-               return PTR_ERR(trans);
+               ret = PTR_ERR(trans);
+               goto out;
        }
  
        dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
                btrfs_end_transaction(trans, root);
                printk(KERN_ERR "Umm, you don't have the default dir item, "
                       "this isn't going to work\n");
-               return -ENOENT;
+               ret = -ENOENT;
+               goto out;
        }
  
        btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
  
        btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL);
        btrfs_end_transaction(trans, root);
-       return 0;
+ out:
+       mnt_drop_write_file(file);
+       return ret;
  }
  
  void btrfs_get_block_group_info(struct list_head *groups_list,
@@@ -3036,32 -3097,38 +3097,38 @@@ long btrfs_ioctl_trans_end(struct file 
        return 0;
  }
  
- static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp)
+ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
+                                           void __user *argp)
  {
-       struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
        struct btrfs_trans_handle *trans;
        u64 transid;
        int ret;
  
-       trans = btrfs_start_transaction(root, 0);
-       if (IS_ERR(trans))
-               return PTR_ERR(trans);
+       trans = btrfs_attach_transaction(root);
+       if (IS_ERR(trans)) {
+               if (PTR_ERR(trans) != -ENOENT)
+                       return PTR_ERR(trans);
+               /* No running transaction, don't bother */
+               transid = root->fs_info->last_trans_committed;
+               goto out;
+       }
        transid = trans->transid;
        ret = btrfs_commit_transaction_async(trans, root, 0);
        if (ret) {
                btrfs_end_transaction(trans, root);
                return ret;
        }
+ out:
        if (argp)
                if (copy_to_user(argp, &transid, sizeof(transid)))
                        return -EFAULT;
        return 0;
  }
  
- static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp)
+ static noinline long btrfs_ioctl_wait_sync(struct btrfs_root *root,
+                                          void __user *argp)
  {
-       struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
        u64 transid;
  
        if (argp) {
        return btrfs_wait_for_commit(root, transid);
  }
  
- static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg)
+ static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
  {
-       int ret;
+       struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
        struct btrfs_ioctl_scrub_args *sa;
+       int ret;
  
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
        if (IS_ERR(sa))
                return PTR_ERR(sa);
  
-       ret = btrfs_scrub_dev(root, sa->devid, sa->start, sa->end,
-                             &sa->progress, sa->flags & BTRFS_SCRUB_READONLY);
+       if (!(sa->flags & BTRFS_SCRUB_READONLY)) {
+               ret = mnt_want_write_file(file);
+               if (ret)
+                       goto out;
+       }
+       ret = btrfs_scrub_dev(root->fs_info, sa->devid, sa->start, sa->end,
+                             &sa->progress, sa->flags & BTRFS_SCRUB_READONLY,
+                             0);
  
        if (copy_to_user(arg, sa, sizeof(*sa)))
                ret = -EFAULT;
  
+       if (!(sa->flags & BTRFS_SCRUB_READONLY))
+               mnt_drop_write_file(file);
+ out:
        kfree(sa);
        return ret;
  }
@@@ -3100,7 -3178,7 +3178,7 @@@ static long btrfs_ioctl_scrub_cancel(st
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
  
-       return btrfs_scrub_cancel(root);
+       return btrfs_scrub_cancel(root->fs_info);
  }
  
  static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
@@@ -3149,6 -3227,51 +3227,51 @@@ static long btrfs_ioctl_get_dev_stats(s
        return ret;
  }
  
+ static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg)
+ {
+       struct btrfs_ioctl_dev_replace_args *p;
+       int ret;
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+       p = memdup_user(arg, sizeof(*p));
+       if (IS_ERR(p))
+               return PTR_ERR(p);
+       switch (p->cmd) {
+       case BTRFS_IOCTL_DEV_REPLACE_CMD_START:
+               if (atomic_xchg(
+                       &root->fs_info->mutually_exclusive_operation_running,
+                       1)) {
+                       pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
+                       ret = -EINPROGRESS;
+               } else {
+                       ret = btrfs_dev_replace_start(root, p);
+                       atomic_set(
+                        &root->fs_info->mutually_exclusive_operation_running,
+                        0);
+               }
+               break;
+       case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
+               btrfs_dev_replace_status(root->fs_info, p);
+               ret = 0;
+               break;
+       case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL:
+               ret = btrfs_dev_replace_cancel(root->fs_info, p);
+               break;
+       default:
+               ret = -EINVAL;
+               break;
+       }
+       if (copy_to_user(arg, p, sizeof(*p)))
+               ret = -EFAULT;
+       kfree(p);
+       return ret;
+ }
  static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
  {
        int ret = 0;
@@@ -3315,6 -3438,7 +3438,7 @@@ static long btrfs_ioctl_balance(struct 
        struct btrfs_ioctl_balance_args *bargs;
        struct btrfs_balance_control *bctl;
        int ret;
+       int need_to_clear_lock = 0;
  
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
                bargs = NULL;
        }
  
-       if (fs_info->balance_ctl) {
+       if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+                       1)) {
+               pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
                ret = -EINPROGRESS;
                goto out_bargs;
        }
+       need_to_clear_lock = 1;
  
        bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
        if (!bctl) {
@@@ -3387,6 -3514,9 +3514,9 @@@ do_balance
  out_bargs:
        kfree(bargs);
  out:
+       if (need_to_clear_lock)
+               atomic_set(&root->fs_info->mutually_exclusive_operation_running,
+                          0);
        mutex_unlock(&fs_info->balance_mutex);
        mutex_unlock(&fs_info->volume_mutex);
        mnt_drop_write_file(file);
@@@ -3441,8 -3571,9 +3571,9 @@@ out
        return ret;
  }
  
- static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg)
+ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
  {
+       struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
        struct btrfs_ioctl_quota_ctl_args *sa;
        struct btrfs_trans_handle *trans = NULL;
        int ret;
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
  
-       if (root->fs_info->sb->s_flags & MS_RDONLY)
-               return -EROFS;
+       ret = mnt_want_write_file(file);
+       if (ret)
+               return ret;
  
        sa = memdup_user(arg, sizeof(*sa));
-       if (IS_ERR(sa))
-               return PTR_ERR(sa);
+       if (IS_ERR(sa)) {
+               ret = PTR_ERR(sa);
+               goto drop_write;
+       }
  
        if (sa->cmd != BTRFS_QUOTA_CTL_RESCAN) {
                trans = btrfs_start_transaction(root, 2);
                if (err && !ret)
                        ret = err;
        }
  out:
        kfree(sa);
+ drop_write:
+       mnt_drop_write_file(file);
        return ret;
  }
  
- static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg)
+ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
  {
+       struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
        struct btrfs_ioctl_qgroup_assign_args *sa;
        struct btrfs_trans_handle *trans;
        int ret;
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
  
-       if (root->fs_info->sb->s_flags & MS_RDONLY)
-               return -EROFS;
+       ret = mnt_want_write_file(file);
+       if (ret)
+               return ret;
  
        sa = memdup_user(arg, sizeof(*sa));
-       if (IS_ERR(sa))
-               return PTR_ERR(sa);
+       if (IS_ERR(sa)) {
+               ret = PTR_ERR(sa);
+               goto drop_write;
+       }
  
        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans)) {
  
  out:
        kfree(sa);
+ drop_write:
+       mnt_drop_write_file(file);
        return ret;
  }
  
- static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg)
+ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
  {
+       struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
        struct btrfs_ioctl_qgroup_create_args *sa;
        struct btrfs_trans_handle *trans;
        int ret;
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
  
-       if (root->fs_info->sb->s_flags & MS_RDONLY)
-               return -EROFS;
+       ret = mnt_want_write_file(file);
+       if (ret)
+               return ret;
  
        sa = memdup_user(arg, sizeof(*sa));
-       if (IS_ERR(sa))
-               return PTR_ERR(sa);
+       if (IS_ERR(sa)) {
+               ret = PTR_ERR(sa);
+               goto drop_write;
+       }
  
        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans)) {
  
  out:
        kfree(sa);
+ drop_write:
+       mnt_drop_write_file(file);
        return ret;
  }
  
- static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg)
+ static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
  {
+       struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
        struct btrfs_ioctl_qgroup_limit_args *sa;
        struct btrfs_trans_handle *trans;
        int ret;
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
  
-       if (root->fs_info->sb->s_flags & MS_RDONLY)
-               return -EROFS;
+       ret = mnt_want_write_file(file);
+       if (ret)
+               return ret;
  
        sa = memdup_user(arg, sizeof(*sa));
-       if (IS_ERR(sa))
-               return PTR_ERR(sa);
+       if (IS_ERR(sa)) {
+               ret = PTR_ERR(sa);
+               goto drop_write;
+       }
  
        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans)) {
  
  out:
        kfree(sa);
+ drop_write:
+       mnt_drop_write_file(file);
        return ret;
  }
  
@@@ -3735,11 -3888,11 +3888,11 @@@ long btrfs_ioctl(struct file *file, uns
        case BTRFS_IOC_DEFRAG_RANGE:
                return btrfs_ioctl_defrag(file, argp);
        case BTRFS_IOC_RESIZE:
-               return btrfs_ioctl_resize(root, argp);
+               return btrfs_ioctl_resize(file, argp);
        case BTRFS_IOC_ADD_DEV:
                return btrfs_ioctl_add_dev(root, argp);
        case BTRFS_IOC_RM_DEV:
-               return btrfs_ioctl_rm_dev(root, argp);
+               return btrfs_ioctl_rm_dev(file, argp);
        case BTRFS_IOC_FS_INFO:
                return btrfs_ioctl_fs_info(root, argp);
        case BTRFS_IOC_DEV_INFO:
                btrfs_sync_fs(file->f_dentry->d_sb, 1);
                return 0;
        case BTRFS_IOC_START_SYNC:
-               return btrfs_ioctl_start_sync(file, argp);
+               return btrfs_ioctl_start_sync(root, argp);
        case BTRFS_IOC_WAIT_SYNC:
-               return btrfs_ioctl_wait_sync(file, argp);
+               return btrfs_ioctl_wait_sync(root, argp);
        case BTRFS_IOC_SCRUB:
-               return btrfs_ioctl_scrub(root, argp);
+               return btrfs_ioctl_scrub(file, argp);
        case BTRFS_IOC_SCRUB_CANCEL:
                return btrfs_ioctl_scrub_cancel(root, argp);
        case BTRFS_IOC_SCRUB_PROGRESS:
        case BTRFS_IOC_GET_DEV_STATS:
                return btrfs_ioctl_get_dev_stats(root, argp);
        case BTRFS_IOC_QUOTA_CTL:
-               return btrfs_ioctl_quota_ctl(root, argp);
+               return btrfs_ioctl_quota_ctl(file, argp);
        case BTRFS_IOC_QGROUP_ASSIGN:
-               return btrfs_ioctl_qgroup_assign(root, argp);
+               return btrfs_ioctl_qgroup_assign(file, argp);
        case BTRFS_IOC_QGROUP_CREATE:
-               return btrfs_ioctl_qgroup_create(root, argp);
+               return btrfs_ioctl_qgroup_create(file, argp);
        case BTRFS_IOC_QGROUP_LIMIT:
-               return btrfs_ioctl_qgroup_limit(root, argp);
+               return btrfs_ioctl_qgroup_limit(file, argp);
+       case BTRFS_IOC_DEV_REPLACE:
+               return btrfs_ioctl_dev_replace(root, argp);
        }
  
        return -ENOTTY;
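
The recurring atomic_xchg() on mutually_exclusive_operation_running above is a trylock: exactly one of dev add/delete/balance/replace/resize may run at a time, and later callers fail fast with -EINPROGRESS rather than blocking. A userspace model of the pattern, assuming illustrative names:

#include <errno.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int exclusive_op_running;

static int demo_op(const char *name)
{
	/* returns the previous value: nonzero means someone beat us */
	if (atomic_exchange(&exclusive_op_running, 1)) {
		fprintf(stderr, "%s: operation in progress\n", name);
		return -EINPROGRESS;
	}
	printf("%s: running\n", name);
	/* ... do the work ... */
	atomic_store(&exclusive_op_running, 0);
	return 0;
}

int main(void)
{
	demo_op("resize");	/* acquires and releases the guard */
	atomic_exchange(&exclusive_op_running, 1);	/* simulate a running balance */
	demo_op("dev add");	/* fails fast with -EINPROGRESS */
	return 0;
}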
diff --combined fs/btrfs/ordered-data.h
@@@ -76,7 -76,7 +76,7 @@@ struct btrfs_ordered_sum 
  
  #define BTRFS_ORDERED_IOERR 6 /* We had an io error when writing this out */
  
 -#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates wether this ordered extent
 +#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent
                                       * has done its due diligence in updating
                                       * the isize. */
  
@@@ -128,8 -128,11 +128,11 @@@ struct btrfs_ordered_extent 
        struct list_head root_extent_list;
  
        struct btrfs_work work;
- };
  
+       struct completion completion;
+       struct btrfs_work flush_work;
+       struct list_head work_list;
+ };
  
  /*
   * calculates the total size you need to allocate for an ordered sum
@@@ -186,7 -189,7 +189,7 @@@ struct btrfs_ordered_extent *btrfs_look
  int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
                                struct btrfs_ordered_extent *ordered);
  int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
 -void btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
 +int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
  void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 struct inode *inode);
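
The completion and flush_work fields added to struct btrfs_ordered_extent let the flusher queue one work item per extent and then wait on each extent's completion, so the waits overlap instead of running back to back. A pthreads sketch of the same queue-then-wait shape (pthreads stand in for the kernel workqueue and struct completion):

#include <pthread.h>
#include <stdio.h>

struct demo_ordered_extent {
	pthread_t flush_work;		/* stands in for btrfs_work */
	pthread_mutex_t lock;
	pthread_cond_t done_cond;
	int done;			/* stands in for struct completion */
};

static void *flush_fn(void *arg)
{
	struct demo_ordered_extent *oe = arg;

	/* ... write back and finish the ordered extent ... */
	pthread_mutex_lock(&oe->lock);
	oe->done = 1;			/* complete(&oe->completion) */
	pthread_cond_signal(&oe->done_cond);
	pthread_mutex_unlock(&oe->lock);
	return NULL;
}

int main(void)
{
	struct demo_ordered_extent oe = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.done_cond = PTHREAD_COND_INITIALIZER,
		.done = 0,
	};

	pthread_create(&oe.flush_work, NULL, flush_fn, &oe);	/* queue work */

	pthread_mutex_lock(&oe.lock);	/* wait_for_completion() */
	while (!oe.done)
		pthread_cond_wait(&oe.done_cond, &oe.lock);
	pthread_mutex_unlock(&oe.lock);

	pthread_join(oe.flush_work, NULL);
	printf("ordered extent flushed\n");
	return 0;
}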