Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux...
authorLinus Torvalds <torvalds@linux-foundation.org>
Fri, 1 Jun 2012 15:37:31 +0000 (08:37 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Fri, 1 Jun 2012 15:37:31 +0000 (08:37 -0700)
Pull btrfs updates from Chris Mason:
 "This includes a fairly large change from Josef around data writeback
  completion.  Before, the writeback wasn't completed until the metadata
  insertions for the extent were done, and this made for fairly large
  latency spikes on the last page of each ordered extent.

  We already had a separate mechanism for tracking pending metadata
  insertions, so Josef just needed to tweak things a little to end
  writeback earlier on the page.  Overall it makes us much friendlier to
  memory reclaim and lowers latencies quite a lot for synchronous IO.

  Jan Schmidt has finished some background work required to track btree
  blocks as they go through changes in ownership.  It's the missing
  piece he needed for both btrfs send/receive and subvolume quotas.
  Neither of those is ready yet, but the new tracking code is included
  here.  Most of the time, the new code is off.  It is only used by
  scrub and other backref walkers.

  Stefan Behrens has added io failure tracking.  This includes counters
  for which drives are causing the most trouble so the admin (or an
  automated tool) can choose to kick them out.  We're tracking IO
  errors, crc errors, and generation checks we do on each metadata
  block.

  RAID5/6 did miss the cut this time because I'm having trouble with
  corruptions.  I'll nail it down next week and post it for beta testing
  before 3.6"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (58 commits)
  Btrfs: fix tree mod log rewinded level and rewinding of moved keys
  Btrfs: fix tree mod log del_ptr
  Btrfs: add tree_mod_dont_log helper
  Btrfs: add missing spin_lock for insertion into tree mod log
  Btrfs: add inodes before dropping the extent lock in find_all_leafs
  Btrfs: use delayed ref sequence numbers for all fs-tree updates
  Btrfs: fix false positive in check-integrity on unmount
  Btrfs: fix runtime warning in check-integrity check data mode
  Btrfs: set ioprio of scrub readahead to idle
  Btrfs: fix return code in drop_objectid_items
  Btrfs: check to see if the inode is in the log before fsyncing
  Btrfs: return value of btrfs_read_buffer is checked correctly
  Btrfs: read device stats on mount, write modified ones during commit
  Btrfs: add ioctl to get and reset the device stats
  Btrfs: add device counters for detected IO and checksum errors
  btrfs: Drop unused function btrfs_abort_devices()
  Btrfs: fix the same inode id problem when doing auto defragment
  Btrfs: fall back to non-inline if we don't have enough space
  Btrfs: fix how we deal with the orphan block rsv
  Btrfs: convert the inode bit field to use the actual bit operations
  ...

1  2 
fs/btrfs/disk-io.c
fs/btrfs/inode.c

diff --combined fs/btrfs/disk-io.c
@@@ -1153,7 -1153,6 +1153,6 @@@ static void __setup_root(u32 nodesize, 
        root->orphan_block_rsv = NULL;
  
        INIT_LIST_HEAD(&root->dirty_list);
-       INIT_LIST_HEAD(&root->orphan_list);
        INIT_LIST_HEAD(&root->root_list);
        spin_lock_init(&root->orphan_lock);
        spin_lock_init(&root->inode_lock);
        atomic_set(&root->log_commit[0], 0);
        atomic_set(&root->log_commit[1], 0);
        atomic_set(&root->log_writers, 0);
+       atomic_set(&root->orphan_inodes, 0);
        root->log_batch = 0;
        root->log_transid = 0;
        root->last_log_commit = 0;
@@@ -1252,7 -1252,7 +1252,7 @@@ static struct btrfs_root *alloc_log_tre
  
        leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
                                      BTRFS_TREE_LOG_OBJECTID, NULL,
-                                     0, 0, 0, 0);
+                                     0, 0, 0);
        if (IS_ERR(leaf)) {
                kfree(root);
                return ERR_CAST(leaf);
@@@ -1914,11 -1914,14 +1914,14 @@@ int open_ctree(struct super_block *sb
        spin_lock_init(&fs_info->delayed_iput_lock);
        spin_lock_init(&fs_info->defrag_inodes_lock);
        spin_lock_init(&fs_info->free_chunk_lock);
+       spin_lock_init(&fs_info->tree_mod_seq_lock);
+       rwlock_init(&fs_info->tree_mod_log_lock);
        mutex_init(&fs_info->reloc_mutex);
  
        init_completion(&fs_info->kobj_unregister);
        INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
        INIT_LIST_HEAD(&fs_info->space_info);
+       INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
        btrfs_mapping_init(&fs_info->mapping_tree);
        btrfs_init_block_rsv(&fs_info->global_block_rsv);
        btrfs_init_block_rsv(&fs_info->delalloc_block_rsv);
        atomic_set(&fs_info->async_submit_draining, 0);
        atomic_set(&fs_info->nr_async_bios, 0);
        atomic_set(&fs_info->defrag_running, 0);
+       atomic_set(&fs_info->tree_mod_seq, 0);
        fs_info->sb = sb;
        fs_info->max_inline = 8192 * 1024;
        fs_info->metadata_ratio = 0;
        fs_info->defrag_inodes = RB_ROOT;
        fs_info->trans_no_join = 0;
        fs_info->free_chunk_space = 0;
+       fs_info->tree_mod_log = RB_ROOT;
  
        /* readahead state */
        INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
        BTRFS_I(fs_info->btree_inode)->root = tree_root;
        memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
               sizeof(struct btrfs_key));
-       BTRFS_I(fs_info->btree_inode)->dummy_inode = 1;
+       set_bit(BTRFS_INODE_DUMMY,
+               &BTRFS_I(fs_info->btree_inode)->runtime_flags);
        insert_inode_hash(fs_info->btree_inode);
  
        spin_lock_init(&fs_info->block_group_cache_lock);
@@@ -2353,6 -2359,13 +2359,13 @@@ retry_root_backup
        fs_info->generation = generation;
        fs_info->last_trans_committed = generation;
  
+       ret = btrfs_init_dev_stats(fs_info);
+       if (ret) {
+               printk(KERN_ERR "btrfs: failed to init dev_stats: %d\n",
+                      ret);
+               goto fail_block_groups;
+       }
        ret = btrfs_init_space_info(fs_info);
        if (ret) {
                printk(KERN_ERR "Failed to initial space info: %d\n", ret);
@@@ -2556,18 -2569,19 +2569,19 @@@ recovery_tree_root
  
  static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
  {
-       char b[BDEVNAME_SIZE];
        if (uptodate) {
                set_buffer_uptodate(bh);
        } else {
+               struct btrfs_device *device = (struct btrfs_device *)
+                       bh->b_private;
                printk_ratelimited(KERN_WARNING "lost page write due to "
-                                       "I/O error on %s\n",
-                                      bdevname(bh->b_bdev, b));
+                                  "I/O error on %s\n", device->name);
                /* note, we dont' set_buffer_write_io_error because we have
                 * our own ways of dealing with the IO errors
                 */
                clear_buffer_uptodate(bh);
+               btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS);
        }
        unlock_buffer(bh);
        put_bh(bh);
@@@ -2682,6 -2696,7 +2696,7 @@@ static int write_dev_supers(struct btrf
                        set_buffer_uptodate(bh);
                        lock_buffer(bh);
                        bh->b_end_io = btrfs_end_buffer_write_sync;
+                       bh->b_private = device;
                }
  
                /*
@@@ -2740,6 -2755,9 +2755,9 @@@ static int write_dev_flush(struct btrfs
                }
                if (!bio_flagged(bio, BIO_UPTODATE)) {
                        ret = -EIO;
+                       if (!bio_flagged(bio, BIO_EOPNOTSUPP))
+                               btrfs_dev_stat_inc_and_print(device,
+                                       BTRFS_DEV_STAT_FLUSH_ERRS);
                }
  
                /* drop the reference from the wait == 0 run */
         * one reference for us, and we leave it for the
         * caller
         */
 -      device->flush_bio = NULL;;
 +      device->flush_bio = NULL;
        bio = bio_alloc(GFP_NOFS, 0);
        if (!bio)
                return -ENOMEM;
@@@ -2902,19 -2920,6 +2920,6 @@@ int write_ctree_super(struct btrfs_tran
        return ret;
  }
  
- /* Kill all outstanding I/O */
- void btrfs_abort_devices(struct btrfs_root *root)
- {
-       struct list_head *head;
-       struct btrfs_device *dev;
-       mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
-       head = &root->fs_info->fs_devices->devices;
-       list_for_each_entry_rcu(dev, head, dev_list) {
-               blk_abort_queue(dev->bdev->bd_disk->queue);
-       }
-       mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
- }
  void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
  {
        spin_lock(&fs_info->fs_roots_radix_lock);
@@@ -3671,17 -3676,6 +3676,6 @@@ int btrfs_cleanup_transaction(struct bt
        return 0;
  }
  
- static int btree_writepage_io_failed_hook(struct bio *bio, struct page *page,
-                                         u64 start, u64 end,
-                                         struct extent_state *state)
- {
-       struct super_block *sb = page->mapping->host->i_sb;
-       struct btrfs_fs_info *fs_info = btrfs_sb(sb);
-       btrfs_error(fs_info, -EIO,
-                   "Error occured while writing out btree at %llu", start);
-       return -EIO;
- }
  static struct extent_io_ops btree_extent_io_ops = {
        .write_cache_pages_lock_hook = btree_lock_page_hook,
        .readpage_end_io_hook = btree_readpage_end_io_hook,
        .submit_bio_hook = btree_submit_bio_hook,
        /* note we're sharing with inode.c for the merge bio hook */
        .merge_bio_hook = btrfs_merge_bio_hook,
-       .writepage_io_failed_hook = btree_writepage_io_failed_hook,
  };
diff --combined fs/btrfs/inode.c
@@@ -89,7 -89,7 +89,7 @@@ static unsigned char btrfs_type_by_mode
  
  static int btrfs_setsize(struct inode *inode, loff_t newsize);
  static int btrfs_truncate(struct inode *inode);
- static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
+ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
  static noinline int cow_file_range(struct inode *inode,
                                   struct page *locked_page,
                                   u64 start, u64 end, int *page_started,
@@@ -257,10 -257,13 +257,13 @@@ static noinline int cow_file_range_inli
        ret = insert_inline_extent(trans, root, inode, start,
                                   inline_len, compressed_size,
                                   compress_type, compressed_pages);
-       if (ret) {
+       if (ret && ret != -ENOSPC) {
                btrfs_abort_transaction(trans, root, ret);
                return ret;
+       } else if (ret == -ENOSPC) {
+               return 1;
        }
        btrfs_delalloc_release_metadata(inode, end + 1 - start);
        btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
        return 0;
@@@ -1572,11 -1575,11 +1575,11 @@@ static int btrfs_submit_bio_hook(struc
        if (btrfs_is_free_space_inode(root, inode))
                metadata = 2;
  
-       ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
-       if (ret)
-               return ret;
        if (!(rw & REQ_WRITE)) {
+               ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
+               if (ret)
+                       return ret;
                if (bio_flags & EXTENT_BIO_COMPRESSED) {
                        return btrfs_submit_compressed_read(inode, bio,
                                                    mirror_num, bio_flags);
   * an ordered extent if the range of bytes in the file it covers are
   * fully written.
   */
- static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
+ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
  {
+       struct inode *inode = ordered_extent->inode;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_trans_handle *trans = NULL;
-       struct btrfs_ordered_extent *ordered_extent = NULL;
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        struct extent_state *cached_state = NULL;
        int compress_type = 0;
        int ret;
        bool nolock;
  
-       ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
-                                            end - start + 1);
-       if (!ret)
-               return 0;
-       BUG_ON(!ordered_extent); /* Logic error */
        nolock = btrfs_is_free_space_inode(root, inode);
  
+       if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
+               ret = -EIO;
+               goto out;
+       }
        if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
                BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
                ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
                                   ordered_extent->file_offset,
                                   ordered_extent->len);
        }
-       unlock_extent_cached(io_tree, ordered_extent->file_offset,
-                            ordered_extent->file_offset +
-                            ordered_extent->len - 1, &cached_state, GFP_NOFS);
        if (ret < 0) {
                btrfs_abort_transaction(trans, root, ret);
-               goto out;
+               goto out_unlock;
        }
  
        add_pending_csums(trans, inode, ordered_extent->file_offset,
                ret = btrfs_update_inode_fallback(trans, root, inode);
                if (ret) { /* -ENOMEM or corruption */
                        btrfs_abort_transaction(trans, root, ret);
-                       goto out;
+                       goto out_unlock;
                }
        }
        ret = 0;
+ out_unlock:
+       unlock_extent_cached(io_tree, ordered_extent->file_offset,
+                            ordered_extent->file_offset +
+                            ordered_extent->len - 1, &cached_state, GFP_NOFS);
  out:
        if (root != root->fs_info->tree_root)
                btrfs_delalloc_release_metadata(inode, ordered_extent->len);
                        btrfs_end_transaction(trans, root);
        }
  
+       if (ret)
+               clear_extent_uptodate(io_tree, ordered_extent->file_offset,
+                                     ordered_extent->file_offset +
+                                     ordered_extent->len - 1, NULL, GFP_NOFS);
+       /*
+        * This needs to be dont to make sure anybody waiting knows we are done
+        * upating everything for this ordered extent.
+        */
+       btrfs_remove_ordered_extent(inode, ordered_extent);
        /* once for us */
        btrfs_put_ordered_extent(ordered_extent);
        /* once for the tree */
        btrfs_put_ordered_extent(ordered_extent);
  
-       return 0;
- out_unlock:
-       unlock_extent_cached(io_tree, ordered_extent->file_offset,
-                            ordered_extent->file_offset +
-                            ordered_extent->len - 1, &cached_state, GFP_NOFS);
-       goto out;
+       return ret;
+ }
+ static void finish_ordered_fn(struct btrfs_work *work)
+ {
+       struct btrfs_ordered_extent *ordered_extent;
+       ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
+       btrfs_finish_ordered_io(ordered_extent);
  }
  
  static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
                                struct extent_state *state, int uptodate)
  {
+       struct inode *inode = page->mapping->host;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct btrfs_ordered_extent *ordered_extent = NULL;
+       struct btrfs_workers *workers;
        trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
  
        ClearPagePrivate2(page);
-       return btrfs_finish_ordered_io(page->mapping->host, start, end);
+       if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
+                                           end - start + 1, uptodate))
+               return 0;
+       ordered_extent->work.func = finish_ordered_fn;
+       ordered_extent->work.flags = 0;
+       if (btrfs_is_free_space_inode(root, inode))
+               workers = &root->fs_info->endio_freespace_worker;
+       else
+               workers = &root->fs_info->endio_write_workers;
+       btrfs_queue_worker(workers, &ordered_extent->work);
+       return 0;
  }
  
  /*
@@@ -2072,12 -2107,12 +2107,12 @@@ void btrfs_orphan_commit_root(struct bt
        struct btrfs_block_rsv *block_rsv;
        int ret;
  
-       if (!list_empty(&root->orphan_list) ||
+       if (atomic_read(&root->orphan_inodes) ||
            root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
                return;
  
        spin_lock(&root->orphan_lock);
-       if (!list_empty(&root->orphan_list)) {
+       if (atomic_read(&root->orphan_inodes)) {
                spin_unlock(&root->orphan_lock);
                return;
        }
@@@ -2134,8 -2169,8 +2169,8 @@@ int btrfs_orphan_add(struct btrfs_trans
                block_rsv = NULL;
        }
  
-       if (list_empty(&BTRFS_I(inode)->i_orphan)) {
-               list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+       if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+                             &BTRFS_I(inode)->runtime_flags)) {
  #if 0
                /*
                 * For proper ENOSPC handling, we should do orphan
                        insert = 1;
  #endif
                insert = 1;
+               atomic_dec(&root->orphan_inodes);
        }
  
-       if (!BTRFS_I(inode)->orphan_meta_reserved) {
-               BTRFS_I(inode)->orphan_meta_reserved = 1;
+       if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
+                             &BTRFS_I(inode)->runtime_flags))
                reserve = 1;
-       }
        spin_unlock(&root->orphan_lock);
  
        /* grab metadata reservation from transaction handle */
        if (insert >= 1) {
                ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
                if (ret && ret != -EEXIST) {
+                       clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+                                 &BTRFS_I(inode)->runtime_flags);
                        btrfs_abort_transaction(trans, root, ret);
                        return ret;
                }
@@@ -2196,15 -2233,13 +2233,13 @@@ int btrfs_orphan_del(struct btrfs_trans
        int ret = 0;
  
        spin_lock(&root->orphan_lock);
-       if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
-               list_del_init(&BTRFS_I(inode)->i_orphan);
+       if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+                              &BTRFS_I(inode)->runtime_flags))
                delete_item = 1;
-       }
  
-       if (BTRFS_I(inode)->orphan_meta_reserved) {
-               BTRFS_I(inode)->orphan_meta_reserved = 0;
+       if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
+                              &BTRFS_I(inode)->runtime_flags))
                release_rsv = 1;
-       }
        spin_unlock(&root->orphan_lock);
  
        if (trans && delete_item) {
                BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */
        }
  
-       if (release_rsv)
+       if (release_rsv) {
                btrfs_orphan_release_metadata(inode);
+               atomic_dec(&root->orphan_inodes);
+       }
  
        return 0;
  }
@@@ -2341,6 -2378,8 +2378,8 @@@ int btrfs_orphan_cleanup(struct btrfs_r
                                ret = PTR_ERR(trans);
                                goto out;
                        }
+                       printk(KERN_ERR "auto deleting %Lu\n",
+                              found_key.objectid);
                        ret = btrfs_del_orphan_item(trans, root,
                                                    found_key.objectid);
                        BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */
                 * add this inode to the orphan list so btrfs_orphan_del does
                 * the proper thing when we hit it
                 */
-               spin_lock(&root->orphan_lock);
-               list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
-               spin_unlock(&root->orphan_lock);
+               set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+                       &BTRFS_I(inode)->runtime_flags);
  
                /* if we have links, this was a truncate, lets do that */
                if (inode->i_nlink) {
@@@ -2510,7 -2548,7 +2548,7 @@@ static void btrfs_read_locked_inode(str
  
        inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
        BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
-       BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, inode_item);
+       inode->i_version = btrfs_inode_sequence(leaf, inode_item);
        inode->i_generation = BTRFS_I(inode)->generation;
        inode->i_rdev = 0;
        rdev = btrfs_inode_rdev(leaf, inode_item);
@@@ -2594,7 -2632,7 +2632,7 @@@ static void fill_inode_item(struct btrf
  
        btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
        btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
-       btrfs_set_inode_sequence(leaf, item, BTRFS_I(inode)->sequence);
+       btrfs_set_inode_sequence(leaf, item, inode->i_version);
        btrfs_set_inode_transid(leaf, item, trans->transid);
        btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
        btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
@@@ -2752,6 -2790,8 +2790,8 @@@ err
                goto out;
  
        btrfs_i_size_write(dir, dir->i_size - name_len * 2);
+       inode_inc_iversion(inode);
+       inode_inc_iversion(dir);
        inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
        btrfs_update_inode(trans, root, dir);
  out:
@@@ -3089,6 -3129,7 +3129,7 @@@ int btrfs_unlink_subvol(struct btrfs_tr
        }
  
        btrfs_i_size_write(dir, dir->i_size - name_len * 2);
+       inode_inc_iversion(dir);
        dir->i_mtime = dir->i_ctime = CURRENT_TIME;
        ret = btrfs_update_inode(trans, root, dir);
        if (ret)
@@@ -3607,7 -3648,8 +3648,8 @@@ static int btrfs_setsize(struct inode *
                 * any new writes get down to disk quickly.
                 */
                if (newsize == 0)
-                       BTRFS_I(inode)->ordered_data_close = 1;
+                       set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
+                               &BTRFS_I(inode)->runtime_flags);
  
                /* we don't support swapfiles, so vmtruncate shouldn't fail */
                truncate_setsize(inode, newsize);
@@@ -3638,6 -3680,7 +3680,7 @@@ static int btrfs_setattr(struct dentry 
  
        if (attr->ia_valid) {
                setattr_copy(inode, attr);
+               inode_inc_iversion(inode);
                err = btrfs_dirty_inode(inode);
  
                if (!err && attr->ia_valid & ATTR_MODE)
@@@ -3671,7 -3714,8 +3714,8 @@@ void btrfs_evict_inode(struct inode *in
        btrfs_wait_ordered_range(inode, 0, (u64)-1);
  
        if (root->fs_info->log_root_recovering) {
-               BUG_ON(!list_empty(&BTRFS_I(inode)->i_orphan));
+               BUG_ON(!test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+                                &BTRFS_I(inode)->runtime_flags));
                goto no_delete;
        }
  
        btrfs_end_transaction(trans, root);
        btrfs_btree_balance_dirty(root, nr);
  no_delete:
 -      end_writeback(inode);
 +      clear_inode(inode);
        return;
  }
  
@@@ -4066,7 -4110,7 +4110,7 @@@ static struct inode *new_simple_dir(str
  
        BTRFS_I(inode)->root = root;
        memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
-       BTRFS_I(inode)->dummy_inode = 1;
+       set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
  
        inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
        inode->i_op = &btrfs_dir_ro_inode_operations;
@@@ -4370,7 -4414,7 +4414,7 @@@ int btrfs_write_inode(struct inode *ino
        int ret = 0;
        bool nolock = false;
  
-       if (BTRFS_I(inode)->dummy_inode)
+       if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
                return 0;
  
        if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(root, inode))
@@@ -4403,7 -4447,7 +4447,7 @@@ int btrfs_dirty_inode(struct inode *ino
        struct btrfs_trans_handle *trans;
        int ret;
  
-       if (BTRFS_I(inode)->dummy_inode)
+       if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
                return 0;
  
        trans = btrfs_join_transaction(root);
@@@ -4730,6 -4774,7 +4774,7 @@@ int btrfs_add_link(struct btrfs_trans_h
  
        btrfs_i_size_write(parent_inode, parent_inode->i_size +
                           name_len * 2);
+       inode_inc_iversion(parent_inode);
        parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
        ret = btrfs_update_inode(trans, root, parent_inode);
        if (ret)
@@@ -4937,6 -4982,7 +4982,7 @@@ static int btrfs_link(struct dentry *ol
        }
  
        btrfs_inc_nlink(inode);
+       inode_inc_iversion(inode);
        inode->i_ctime = CURRENT_TIME;
        ihold(inode);
  
@@@ -5903,9 -5949,7 +5949,7 @@@ static void btrfs_endio_direct_write(st
        struct btrfs_dio_private *dip = bio->bi_private;
        struct inode *inode = dip->inode;
        struct btrfs_root *root = BTRFS_I(inode)->root;
-       struct btrfs_trans_handle *trans;
        struct btrfs_ordered_extent *ordered = NULL;
-       struct extent_state *cached_state = NULL;
        u64 ordered_offset = dip->logical_offset;
        u64 ordered_bytes = dip->bytes;
        int ret;
  again:
        ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
                                                   &ordered_offset,
-                                                  ordered_bytes);
+                                                  ordered_bytes, !err);
        if (!ret)
                goto out_test;
  
-       BUG_ON(!ordered);
-       trans = btrfs_join_transaction(root);
-       if (IS_ERR(trans)) {
-               err = -ENOMEM;
-               goto out;
-       }
-       trans->block_rsv = &root->fs_info->delalloc_block_rsv;
-       if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
-               ret = btrfs_ordered_update_i_size(inode, 0, ordered);
-               if (!ret)
-                       err = btrfs_update_inode_fallback(trans, root, inode);
-               goto out;
-       }
-       lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset,
-                        ordered->file_offset + ordered->len - 1, 0,
-                        &cached_state);
-       if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
-               ret = btrfs_mark_extent_written(trans, inode,
-                                               ordered->file_offset,
-                                               ordered->file_offset +
-                                               ordered->len);
-               if (ret) {
-                       err = ret;
-                       goto out_unlock;
-               }
-       } else {
-               ret = insert_reserved_file_extent(trans, inode,
-                                                 ordered->file_offset,
-                                                 ordered->start,
-                                                 ordered->disk_len,
-                                                 ordered->len,
-                                                 ordered->len,
-                                                 0, 0, 0,
-                                                 BTRFS_FILE_EXTENT_REG);
-               unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
-                                  ordered->file_offset, ordered->len);
-               if (ret) {
-                       err = ret;
-                       WARN_ON(1);
-                       goto out_unlock;
-               }
-       }
-       add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
-       ret = btrfs_ordered_update_i_size(inode, 0, ordered);
-       if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags))
-               btrfs_update_inode_fallback(trans, root, inode);
-       ret = 0;
- out_unlock:
-       unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
-                            ordered->file_offset + ordered->len - 1,
-                            &cached_state, GFP_NOFS);
- out:
-       btrfs_delalloc_release_metadata(inode, ordered->len);
-       btrfs_end_transaction(trans, root);
-       ordered_offset = ordered->file_offset + ordered->len;
-       btrfs_put_ordered_extent(ordered);
-       btrfs_put_ordered_extent(ordered);
+       ordered->work.func = finish_ordered_fn;
+       ordered->work.flags = 0;
+       btrfs_queue_worker(&root->fs_info->endio_write_workers,
+                          &ordered->work);
  out_test:
        /*
         * our bio might span multiple ordered extents.  If we haven't
        if (ordered_offset < dip->logical_offset + dip->bytes) {
                ordered_bytes = dip->logical_offset + dip->bytes -
                        ordered_offset;
+               ordered = NULL;
                goto again;
        }
  out_done:
        bio->bi_private = dip->private;
  
-       kfree(dip->csums);
        kfree(dip);
  
        /* If we had an error make sure to clear the uptodate flag */
@@@ -6063,9 -6048,12 +6048,12 @@@ static inline int __btrfs_submit_dio_bi
        int ret;
  
        bio_get(bio);
-       ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
-       if (ret)
-               goto err;
+       if (!write) {
+               ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+               if (ret)
+                       goto err;
+       }
  
        if (skip_sum)
                goto map;
@@@ -6485,13 -6473,13 +6473,13 @@@ static int btrfs_releasepage(struct pag
  
  static void btrfs_invalidatepage(struct page *page, unsigned long offset)
  {
+       struct inode *inode = page->mapping->host;
        struct extent_io_tree *tree;
        struct btrfs_ordered_extent *ordered;
        struct extent_state *cached_state = NULL;
        u64 page_start = page_offset(page);
        u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
  
        /*
         * we have the page locked, so new writeback can't start,
         * and the dirty bit won't be cleared while we are here.
         */
        wait_on_page_writeback(page);
  
-       tree = &BTRFS_I(page->mapping->host)->io_tree;
+       tree = &BTRFS_I(inode)->io_tree;
        if (offset) {
                btrfs_releasepage(page, GFP_NOFS);
                return;
        }
        lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
-       ordered = btrfs_lookup_ordered_extent(page->mapping->host,
+       ordered = btrfs_lookup_ordered_extent(inode,
                                           page_offset(page));
        if (ordered) {
                /*
                 * whoever cleared the private bit is responsible
                 * for the finish_ordered_io
                 */
-               if (TestClearPagePrivate2(page)) {
-                       btrfs_finish_ordered_io(page->mapping->host,
-                                               page_start, page_end);
+               if (TestClearPagePrivate2(page) &&
+                   btrfs_dec_test_ordered_pending(inode, &ordered, page_start,
+                                                  PAGE_CACHE_SIZE, 1)) {
+                       btrfs_finish_ordered_io(ordered);
                }
                btrfs_put_ordered_extent(ordered);
                cached_state = NULL;
@@@ -6771,7 -6760,8 +6760,8 @@@ static int btrfs_truncate(struct inode 
         * using truncate to replace the contents of the file will
         * end up with a zero length file after a crash.
         */
-       if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close)
+       if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
+                                          &BTRFS_I(inode)->runtime_flags))
                btrfs_add_ordered_operation(trans, root, inode);
  
        while (1) {
@@@ -6894,7 -6884,6 +6884,6 @@@ struct inode *btrfs_alloc_inode(struct 
        ei->root = NULL;
        ei->space_info = NULL;
        ei->generation = 0;
-       ei->sequence = 0;
        ei->last_trans = 0;
        ei->last_sub_trans = 0;
        ei->logged_trans = 0;
        ei->outstanding_extents = 0;
        ei->reserved_extents = 0;
  
-       ei->ordered_data_close = 0;
-       ei->orphan_meta_reserved = 0;
-       ei->dummy_inode = 0;
-       ei->in_defrag = 0;
-       ei->delalloc_meta_reserved = 0;
+       ei->runtime_flags = 0;
        ei->force_compress = BTRFS_COMPRESS_NONE;
  
        ei->delayed_node = NULL;
        mutex_init(&ei->log_mutex);
        mutex_init(&ei->delalloc_mutex);
        btrfs_ordered_inode_tree_init(&ei->ordered_tree);
-       INIT_LIST_HEAD(&ei->i_orphan);
        INIT_LIST_HEAD(&ei->delalloc_inodes);
        INIT_LIST_HEAD(&ei->ordered_operations);
        RB_CLEAR_NODE(&ei->rb_node);
@@@ -6972,13 -6956,12 +6956,12 @@@ void btrfs_destroy_inode(struct inode *
                spin_unlock(&root->fs_info->ordered_extent_lock);
        }
  
-       spin_lock(&root->orphan_lock);
-       if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
+       if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+                    &BTRFS_I(inode)->runtime_flags)) {
                printk(KERN_INFO "BTRFS: inode %llu still on the orphan list\n",
                       (unsigned long long)btrfs_ino(inode));
-               list_del_init(&BTRFS_I(inode)->i_orphan);
+               atomic_dec(&root->orphan_inodes);
        }
-       spin_unlock(&root->orphan_lock);
  
        while (1) {
                ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@@ -7193,6 -7176,9 +7176,9 @@@ static int btrfs_rename(struct inode *o
        if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode))
                btrfs_add_ordered_operation(trans, root, old_inode);
  
+       inode_inc_iversion(old_dir);
+       inode_inc_iversion(new_dir);
+       inode_inc_iversion(old_inode);
        old_dir->i_ctime = old_dir->i_mtime = ctime;
        new_dir->i_ctime = new_dir->i_mtime = ctime;
        old_inode->i_ctime = ctime;
        }
  
        if (new_inode) {
+               inode_inc_iversion(new_inode);
                new_inode->i_ctime = CURRENT_TIME;
                if (unlikely(btrfs_ino(new_inode) ==
                             BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
@@@ -7490,6 -7477,7 +7477,7 @@@ static int __btrfs_prealloc_file_range(
                cur_offset += ins.offset;
                *alloc_hint = ins.objectid + ins.offset;
  
+               inode_inc_iversion(inode);
                inode->i_ctime = CURRENT_TIME;
                BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
                if (!(mode & FALLOC_FL_KEEP_SIZE) &&