Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 10 Dec 2009 17:33:29 +0000 (09:33 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 10 Dec 2009 17:33:29 +0000 (09:33 -0800)
* 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (47 commits)
  ext4: Fix potential fiemap deadlock (mmap_sem vs. i_data_sem)
  ext4: Do not override ext2 or ext3 if built they are built as modules
  jbd2: Export jbd2_log_start_commit to fix ext4 build
  ext4: Fix insufficient checks in EXT4_IOC_MOVE_EXT
  ext4: Wait for proper transaction commit on fsync
  ext4: fix incorrect block reservation on quota transfer.
  ext4: quota macros cleanup
  ext4: ext4_get_reserved_space() must return bytes instead of blocks
  ext4: remove blocks from inode prealloc list on failure
  ext4: wait for log to commit when umounting
  ext4: Avoid data / filesystem corruption when write fails to copy data
  ext4: Use ext4 file system driver for ext2/ext3 file system mounts
  ext4: Return the PTR_ERR of the correct pointer in setup_new_group_blocks()
  jbd2: Add ENOMEM checking in and for jbd2_journal_write_metadata_buffer()
  ext4: remove unused parameter wbc from __ext4_journalled_writepage()
  ext4: remove encountered_congestion trace
  ext4: move_extent_per_page() cleanup
  ext4: initialize moved_len before calling ext4_move_extents()
  ext4: Fix double-free of blocks with EXT4_IOC_MOVE_EXT
  ext4: use ext4_data_block_valid() in ext4_free_blocks()
  ...

1  2 
fs/ext4/inode.c
fs/ext4/mballoc.c
include/trace/events/ext4.h

diff --combined fs/ext4/inode.c
@@@ -71,58 -71,6 +71,6 @@@ static int ext4_inode_is_fast_symlink(s
  }
  
  /*
-  * The ext4 forget function must perform a revoke if we are freeing data
-  * which has been journaled.  Metadata (eg. indirect blocks) must be
-  * revoked in all cases.
-  *
-  * "bh" may be NULL: a metadata block may have been freed from memory
-  * but there may still be a record of it in the journal, and that record
-  * still needs to be revoked.
-  *
-  * If the handle isn't valid we're not journaling, but we still need to
-  * call into ext4_journal_revoke() to put the buffer head.
-  */
- int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
-               struct buffer_head *bh, ext4_fsblk_t blocknr)
- {
-       int err;
-       might_sleep();
-       BUFFER_TRACE(bh, "enter");
-       jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
-                 "data mode %x\n",
-                 bh, is_metadata, inode->i_mode,
-                 test_opt(inode->i_sb, DATA_FLAGS));
-       /* Never use the revoke function if we are doing full data
-        * journaling: there is no need to, and a V1 superblock won't
-        * support it.  Otherwise, only skip the revoke on un-journaled
-        * data blocks. */
-       if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
-           (!is_metadata && !ext4_should_journal_data(inode))) {
-               if (bh) {
-                       BUFFER_TRACE(bh, "call jbd2_journal_forget");
-                       return ext4_journal_forget(handle, bh);
-               }
-               return 0;
-       }
-       /*
-        * data!=journal && (is_metadata || should_journal_data(inode))
-        */
-       BUFFER_TRACE(bh, "call ext4_journal_revoke");
-       err = ext4_journal_revoke(handle, blocknr, bh);
-       if (err)
-               ext4_abort(inode->i_sb, __func__,
-                          "error %d when attempting revoke", err);
-       BUFFER_TRACE(bh, "exit");
-       return err;
- }
- /*
   * Work out how many blocks we need to proceed with the next chunk of a
   * truncate transaction.
   */
@@@ -721,7 -669,7 +669,7 @@@ allocated
        return ret;
  failed_out:
        for (i = 0; i < index; i++)
-               ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
+               ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0);
        return ret;
  }
  
@@@ -817,14 -765,20 +765,20 @@@ static int ext4_alloc_branch(handle_t *
        return err;
  failed:
        /* Allocation failed, free what we already allocated */
+       ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0);
        for (i = 1; i <= n ; i++) {
-               BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget");
-               ext4_journal_forget(handle, branch[i].bh);
+               /* 
+                * branch[i].bh is newly allocated, so there is no
+                * need to revoke the block, which is why we don't
+                * need to set EXT4_FREE_BLOCKS_METADATA.
+                */
+               ext4_free_blocks(handle, inode, 0, new_blocks[i], 1,
+                                EXT4_FREE_BLOCKS_FORGET);
        }
-       for (i = 0; i < indirect_blks; i++)
-               ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
+       for (i = n+1; i < indirect_blks; i++)
+               ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0);
  
-       ext4_free_blocks(handle, inode, new_blocks[i], num, 0);
+       ext4_free_blocks(handle, inode, 0, new_blocks[i], num, 0);
  
        return err;
  }
@@@ -903,12 -857,16 +857,16 @@@ static int ext4_splice_branch(handle_t 
  
  err_out:
        for (i = 1; i <= num; i++) {
-               BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget");
-               ext4_journal_forget(handle, where[i].bh);
-               ext4_free_blocks(handle, inode,
-                                       le32_to_cpu(where[i-1].key), 1, 0);
+               /* 
+                * branch[i].bh is newly allocated, so there is no
+                * need to revoke the block, which is why we don't
+                * need to set EXT4_FREE_BLOCKS_METADATA.
+                */
+               ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
+                                EXT4_FREE_BLOCKS_FORGET);
        }
-       ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0);
+       ext4_free_blocks(handle, inode, 0, le32_to_cpu(where[num].key),
+                        blks, 0);
  
        return err;
  }
@@@ -1021,10 -979,12 +979,12 @@@ static int ext4_ind_get_blocks(handle_
        if (!err)
                err = ext4_splice_branch(handle, inode, iblock,
                                         partial, indirect_blks, count);
-       else
+       if (err)
                goto cleanup;
  
        set_buffer_new(bh_result);
+       ext4_update_inode_fsync_trans(handle, inode, 1);
  got_it:
        map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
        if (count > blocks_to_boundary)
@@@ -1052,7 -1012,7 +1012,7 @@@ qsize_t ext4_get_reserved_space(struct 
                EXT4_I(inode)->i_reserved_meta_blocks;
        spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
  
-       return total;
+       return (total << inode->i_blkbits);
  }
  /*
   * Calculate the number of metadata blocks need to reserve
@@@ -1534,6 -1494,16 +1494,16 @@@ static int do_journal_get_write_access(
        return ext4_journal_get_write_access(handle, bh);
  }
  
+ /*
+  * Truncate blocks that were not used by write. We have to truncate the
+  * pagecache as well so that corresponding buffers get properly unmapped.
+  */
+ static void ext4_truncate_failed_write(struct inode *inode)
+ {
+       truncate_inode_pages(inode->i_mapping, inode->i_size);
+       ext4_truncate(inode);
+ }
  static int ext4_write_begin(struct file *file, struct address_space *mapping,
                            loff_t pos, unsigned len, unsigned flags,
                            struct page **pagep, void **fsdata)
@@@ -1599,7 -1569,7 +1569,7 @@@ retry
  
                ext4_journal_stop(handle);
                if (pos + len > inode->i_size) {
-                       ext4_truncate(inode);
+                       ext4_truncate_failed_write(inode);
                        /*
                         * If truncate failed early the inode might
                         * still be on the orphan list; we need to
@@@ -1709,7 -1679,7 +1679,7 @@@ static int ext4_ordered_write_end(struc
                ret = ret2;
  
        if (pos + len > inode->i_size) {
-               ext4_truncate(inode);
+               ext4_truncate_failed_write(inode);
                /*
                 * If truncate failed early the inode might still be
                 * on the orphan list; we need to make sure the inode
@@@ -1751,7 -1721,7 +1721,7 @@@ static int ext4_writeback_write_end(str
                ret = ret2;
  
        if (pos + len > inode->i_size) {
-               ext4_truncate(inode);
+               ext4_truncate_failed_write(inode);
                /*
                 * If truncate failed early the inode might still be
                 * on the orphan list; we need to make sure the inode
@@@ -1814,7 -1784,7 +1784,7 @@@ static int ext4_journalled_write_end(st
        if (!ret)
                ret = ret2;
        if (pos + len > inode->i_size) {
-               ext4_truncate(inode);
+               ext4_truncate_failed_write(inode);
                /*
                 * If truncate failed early the inode might still be
                 * on the orphan list; we need to make sure the inode
@@@ -2600,7 -2570,6 +2570,6 @@@ static int bput_one(handle_t *handle, s
  }
  
  static int __ext4_journalled_writepage(struct page *page,
-                                      struct writeback_control *wbc,
                                       unsigned int len)
  {
        struct address_space *mapping = page->mapping;
@@@ -2758,7 -2727,7 +2727,7 @@@ static int ext4_writepage(struct page *
                 * doesn't seem much point in redirtying the page here.
                 */
                ClearPageChecked(page);
-               return __ext4_journalled_writepage(page, wbc, len);
+               return __ext4_journalled_writepage(page, len);
        }
  
        if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
@@@ -2788,7 -2757,7 +2757,7 @@@ static int ext4_da_writepages_trans_blo
         * number of contiguous block. So we will limit
         * number of contiguous block to a sane value
         */
-       if (!(inode->i_flags & EXT4_EXTENTS_FL) &&
+       if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) &&
            (max_blocks > EXT4_MAX_TRANS_DATA))
                max_blocks = EXT4_MAX_TRANS_DATA;
  
@@@ -2933,7 -2902,7 +2902,7 @@@ retry
                ret = write_cache_pages(mapping, wbc, __mpage_da_writepage,
                                        &mpd);
                /*
 -               * If we have a contigous extent of pages and we
 +               * If we have a contiguous extent of pages and we
                 * haven't done the I/O yet, map the blocks and submit
                 * them for I/O.
                 */
@@@ -3091,7 -3060,7 +3060,7 @@@ retry
                 * i_size_read because we hold i_mutex.
                 */
                if (pos + len > inode->i_size)
-                       ext4_truncate(inode);
+                       ext4_truncate_failed_write(inode);
        }
  
        if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@@ -4064,7 -4033,7 +4033,7 @@@ static Indirect *ext4_find_shared(struc
        int k, err;
  
        *top = 0;
 -      /* Make k index the deepest non-null offest + 1 */
 +      /* Make k index the deepest non-null offset + 1 */
        for (k = depth; k > 1 && !offsets[k-1]; k--)
                ;
        partial = ext4_get_branch(inode, k, offsets, chain, &err);
@@@ -4120,6 -4089,11 +4089,11 @@@ static void ext4_clear_blocks(handle_t 
                              __le32 *last)
  {
        __le32 *p;
+       int     flags = EXT4_FREE_BLOCKS_FORGET;
+       if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+               flags |= EXT4_FREE_BLOCKS_METADATA;
        if (try_to_extend_transaction(handle, inode)) {
                if (bh) {
                        BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
                }
        }
  
-       /*
-        * Any buffers which are on the journal will be in memory. We
-        * find them on the hash table so jbd2_journal_revoke() will
-        * run jbd2_journal_forget() on them.  We've already detached
-        * each block from the file, so bforget() in
-        * jbd2_journal_forget() should be safe.
-        *
-        * AKPM: turn on bforget in jbd2_journal_forget()!!!
-        */
-       for (p = first; p < last; p++) {
-               u32 nr = le32_to_cpu(*p);
-               if (nr) {
-                       struct buffer_head *tbh;
+       for (p = first; p < last; p++)
+               *p = 0;
  
-                       *p = 0;
-                       tbh = sb_find_get_block(inode->i_sb, nr);
-                       ext4_forget(handle, 0, inode, tbh, nr);
-               }
-       }
-       ext4_free_blocks(handle, inode, block_to_free, count, 0);
+       ext4_free_blocks(handle, inode, 0, block_to_free, count, flags);
  }
  
  /**
@@@ -4342,7 -4299,8 +4299,8 @@@ static void ext4_free_branches(handle_
                                            blocks_for_truncate(inode));
                        }
  
-                       ext4_free_blocks(handle, inode, nr, 1, 1);
+                       ext4_free_blocks(handle, inode, 0, nr, 1,
+                                        EXT4_FREE_BLOCKS_METADATA);
  
                        if (parent_bh) {
                                /*
@@@ -4781,8 -4739,8 +4739,8 @@@ struct inode *ext4_iget(struct super_bl
        struct ext4_iloc iloc;
        struct ext4_inode *raw_inode;
        struct ext4_inode_info *ei;
-       struct buffer_head *bh;
        struct inode *inode;
+       journal_t *journal = EXT4_SB(sb)->s_journal;
        long ret;
        int block;
  
                return inode;
  
        ei = EXT4_I(inode);
+       iloc.bh = 0;
  
        ret = __ext4_get_inode_loc(inode, &iloc, 0);
        if (ret < 0)
                goto bad_inode;
-       bh = iloc.bh;
        raw_inode = ext4_raw_inode(&iloc);
        inode->i_mode = le16_to_cpu(raw_inode->i_mode);
        inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
                if (inode->i_mode == 0 ||
                    !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
                        /* this inode is deleted */
-                       brelse(bh);
                        ret = -ESTALE;
                        goto bad_inode;
                }
                ei->i_data[block] = raw_inode->i_block[block];
        INIT_LIST_HEAD(&ei->i_orphan);
  
+       /*
+        * Set transaction id's of transactions that have to be committed
+        * to finish f[data]sync. We set them to currently running transaction
+        * as we cannot be sure that the inode or some of its metadata isn't
+        * part of the transaction - the inode could have been reclaimed and
+        * now it is reread from disk.
+        */
+       if (journal) {
+               transaction_t *transaction;
+               tid_t tid;
+               spin_lock(&journal->j_state_lock);
+               if (journal->j_running_transaction)
+                       transaction = journal->j_running_transaction;
+               else
+                       transaction = journal->j_committing_transaction;
+               if (transaction)
+                       tid = transaction->t_tid;
+               else
+                       tid = journal->j_commit_sequence;
+               spin_unlock(&journal->j_state_lock);
+               ei->i_sync_tid = tid;
+               ei->i_datasync_tid = tid;
+       }
        if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
                ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
                if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
                    EXT4_INODE_SIZE(inode->i_sb)) {
-                       brelse(bh);
                        ret = -EIO;
                        goto bad_inode;
                }
  
        ret = 0;
        if (ei->i_file_acl &&
-           ((ei->i_file_acl <
-             (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
-              EXT4_SB(sb)->s_gdb_count)) ||
-            (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) {
+           !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
                ext4_error(sb, __func__,
                           "bad extended attribute block %llu in inode #%lu",
                           ei->i_file_acl, inode->i_ino);
                /* Validate block references which are part of inode */
                ret = ext4_check_inode_blockref(inode);
        }
-       if (ret) {
-               brelse(bh);
+       if (ret)
                goto bad_inode;
-       }
  
        if (S_ISREG(inode->i_mode)) {
                inode->i_op = &ext4_file_inode_operations;
                        init_special_inode(inode, inode->i_mode,
                           new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
        } else {
-               brelse(bh);
                ret = -EIO;
                ext4_error(inode->i_sb, __func__,
                           "bogus i_mode (%o) for inode=%lu",
        return inode;
  
  bad_inode:
+       brelse(iloc.bh);
        iget_failed(inode);
        return ERR_PTR(ret);
  }
@@@ -5108,6 -5084,7 +5084,7 @@@ static int ext4_do_update_inode(handle_
                err = rc;
        ei->i_state &= ~EXT4_STATE_NEW;
  
+       ext4_update_inode_fsync_trans(handle, inode, 0);
  out_brelse:
        brelse(bh);
        ext4_std_error(inode->i_sb, err);
@@@ -5227,8 -5204,8 +5204,8 @@@ int ext4_setattr(struct dentry *dentry
  
                /* (user+group)*(old+new) structure, inode write (sb,
                 * inode block, ? - but truncate inode update has it) */
-               handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+
-                                       EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3);
+               handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
+                                       EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3);
                if (IS_ERR(handle)) {
                        error = PTR_ERR(handle);
                        goto err_out;
@@@ -5376,7 -5353,7 +5353,7 @@@ static int ext4_index_trans_blocks(stru
   * worse case, the indexs blocks spread over different block groups
   *
   * If datablocks are discontiguous, they are possible to spread over
 - * different block groups too. If they are contiugous, with flexbg,
 + * different block groups too. If they are contiuguous, with flexbg,
   * they could still across block group boundary.
   *
   * Also account for superblock, inode, quota and xattr blocks
@@@ -5452,7 -5429,7 +5429,7 @@@ int ext4_writepage_trans_blocks(struct 
   * Calculate the journal credits for a chunk of data modification.
   *
   * This is called from DIO, fallocate or whoever calling
 - * ext4_get_blocks() to map/allocate a chunk of contigous disk blocks.
 + * ext4_get_blocks() to map/allocate a chunk of contiguous disk blocks.
   *
   * journal buffers for data blocks are not included here, as DIO
   * and fallocate do no need to journal data buffers.
diff --combined fs/ext4/mballoc.c
   * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
   * value of s_mb_order2_reqs can be tuned via
   * /sys/fs/ext4/<partition>/mb_order2_req.  If the request len is equal to
 - * stripe size (sbi->s_stripe), we try to search for contigous block in
 + * stripe size (sbi->s_stripe), we try to search for contiguous block in
   * stripe size. This should result in better allocation on RAID setups. If
   * not, we search in the specific group using bitmap for best extents. The
   * tunable min_to_scan and max_to_scan control the behaviour here.
@@@ -2529,7 -2529,6 +2529,6 @@@ static void release_blocks_on_commit(jo
        struct ext4_group_info *db;
        int err, count = 0, count2 = 0;
        struct ext4_free_data *entry;
-       ext4_fsblk_t discard_block;
        struct list_head *l, *ltmp;
  
        list_for_each_safe(l, ltmp, &txn->t_private_list) {
                        page_cache_release(e4b.bd_bitmap_page);
                }
                ext4_unlock_group(sb, entry->group);
-               discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb)
-                       + entry->start_blk
-                       + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
-               trace_ext4_discard_blocks(sb, (unsigned long long)discard_block,
-                                         entry->count);
-               sb_issue_discard(sb, discard_block, entry->count);
+               if (test_opt(sb, DISCARD)) {
+                       ext4_fsblk_t discard_block;
+                       struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+                       discard_block = (ext4_fsblk_t)entry->group *
+                                               EXT4_BLOCKS_PER_GROUP(sb)
+                                       + entry->start_blk
+                                       + le32_to_cpu(es->s_first_data_block);
+                       trace_ext4_discard_blocks(sb,
+                                       (unsigned long long)discard_block,
+                                       entry->count);
+                       sb_issue_discard(sb, discard_block, entry->count);
+               }
                kmem_cache_free(ext4_free_ext_cachep, entry);
                ext4_mb_release_desc(&e4b);
        }
@@@ -3006,6 -3011,24 +3011,24 @@@ static void ext4_mb_collect_stats(struc
  }
  
  /*
+  * Called on failure; free up any blocks from the inode PA for this
+  * context.  We don't need this for MB_GROUP_PA because we only change
+  * pa_free in ext4_mb_release_context(), but on failure, we've already
+  * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed.
+  */
+ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
+ {
+       struct ext4_prealloc_space *pa = ac->ac_pa;
+       int len;
+       if (pa && pa->pa_type == MB_INODE_PA) {
+               len = ac->ac_b_ex.fe_len;
+               pa->pa_free += len;
+       }
+ }
+ /*
   * use blocks preallocated to inode
   */
  static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
@@@ -4290,6 -4313,7 +4313,7 @@@ repeat
                        ac->ac_status = AC_STATUS_CONTINUE;
                        goto repeat;
                } else if (*errp) {
+                       ext4_discard_allocated_blocks(ac);
                        ac->ac_b_ex.fe_len = 0;
                        ar->len = 0;
                        ext4_mb_show_ac(ac);
@@@ -4422,18 -4446,24 +4446,24 @@@ ext4_mb_free_metadata(handle_t *handle
        return 0;
  }
  
- /*
-  * Main entry point into mballoc to free blocks
+ /**
+  * ext4_free_blocks() -- Free given blocks and update quota
+  * @handle:           handle for this transaction
+  * @inode:            inode
+  * @block:            start physical block to free
+  * @count:            number of blocks to count
+  * @metadata:                 Are these metadata blocks
   */
- void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
-                       ext4_fsblk_t block, unsigned long count,
-                       int metadata, unsigned long *freed)
+ void ext4_free_blocks(handle_t *handle, struct inode *inode,
+                     struct buffer_head *bh, ext4_fsblk_t block,
+                     unsigned long count, int flags)
  {
        struct buffer_head *bitmap_bh = NULL;
        struct super_block *sb = inode->i_sb;
        struct ext4_allocation_context *ac = NULL;
        struct ext4_group_desc *gdp;
        struct ext4_super_block *es;
+       unsigned long freed = 0;
        unsigned int overflow;
        ext4_grpblk_t bit;
        struct buffer_head *gd_bh;
        int err = 0;
        int ret;
  
-       *freed = 0;
+       if (bh) {
+               if (block)
+                       BUG_ON(block != bh->b_blocknr);
+               else
+                       block = bh->b_blocknr;
+       }
  
        sbi = EXT4_SB(sb);
        es = EXT4_SB(sb)->s_es;
-       if (block < le32_to_cpu(es->s_first_data_block) ||
-           block + count < block ||
-           block + count > ext4_blocks_count(es)) {
+       if (!ext4_data_block_valid(sbi, block, count)) {
                ext4_error(sb, __func__,
                            "Freeing blocks not in datazone - "
                            "block = %llu, count = %lu", block, count);
        }
  
        ext4_debug("freeing block %llu\n", block);
-       trace_ext4_free_blocks(inode, block, count, metadata);
+       trace_ext4_free_blocks(inode, block, count, flags);
+       if (flags & EXT4_FREE_BLOCKS_FORGET) {
+               struct buffer_head *tbh = bh;
+               int i;
+               BUG_ON(bh && (count > 1));
+               for (i = 0; i < count; i++) {
+                       if (!bh)
+                               tbh = sb_find_get_block(inode->i_sb,
+                                                       block + i);
+                       ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, 
+                                   inode, tbh, block + i);
+               }
+       }
+       /* 
+        * We need to make sure we don't reuse the freed block until
+        * after the transaction is committed, which we can do by
+        * treating the block as metadata, below.  We make an
+        * exception if the inode is to be written in writeback mode
+        * since writeback mode has weak data consistency guarantees.
+        */
+       if (!ext4_should_writeback_data(inode))
+               flags |= EXT4_FREE_BLOCKS_METADATA;
  
        ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
        if (ac) {
@@@ -4533,7 -4591,8 +4591,8 @@@ do_more
        err = ext4_mb_load_buddy(sb, block_group, &e4b);
        if (err)
                goto error_return;
-       if (metadata && ext4_handle_valid(handle)) {
+       if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) {
                struct ext4_free_data *new_entry;
                /*
                 * blocks being freed are metadata. these blocks shouldn't
  
        ext4_mb_release_desc(&e4b);
  
-       *freed += count;
+       freed += count;
  
        /* We dirtied the bitmap block */
        BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
        }
        sb->s_dirt = 1;
  error_return:
+       if (freed)
+               vfs_dq_free_block(inode, freed);
        brelse(bitmap_bh);
        ext4_std_error(sb, err);
        if (ac)
@@@ -38,7 -38,7 +38,7 @@@ TRACE_EVENT(ext4_free_inode
                __entry->blocks = inode->i_blocks;
        ),
  
-       TP_printk("dev %s ino %lu mode %d uid %u gid %u blocks %llu",
+       TP_printk("dev %s ino %lu mode 0%o uid %u gid %u blocks %llu",
                  jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
                  __entry->mode, __entry->uid, __entry->gid,
                  (unsigned long long) __entry->blocks)
@@@ -61,7 -61,7 +61,7 @@@ TRACE_EVENT(ext4_request_inode
                __entry->mode   = mode;
        ),
  
-       TP_printk("dev %s dir %lu mode %d",
+       TP_printk("dev %s dir %lu mode 0%o",
                  jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->dir,
                  __entry->mode)
  );
@@@ -85,12 -85,12 +85,12 @@@ TRACE_EVENT(ext4_allocate_inode
                __entry->mode   = mode;
        ),
  
-       TP_printk("dev %s ino %lu dir %lu mode %d",
+       TP_printk("dev %s ino %lu dir %lu mode 0%o",
                  jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
                  (unsigned long) __entry->dir, __entry->mode)
  );
  
 -TRACE_EVENT(ext4_write_begin,
 +DECLARE_EVENT_CLASS(ext4__write_begin,
  
        TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
                 unsigned int flags),
                  __entry->pos, __entry->len, __entry->flags)
  );
  
 -TRACE_EVENT(ext4_ordered_write_end,
 +DEFINE_EVENT(ext4__write_begin, ext4_write_begin,
 +
 +      TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
 +               unsigned int flags),
 +
 +      TP_ARGS(inode, pos, len, flags)
 +);
 +
 +DEFINE_EVENT(ext4__write_begin, ext4_da_write_begin,
 +
 +      TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
 +               unsigned int flags),
 +
 +      TP_ARGS(inode, pos, len, flags)
 +);
 +
 +DECLARE_EVENT_CLASS(ext4__write_end,
        TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
                        unsigned int copied),
  
                  __entry->pos, __entry->len, __entry->copied)
  );
  
 -TRACE_EVENT(ext4_writeback_write_end,
 +DEFINE_EVENT(ext4__write_end, ext4_ordered_write_end,
 +
        TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
                 unsigned int copied),
  
 -      TP_ARGS(inode, pos, len, copied),
 +      TP_ARGS(inode, pos, len, copied)
 +);
  
 -      TP_STRUCT__entry(
 -              __field(        dev_t,  dev                     )
 -              __field(        ino_t,  ino                     )
 -              __field(        loff_t, pos                     )
 -              __field(        unsigned int, len               )
 -              __field(        unsigned int, copied            )
 -      ),
 +DEFINE_EVENT(ext4__write_end, ext4_writeback_write_end,
  
 -      TP_fast_assign(
 -              __entry->dev    = inode->i_sb->s_dev;
 -              __entry->ino    = inode->i_ino;
 -              __entry->pos    = pos;
 -              __entry->len    = len;
 -              __entry->copied = copied;
 -      ),
 +      TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
 +               unsigned int copied),
  
 -      TP_printk("dev %s ino %lu pos %llu len %u copied %u",
 -                jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
 -                __entry->pos, __entry->len, __entry->copied)
 +      TP_ARGS(inode, pos, len, copied)
  );
  
 -TRACE_EVENT(ext4_journalled_write_end,
 +DEFINE_EVENT(ext4__write_end, ext4_journalled_write_end,
 +
        TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
                 unsigned int copied),
 -      TP_ARGS(inode, pos, len, copied),
  
 -      TP_STRUCT__entry(
 -              __field(        dev_t,  dev                     )
 -              __field(        ino_t,  ino                     )
 -              __field(        loff_t, pos                     )
 -              __field(        unsigned int, len               )
 -              __field(        unsigned int, copied            )
 -      ),
 +      TP_ARGS(inode, pos, len, copied)
 +);
  
 -      TP_fast_assign(
 -              __entry->dev    = inode->i_sb->s_dev;
 -              __entry->ino    = inode->i_ino;
 -              __entry->pos    = pos;
 -              __entry->len    = len;
 -              __entry->copied = copied;
 -      ),
 +DEFINE_EVENT(ext4__write_end, ext4_da_write_end,
  
 -      TP_printk("dev %s ino %lu pos %llu len %u copied %u",
 -                jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
 -                __entry->pos, __entry->len, __entry->copied)
 +      TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
 +               unsigned int copied),
 +
 +      TP_ARGS(inode, pos, len, copied)
  );
  
  TRACE_EVENT(ext4_writepage,
@@@ -305,7 -310,6 +305,6 @@@ TRACE_EVENT(ext4_da_writepages_result
                __field(        int,    ret                     )
                __field(        int,    pages_written           )
                __field(        long,   pages_skipped           )
-               __field(        char,   encountered_congestion  )
                __field(        char,   more_io                 )       
                __field(        char,   no_nrwrite_index_update )
                __field(       pgoff_t, writeback_index         )
                __entry->ret            = ret;
                __entry->pages_written  = pages_written;
                __entry->pages_skipped  = wbc->pages_skipped;
-               __entry->encountered_congestion = wbc->encountered_congestion;
                __entry->more_io        = wbc->more_io;
                __entry->no_nrwrite_index_update = wbc->no_nrwrite_index_update;
                __entry->writeback_index = inode->i_mapping->writeback_index;
        ),
  
-       TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld congestion %d more_io %d no_nrwrite_index_update %d writeback_index %lu",
+       TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld more_io %d no_nrwrite_index_update %d writeback_index %lu",
                  jbd2_dev_to_name(__entry->dev),
                  (unsigned long) __entry->ino, __entry->ret,
                  __entry->pages_written, __entry->pages_skipped,
-                 __entry->encountered_congestion, __entry->more_io,
+                 __entry->more_io,
                  __entry->no_nrwrite_index_update,
                  (unsigned long) __entry->writeback_index)
  );
  
 -TRACE_EVENT(ext4_da_write_begin,
 -      TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
 -                      unsigned int flags),
 -
 -      TP_ARGS(inode, pos, len, flags),
 -
 -      TP_STRUCT__entry(
 -              __field(        dev_t,  dev                     )
 -              __field(        ino_t,  ino                     )
 -              __field(        loff_t, pos                     )
 -              __field(        unsigned int, len               )
 -              __field(        unsigned int, flags             )
 -      ),
 -
 -      TP_fast_assign(
 -              __entry->dev    = inode->i_sb->s_dev;
 -              __entry->ino    = inode->i_ino;
 -              __entry->pos    = pos;
 -              __entry->len    = len;
 -              __entry->flags  = flags;
 -      ),
 -
 -      TP_printk("dev %s ino %lu pos %llu len %u flags %u",
 -                jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
 -                __entry->pos, __entry->len, __entry->flags)
 -);
 -
 -TRACE_EVENT(ext4_da_write_end,
 -      TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
 -                      unsigned int copied),
 -
 -      TP_ARGS(inode, pos, len, copied),
 -
 -      TP_STRUCT__entry(
 -              __field(        dev_t,  dev                     )
 -              __field(        ino_t,  ino                     )
 -              __field(        loff_t, pos                     )
 -              __field(        unsigned int, len               )
 -              __field(        unsigned int, copied            )
 -      ),
 -
 -      TP_fast_assign(
 -              __entry->dev    = inode->i_sb->s_dev;
 -              __entry->ino    = inode->i_ino;
 -              __entry->pos    = pos;
 -              __entry->len    = len;
 -              __entry->copied = copied;
 -      ),
 -
 -      TP_printk("dev %s ino %lu pos %llu len %u copied %u",
 -                jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
 -                __entry->pos, __entry->len, __entry->copied)
 -);
 -
  TRACE_EVENT(ext4_discard_blocks,
        TP_PROTO(struct super_block *sb, unsigned long long blk,
                        unsigned long long count),
@@@ -591,30 -648,32 +589,32 @@@ TRACE_EVENT(ext4_allocate_blocks
  
  TRACE_EVENT(ext4_free_blocks,
        TP_PROTO(struct inode *inode, __u64 block, unsigned long count,
-                       int metadata),
+                int flags),
  
-       TP_ARGS(inode, block, count, metadata),
+       TP_ARGS(inode, block, count, flags),
  
        TP_STRUCT__entry(
                __field(        dev_t,  dev                     )
                __field(        ino_t,  ino                     )
+               __field(      umode_t, mode                     )
                __field(        __u64,  block                   )
                __field(        unsigned long,  count           )
-               __field(        int,    metadata                )
+               __field(         int,   flags                   )
        ),
  
        TP_fast_assign(
                __entry->dev            = inode->i_sb->s_dev;
                __entry->ino            = inode->i_ino;
+               __entry->mode           = inode->i_mode;
                __entry->block          = block;
                __entry->count          = count;
-               __entry->metadata       = metadata;
+               __entry->flags          = flags;
        ),
  
-       TP_printk("dev %s ino %lu block %llu count %lu metadata %d",
+       TP_printk("dev %s ino %lu mode 0%o block %llu count %lu flags %d",
                  jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
-                 __entry->block, __entry->count, __entry->metadata)
+                 __entry->mode, __entry->block, __entry->count,
+                 __entry->flags)
  );
  
  TRACE_EVENT(ext4_sync_file,
@@@ -848,6 -907,32 +848,32 @@@ TRACE_EVENT(ext4_mballoc_free
                  __entry->result_len, __entry->result_logical)
  );
  
+ TRACE_EVENT(ext4_forget,
+       TP_PROTO(struct inode *inode, int is_metadata, __u64 block),
+       TP_ARGS(inode, is_metadata, block),
+       TP_STRUCT__entry(
+               __field(        dev_t,  dev                     )
+               __field(        ino_t,  ino                     )
+               __field(        umode_t, mode                   )
+               __field(        int,    is_metadata             )
+               __field(        __u64,  block                   )
+       ),
+       TP_fast_assign(
+               __entry->dev    = inode->i_sb->s_dev;
+               __entry->ino    = inode->i_ino;
+               __entry->mode   = inode->i_mode;
+               __entry->is_metadata = is_metadata;
+               __entry->block  = block;
+       ),
+       TP_printk("dev %s ino %lu mode 0%o is_metadata %d block %llu",
+                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
+                 __entry->mode, __entry->is_metadata, __entry->block)
+ );
  #endif /* _TRACE_EXT4_H */
  
  /* This part must be outside protection */