Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

author Linus Torvalds <torvalds@linux-foundation.org>

Thu, 10 Dec 2009 17:33:29 +0000 (09:33 -0800)

committer Linus Torvalds <torvalds@linux-foundation.org>

Thu, 10 Dec 2009 17:33:29 +0000 (09:33 -0800)
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 10 Dec 2009 17:33:29 +0000 (09:33 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 10 Dec 2009 17:33:29 +0000 (09:33 -0800)
diff --combined fs/ext4/inode.c

index 4e8e2f1,f1bc1e3..5352db1
--- 1/fs/ext4/inode.c
--- 2/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@@ -71,58 -71,6 +71,6 @@@ static int ext4_inode_is_fast_symlink(s
   }
   
   /*
-  * The ext4 forget function must perform a revoke if we are freeing data
-  * which has been journaled.  Metadata (eg. indirect blocks) must be
-  * revoked in all cases.
-  *
-  * "bh" may be NULL: a metadata block may have been freed from memory
-  * but there may still be a record of it in the journal, and that record
-  * still needs to be revoked.
-  *
-  * If the handle isn't valid we're not journaling, but we still need to
-  * call into ext4_journal_revoke() to put the buffer head.
-  */
- int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
-               struct buffer_head *bh, ext4_fsblk_t blocknr)
- {
-       int err;
- 
-       might_sleep();
- 
-       BUFFER_TRACE(bh, "enter");
- 
-       jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
-                 "data mode %x\n",
-                 bh, is_metadata, inode->i_mode,
-                 test_opt(inode->i_sb, DATA_FLAGS));
- 
-       /* Never use the revoke function if we are doing full data
-        * journaling: there is no need to, and a V1 superblock won't
-        * support it.  Otherwise, only skip the revoke on un-journaled
-        * data blocks. */
- 
-       if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
-           (!is_metadata && !ext4_should_journal_data(inode))) {
-               if (bh) {
-                       BUFFER_TRACE(bh, "call jbd2_journal_forget");
-                       return ext4_journal_forget(handle, bh);
-               }
-               return 0;
-       }
- 
-       /*
-        * data!=journal && (is_metadata || should_journal_data(inode))
-        */
-       BUFFER_TRACE(bh, "call ext4_journal_revoke");
-       err = ext4_journal_revoke(handle, blocknr, bh);
-       if (err)
-               ext4_abort(inode->i_sb, __func__,
-                          "error %d when attempting revoke", err);
-       BUFFER_TRACE(bh, "exit");
-       return err;
- }
- 
- /*
    * Work out how many blocks we need to proceed with the next chunk of a
    * truncate transaction.
    */
@@@ -721,7 -669,7 +669,7 @@@ allocated
         return ret;
   failed_out:
         for (i = 0; i < index; i++)
-               ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
+               ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0);
         return ret;
   }
   
@@@ -817,14 -765,20 +765,20 @@@ static int ext4_alloc_branch(handle_t *
         return err;
   failed:
         /* Allocation failed, free what we already allocated */
+       ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0);
         for (i = 1; i <= n ; i++) {
-               BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget");
-               ext4_journal_forget(handle, branch[i].bh);
+               /* 
+                * branch[i].bh is newly allocated, so there is no
+                * need to revoke the block, which is why we don't
+                * need to set EXT4_FREE_BLOCKS_METADATA.
+                */
+               ext4_free_blocks(handle, inode, 0, new_blocks[i], 1,
+                                EXT4_FREE_BLOCKS_FORGET);
         }
-       for (i = 0; i < indirect_blks; i++)
-               ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
+       for (i = n+1; i < indirect_blks; i++)
+               ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0);
   
-       ext4_free_blocks(handle, inode, new_blocks[i], num, 0);
+       ext4_free_blocks(handle, inode, 0, new_blocks[i], num, 0);
   
         return err;
   }
@@@ -903,12 -857,16 +857,16 @@@ static int ext4_splice_branch(handle_t 
   
   err_out:
         for (i = 1; i <= num; i++) {
-               BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget");
-               ext4_journal_forget(handle, where[i].bh);
-               ext4_free_blocks(handle, inode,
-                                       le32_to_cpu(where[i-1].key), 1, 0);
+               /* 
+                * where[i].bh is newly allocated, so there is no
+                * need to revoke the block, which is why we don't
+                * need to set EXT4_FREE_BLOCKS_METADATA.
+                */
+               ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
+                                EXT4_FREE_BLOCKS_FORGET);
         }
-       ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0);
+       ext4_free_blocks(handle, inode, 0, le32_to_cpu(where[num].key),
+                        blks, 0);
   
         return err;
   }
@@@ -1021,10 -979,12 +979,12 @@@ static int ext4_ind_get_blocks(handle_
         if (!err)
                 err = ext4_splice_branch(handle, inode, iblock,
                                          partial, indirect_blks, count);
-       else
+       if (err)
                 goto cleanup;
   
         set_buffer_new(bh_result);
+ 
+       ext4_update_inode_fsync_trans(handle, inode, 1);
   got_it:
         map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
         if (count > blocks_to_boundary)
@@@ -1052,7 -1012,7 +1012,7 @@@ qsize_t ext4_get_reserved_space(struct 
                 EXT4_I(inode)->i_reserved_meta_blocks;
         spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
   
-       return total;
+       return (total << inode->i_blkbits);
   }
   /*
    * Calculate the number of metadata blocks need to reserve
@@@ -1534,6 -1494,16 +1494,16 @@@ static int do_journal_get_write_access(
         return ext4_journal_get_write_access(handle, bh);
   }
   
+ /*
+  * Truncate blocks that were not used by write. We have to truncate the
+  * pagecache as well so that corresponding buffers get properly unmapped.
+  */
+ static void ext4_truncate_failed_write(struct inode *inode)
+ {
+       truncate_inode_pages(inode->i_mapping, inode->i_size);
+       ext4_truncate(inode);
+ }
+ 
   static int ext4_write_begin(struct file *file, struct address_space *mapping,
                             loff_t pos, unsigned len, unsigned flags,
                             struct page **pagep, void **fsdata)
@@@ -1599,7 -1569,7 +1569,7 @@@ retry
   
                 ext4_journal_stop(handle);
                 if (pos + len > inode->i_size) {
-                       ext4_truncate(inode);
+                       ext4_truncate_failed_write(inode);
                         /*
                          * If truncate failed early the inode might
                          * still be on the orphan list; we need to
@@@ -1709,7 -1679,7 +1679,7 @@@ static int ext4_ordered_write_end(struc
                 ret = ret2;
   
         if (pos + len > inode->i_size) {
-               ext4_truncate(inode);
+               ext4_truncate_failed_write(inode);
                 /*
                  * If truncate failed early the inode might still be
                  * on the orphan list; we need to make sure the inode
@@@ -1751,7 -1721,7 +1721,7 @@@ static int ext4_writeback_write_end(str
                 ret = ret2;
   
         if (pos + len > inode->i_size) {
-               ext4_truncate(inode);
+               ext4_truncate_failed_write(inode);
                 /*
                  * If truncate failed early the inode might still be
                  * on the orphan list; we need to make sure the inode
@@@ -1814,7 -1784,7 +1784,7 @@@ static int ext4_journalled_write_end(st
         if (!ret)
                 ret = ret2;
         if (pos + len > inode->i_size) {
-               ext4_truncate(inode);
+               ext4_truncate_failed_write(inode);
                 /*
                  * If truncate failed early the inode might still be
                  * on the orphan list; we need to make sure the inode
@@@ -2600,7 -2570,6 +2570,6 @@@ static int bput_one(handle_t *handle, s
   }
   
   static int __ext4_journalled_writepage(struct page *page,
-                                      struct writeback_control *wbc,
                                        unsigned int len)
   {
         struct address_space *mapping = page->mapping;
@@@ -2758,7 -2727,7 +2727,7 @@@ static int ext4_writepage(struct page *
                  * doesn't seem much point in redirtying the page here.
                  */
                 ClearPageChecked(page);
-               return __ext4_journalled_writepage(page, wbc, len);
+               return __ext4_journalled_writepage(page, len);
         }
   
         if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
@@@ -2788,7 -2757,7 +2757,7 @@@ static int ext4_da_writepages_trans_blo
          * number of contiguous block. So we will limit
          * number of contiguous block to a sane value
          */
-       if (!(inode->i_flags & EXT4_EXTENTS_FL) &&
+       if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) &&
             (max_blocks > EXT4_MAX_TRANS_DATA))
                 max_blocks = EXT4_MAX_TRANS_DATA;
   
@@@ -2933,7 -2902,7 +2902,7 @@@ retry
                 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage,
                                         &mpd);
                 /*
- -               * If we have a contigous extent of pages and we
+ +               * If we have a contiguous extent of pages and we
                  * haven't done the I/O yet, map the blocks and submit
                  * them for I/O.
                  */
@@@ -3091,7 -3060,7 +3060,7 @@@ retry
                  * i_size_read because we hold i_mutex.
                  */
                 if (pos + len > inode->i_size)
-                       ext4_truncate(inode);
+                       ext4_truncate_failed_write(inode);
         }
   
         if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@@ -4064,7 -4033,7 +4033,7 @@@ static Indirect *ext4_find_shared(struc
         int k, err;
   
         *top = 0;
- -      /* Make k index the deepest non-null offest + 1 */
+ +      /* Make k index the deepest non-null offset + 1 */
         for (k = depth; k > 1 && !offsets[k-1]; k--)
                 ;
         partial = ext4_get_branch(inode, k, offsets, chain, &err);
@@@ -4120,6 -4089,11 +4089,11 @@@ static void ext4_clear_blocks(handle_t 
                               __le32 *last)
   {
         __le32 *p;
+       int     flags = EXT4_FREE_BLOCKS_FORGET;
+ 
+       if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+               flags |= EXT4_FREE_BLOCKS_METADATA;
+ 
         if (try_to_extend_transaction(handle, inode)) {
                 if (bh) {
                         BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
@@@ -4134,27 -4108,10 +4108,10 @@@
                 }
         }
   
-       /*
-        * Any buffers which are on the journal will be in memory. We
-        * find them on the hash table so jbd2_journal_revoke() will
-        * run jbd2_journal_forget() on them.  We've already detached
-        * each block from the file, so bforget() in
-        * jbd2_journal_forget() should be safe.
-        *
-        * AKPM: turn on bforget in jbd2_journal_forget()!!!
-        */
-       for (p = first; p < last; p++) {
-               u32 nr = le32_to_cpu(*p);
-               if (nr) {
-                       struct buffer_head *tbh;
+       for (p = first; p < last; p++)
+               *p = 0;
   
-                       *p = 0;
-                       tbh = sb_find_get_block(inode->i_sb, nr);
-                       ext4_forget(handle, 0, inode, tbh, nr);
-               }
-       }
- 
-       ext4_free_blocks(handle, inode, block_to_free, count, 0);
+       ext4_free_blocks(handle, inode, 0, block_to_free, count, flags);
   }
   
   /**
@@@ -4342,7 -4299,8 +4299,8 @@@ static void ext4_free_branches(handle_
                                             blocks_for_truncate(inode));
                         }
   
-                       ext4_free_blocks(handle, inode, nr, 1, 1);
+                       ext4_free_blocks(handle, inode, 0, nr, 1,
+                                        EXT4_FREE_BLOCKS_METADATA);
   
                         if (parent_bh) {
                                 /*
@@@ -4781,8 -4739,8 +4739,8 @@@ struct inode *ext4_iget(struct super_bl
         struct ext4_iloc iloc;
         struct ext4_inode *raw_inode;
         struct ext4_inode_info *ei;
-       struct buffer_head *bh;
         struct inode *inode;
+       journal_t *journal = EXT4_SB(sb)->s_journal;
         long ret;
         int block;
   
@@@ -4793,11 -4751,11 +4751,11 @@@
                 return inode;
   
         ei = EXT4_I(inode);
+       iloc.bh = 0;
   
         ret = __ext4_get_inode_loc(inode, &iloc, 0);
         if (ret < 0)
                 goto bad_inode;
-       bh = iloc.bh;
         raw_inode = ext4_raw_inode(&iloc);
         inode->i_mode = le16_to_cpu(raw_inode->i_mode);
         inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
@@@ -4820,7 -4778,6 +4778,6 @@@
                 if (inode->i_mode == 0 ||
                     !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
                         /* this inode is deleted */
-                       brelse(bh);
                         ret = -ESTALE;
                         goto bad_inode;
                 }
@@@ -4848,11 -4805,35 +4805,35 @@@
                 ei->i_data[block] = raw_inode->i_block[block];
         INIT_LIST_HEAD(&ei->i_orphan);
   
+       /*
+        * Set transaction id's of transactions that have to be committed
+        * to finish f[data]sync. We set them to currently running transaction
+        * as we cannot be sure that the inode or some of its metadata isn't
+        * part of the transaction - the inode could have been reclaimed and
+        * now it is reread from disk.
+        */
+       if (journal) {
+               transaction_t *transaction;
+               tid_t tid;
+ 
+               spin_lock(&journal->j_state_lock);
+               if (journal->j_running_transaction)
+                       transaction = journal->j_running_transaction;
+               else
+                       transaction = journal->j_committing_transaction;
+               if (transaction)
+                       tid = transaction->t_tid;
+               else
+                       tid = journal->j_commit_sequence;
+               spin_unlock(&journal->j_state_lock);
+               ei->i_sync_tid = tid;
+               ei->i_datasync_tid = tid;
+       }
+ 
         if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
                 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
                 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
                     EXT4_INODE_SIZE(inode->i_sb)) {
-                       brelse(bh);
                         ret = -EIO;
                         goto bad_inode;
                 }
@@@ -4884,10 -4865,7 +4865,7 @@@
   
         ret = 0;
         if (ei->i_file_acl &&
-           ((ei->i_file_acl <
-             (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
-              EXT4_SB(sb)->s_gdb_count)) ||
-            (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) {
+           !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
                 ext4_error(sb, __func__,
                            "bad extended attribute block %llu in inode #%lu",
                            ei->i_file_acl, inode->i_ino);
@@@ -4905,10 -4883,8 +4883,8 @@@
                 /* Validate block references which are part of inode */
                 ret = ext4_check_inode_blockref(inode);
         }
-       if (ret) {
-               brelse(bh);
+       if (ret)
                 goto bad_inode;
-       }
   
         if (S_ISREG(inode->i_mode)) {
                 inode->i_op = &ext4_file_inode_operations;
@@@ -4936,7 -4912,6 +4912,6 @@@
                         init_special_inode(inode, inode->i_mode,
                            new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
         } else {
-               brelse(bh);
                 ret = -EIO;
                 ext4_error(inode->i_sb, __func__,
                            "bogus i_mode (%o) for inode=%lu",
@@@ -4949,6 -4924,7 +4924,7 @@@
         return inode;
   
   bad_inode:
+       brelse(iloc.bh);
         iget_failed(inode);
         return ERR_PTR(ret);
   }
@@@ -5108,6 -5084,7 +5084,7 @@@ static int ext4_do_update_inode(handle_
                 err = rc;
         ei->i_state &= ~EXT4_STATE_NEW;
   
+       ext4_update_inode_fsync_trans(handle, inode, 0);
   out_brelse:
         brelse(bh);
         ext4_std_error(inode->i_sb, err);
@@@ -5227,8 -5204,8 +5204,8 @@@ int ext4_setattr(struct dentry *dentry
   
                 /* (user+group)*(old+new) structure, inode write (sb,
                  * inode block, ? - but truncate inode update has it) */
-               handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+
-                                       EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3);
+               handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
+                                       EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3);
                 if (IS_ERR(handle)) {
                         error = PTR_ERR(handle);
                         goto err_out;
@@@ -5376,7 -5353,7 +5353,7 @@@ static int ext4_index_trans_blocks(stru
    * worse case, the indexs blocks spread over different block groups
    *
    * If datablocks are discontiguous, they are possible to spread over
- - * different block groups too. If they are contiugous, with flexbg,
+ + * different block groups too. If they are contiguous, with flexbg,
    * they could still across block group boundary.
    *
    * Also account for superblock, inode, quota and xattr blocks
@@@ -5452,7 -5429,7 +5429,7 @@@ int ext4_writepage_trans_blocks(struct 
    * Calculate the journal credits for a chunk of data modification.
    *
    * This is called from DIO, fallocate or whoever calling
- - * ext4_get_blocks() to map/allocate a chunk of contigous disk blocks.
+ + * ext4_get_blocks() to map/allocate a chunk of contiguous disk blocks.
    *
    * journal buffers for data blocks are not included here, as DIO
    * and fallocate do no need to journal data buffers.
diff --combined fs/ext4/mballoc.c

index 74e495d,19635c3..c1e19d5
--- 1/fs/ext4/mballoc.c
--- 2/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@@ -142,7 -142,7 +142,7 @@@
    * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
    * value of s_mb_order2_reqs can be tuned via
    * /sys/fs/ext4/<partition>/mb_order2_req.  If the request len is equal to
- - * stripe size (sbi->s_stripe), we try to search for contigous block in
+ + * stripe size (sbi->s_stripe), we try to search for contiguous block in
    * stripe size. This should result in better allocation on RAID setups. If
    * not, we search in the specific group using bitmap for best extents. The
    * tunable min_to_scan and max_to_scan control the behaviour here.
@@@ -2529,7 -2529,6 +2529,6 @@@ static void release_blocks_on_commit(jo
         struct ext4_group_info *db;
         int err, count = 0, count2 = 0;
         struct ext4_free_data *entry;
-       ext4_fsblk_t discard_block;
         struct list_head *l, *ltmp;
   
         list_for_each_safe(l, ltmp, &txn->t_private_list) {
@@@ -2559,13 -2558,19 +2558,19 @@@
                         page_cache_release(e4b.bd_bitmap_page);
                 }
                 ext4_unlock_group(sb, entry->group);
-               discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb)
-                       + entry->start_blk
-                       + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
-               trace_ext4_discard_blocks(sb, (unsigned long long)discard_block,
-                                         entry->count);
-               sb_issue_discard(sb, discard_block, entry->count);
- 
+               if (test_opt(sb, DISCARD)) {
+                       ext4_fsblk_t discard_block;
+                       struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+ 
+                       discard_block = (ext4_fsblk_t)entry->group *
+                                               EXT4_BLOCKS_PER_GROUP(sb)
+                                       + entry->start_blk
+                                       + le32_to_cpu(es->s_first_data_block);
+                       trace_ext4_discard_blocks(sb,
+                                       (unsigned long long)discard_block,
+                                       entry->count);
+                       sb_issue_discard(sb, discard_block, entry->count);
+               }
                 kmem_cache_free(ext4_free_ext_cachep, entry);
                 ext4_mb_release_desc(&e4b);
         }
@@@ -3006,6 -3011,24 +3011,24 @@@ static void ext4_mb_collect_stats(struc
   }
   
   /*
+  * Called on failure; free up any blocks from the inode PA for this
+  * context.  We don't need this for MB_GROUP_PA because we only change
+  * pa_free in ext4_mb_release_context(), but on failure, we've already
+  * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed.
+  */
+ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
+ {
+       struct ext4_prealloc_space *pa = ac->ac_pa;
+       int len;
+ 
+       if (pa && pa->pa_type == MB_INODE_PA) {
+               len = ac->ac_b_ex.fe_len;
+               pa->pa_free += len;
+       }
+ 
+ }
+ 
+ /*
    * use blocks preallocated to inode
    */
   static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
@@@ -4290,6 -4313,7 +4313,7 @@@ repeat
                         ac->ac_status = AC_STATUS_CONTINUE;
                         goto repeat;
                 } else if (*errp) {
+                       ext4_discard_allocated_blocks(ac);
                         ac->ac_b_ex.fe_len = 0;
                         ar->len = 0;
                         ext4_mb_show_ac(ac);
@@@ -4422,18 -4446,24 +4446,24 @@@ ext4_mb_free_metadata(handle_t *handle
         return 0;
   }
   
- /*
-  * Main entry point into mballoc to free blocks
+ /**
+  * ext4_free_blocks() -- Free given blocks and update quota
+  * @handle:           handle for this transaction
+  * @inode:            inode
+  * @block:            start physical block to free (if 0, taken from @bh)
+  * @count:            number of blocks to free
+  * @flags:            EXT4_FREE_BLOCKS_* flags (e.g. FORGET, METADATA)
    */
- void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
-                       ext4_fsblk_t block, unsigned long count,
-                       int metadata, unsigned long *freed)
+ void ext4_free_blocks(handle_t *handle, struct inode *inode,
+                     struct buffer_head *bh, ext4_fsblk_t block,
+                     unsigned long count, int flags)
   {
         struct buffer_head *bitmap_bh = NULL;
         struct super_block *sb = inode->i_sb;
         struct ext4_allocation_context *ac = NULL;
         struct ext4_group_desc *gdp;
         struct ext4_super_block *es;
+       unsigned long freed = 0;
         unsigned int overflow;
         ext4_grpblk_t bit;
         struct buffer_head *gd_bh;
@@@ -4443,13 -4473,16 +4473,16 @@@
         int err = 0;
         int ret;
   
-       *freed = 0;
+       if (bh) {
+               if (block)
+                       BUG_ON(block != bh->b_blocknr);
+               else
+                       block = bh->b_blocknr;
+       }
   
         sbi = EXT4_SB(sb);
         es = EXT4_SB(sb)->s_es;
-       if (block < le32_to_cpu(es->s_first_data_block) ||
-           block + count < block ||
-           block + count > ext4_blocks_count(es)) {
+       if (!ext4_data_block_valid(sbi, block, count)) {
                 ext4_error(sb, __func__,
                             "Freeing blocks not in datazone - "
                             "block = %llu, count = %lu", block, count);
@@@ -4457,7 -4490,32 +4490,32 @@@
         }
   
         ext4_debug("freeing block %llu\n", block);
-       trace_ext4_free_blocks(inode, block, count, metadata);
+       trace_ext4_free_blocks(inode, block, count, flags);
+ 
+       if (flags & EXT4_FREE_BLOCKS_FORGET) {
+               struct buffer_head *tbh = bh;
+               int i;
+ 
+               BUG_ON(bh && (count > 1));
+ 
+               for (i = 0; i < count; i++) {
+                       if (!bh)
+                               tbh = sb_find_get_block(inode->i_sb,
+                                                       block + i);
+                       ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, 
+                                   inode, tbh, block + i);
+               }
+       }
+ 
+       /* 
+        * We need to make sure we don't reuse the freed block until
+        * after the transaction is committed, which we can do by
+        * treating the block as metadata, below.  We make an
+        * exception if the inode is to be written in writeback mode
+        * since writeback mode has weak data consistency guarantees.
+        */
+       if (!ext4_should_writeback_data(inode))
+               flags |= EXT4_FREE_BLOCKS_METADATA;
   
         ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
         if (ac) {
@@@ -4533,7 -4591,8 +4591,8 @@@ do_more
         err = ext4_mb_load_buddy(sb, block_group, &e4b);
         if (err)
                 goto error_return;
-       if (metadata && ext4_handle_valid(handle)) {
+ 
+       if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) {
                 struct ext4_free_data *new_entry;
                 /*
                  * blocks being freed are metadata. these blocks shouldn't
@@@ -4572,7 -4631,7 +4631,7 @@@
   
         ext4_mb_release_desc(&e4b);
   
-       *freed += count;
+       freed += count;
   
         /* We dirtied the bitmap block */
         BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
@@@ -4592,6 -4651,8 +4651,8 @@@
         }
         sb->s_dirt = 1;
   error_return:
+       if (freed)
+               vfs_dq_free_block(inode, freed);
         brelse(bitmap_bh);
         ext4_std_error(sb, err);
         if (ac)
diff --combined include/trace/events/ext4.h

index 318f765,f4c62d3..d0b6cd3
--- 1/include/trace/events/ext4.h
--- 2/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@@ -38,7 -38,7 +38,7 @@@ TRACE_EVENT(ext4_free_inode
                 __entry->blocks = inode->i_blocks;
         ),
   
-       TP_printk("dev %s ino %lu mode %d uid %u gid %u blocks %llu",
+       TP_printk("dev %s ino %lu mode 0%o uid %u gid %u blocks %llu",
                   jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
                   __entry->mode, __entry->uid, __entry->gid,
                   (unsigned long long) __entry->blocks)
@@@ -61,7 -61,7 +61,7 @@@ TRACE_EVENT(ext4_request_inode
                 __entry->mode   = mode;
         ),
   
-       TP_printk("dev %s dir %lu mode %d",
+       TP_printk("dev %s dir %lu mode 0%o",
                   jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->dir,
                   __entry->mode)
   );
@@@ -85,12 -85,12 +85,12 @@@ TRACE_EVENT(ext4_allocate_inode
                 __entry->mode   = mode;
         ),
   
-       TP_printk("dev %s ino %lu dir %lu mode %d",
+       TP_printk("dev %s ino %lu dir %lu mode 0%o",
                   jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
                   (unsigned long) __entry->dir, __entry->mode)
   );
   
- -TRACE_EVENT(ext4_write_begin,
+ +DECLARE_EVENT_CLASS(ext4__write_begin,
   
         TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
                  unsigned int flags),
@@@ -118,23 -118,7 +118,23 @@@
                   __entry->pos, __entry->len, __entry->flags)
   );
   
- -TRACE_EVENT(ext4_ordered_write_end,
+ +DEFINE_EVENT(ext4__write_begin, ext4_write_begin,
+ +
+ +      TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
+ +               unsigned int flags),
+ +
+ +      TP_ARGS(inode, pos, len, flags)
+ +);
+ +
+ +DEFINE_EVENT(ext4__write_begin, ext4_da_write_begin,
+ +
+ +      TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
+ +               unsigned int flags),
+ +
+ +      TP_ARGS(inode, pos, len, flags)
+ +);
+ +
+ +DECLARE_EVENT_CLASS(ext4__write_end,
         TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
                         unsigned int copied),
   
@@@ -161,36 -145,57 +161,36 @@@
                   __entry->pos, __entry->len, __entry->copied)
   );
   
- -TRACE_EVENT(ext4_writeback_write_end,
+ +DEFINE_EVENT(ext4__write_end, ext4_ordered_write_end,
+ +
         TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
                  unsigned int copied),
   
- -      TP_ARGS(inode, pos, len, copied),
+ +      TP_ARGS(inode, pos, len, copied)
+ +);
   
- -      TP_STRUCT__entry(
- -              __field(        dev_t,  dev                     )
- -              __field(        ino_t,  ino                     )
- -              __field(        loff_t, pos                     )
- -              __field(        unsigned int, len               )
- -              __field(        unsigned int, copied            )
- -      ),
+ +DEFINE_EVENT(ext4__write_end, ext4_writeback_write_end,
   
- -      TP_fast_assign(
- -              __entry->dev    = inode->i_sb->s_dev;
- -              __entry->ino    = inode->i_ino;
- -              __entry->pos    = pos;
- -              __entry->len    = len;
- -              __entry->copied = copied;
- -      ),
+ +      TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
+ +               unsigned int copied),
   
- -      TP_printk("dev %s ino %lu pos %llu len %u copied %u",
- -                jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
- -                __entry->pos, __entry->len, __entry->copied)
+ +      TP_ARGS(inode, pos, len, copied)
   );
   
- -TRACE_EVENT(ext4_journalled_write_end,
+ +DEFINE_EVENT(ext4__write_end, ext4_journalled_write_end,
+ +
         TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
                  unsigned int copied),
- -      TP_ARGS(inode, pos, len, copied),
   
- -      TP_STRUCT__entry(
- -              __field(        dev_t,  dev                     )
- -              __field(        ino_t,  ino                     )
- -              __field(        loff_t, pos                     )
- -              __field(        unsigned int, len               )
- -              __field(        unsigned int, copied            )
- -      ),
+ +      TP_ARGS(inode, pos, len, copied)
+ +);
   
- -      TP_fast_assign(
- -              __entry->dev    = inode->i_sb->s_dev;
- -              __entry->ino    = inode->i_ino;
- -              __entry->pos    = pos;
- -              __entry->len    = len;
- -              __entry->copied = copied;
- -      ),
+ +DEFINE_EVENT(ext4__write_end, ext4_da_write_end,
   
- -      TP_printk("dev %s ino %lu pos %llu len %u copied %u",
- -                jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
- -                __entry->pos, __entry->len, __entry->copied)
+ +      TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
+ +               unsigned int copied),
+ +
+ +      TP_ARGS(inode, pos, len, copied)
   );
   
   TRACE_EVENT(ext4_writepage,
@@@ -305,7 -310,6 +305,6 @@@ TRACE_EVENT(ext4_da_writepages_result
                 __field(        int,    ret                     )
                 __field(        int,    pages_written           )
                 __field(        long,   pages_skipped           )
-               __field(        char,   encountered_congestion  )
                 __field(        char,   more_io                 )       
                 __field(        char,   no_nrwrite_index_update )
                 __field(       pgoff_t, writeback_index         )
@@@ -317,21 -321,74 +316,20 @@@
                 __entry->ret            = ret;
                 __entry->pages_written  = pages_written;
                 __entry->pages_skipped  = wbc->pages_skipped;
-               __entry->encountered_congestion = wbc->encountered_congestion;
                 __entry->more_io        = wbc->more_io;
                 __entry->no_nrwrite_index_update = wbc->no_nrwrite_index_update;
                 __entry->writeback_index = inode->i_mapping->writeback_index;
         ),
   
-       TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld congestion %d more_io %d no_nrwrite_index_update %d writeback_index %lu",
+       TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld more_io %d no_nrwrite_index_update %d writeback_index %lu",
                   jbd2_dev_to_name(__entry->dev),
                   (unsigned long) __entry->ino, __entry->ret,
                   __entry->pages_written, __entry->pages_skipped,
-                 __entry->encountered_congestion, __entry->more_io,
+                 __entry->more_io,
                   __entry->no_nrwrite_index_update,
                   (unsigned long) __entry->writeback_index)
   );
   
- -TRACE_EVENT(ext4_da_write_begin,
- -      TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
- -                      unsigned int flags),
- -
- -      TP_ARGS(inode, pos, len, flags),
- -
- -      TP_STRUCT__entry(
- -              __field(        dev_t,  dev                     )
- -              __field(        ino_t,  ino                     )
- -              __field(        loff_t, pos                     )
- -              __field(        unsigned int, len               )
- -              __field(        unsigned int, flags             )
- -      ),
- -
- -      TP_fast_assign(
- -              __entry->dev    = inode->i_sb->s_dev;
- -              __entry->ino    = inode->i_ino;
- -              __entry->pos    = pos;
- -              __entry->len    = len;
- -              __entry->flags  = flags;
- -      ),
- -
- -      TP_printk("dev %s ino %lu pos %llu len %u flags %u",
- -                jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
- -                __entry->pos, __entry->len, __entry->flags)
- -);
- -
- -TRACE_EVENT(ext4_da_write_end,
- -      TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
- -                      unsigned int copied),
- -
- -      TP_ARGS(inode, pos, len, copied),
- -
- -      TP_STRUCT__entry(
- -              __field(        dev_t,  dev                     )
- -              __field(        ino_t,  ino                     )
- -              __field(        loff_t, pos                     )
- -              __field(        unsigned int, len               )
- -              __field(        unsigned int, copied            )
- -      ),
- -
- -      TP_fast_assign(
- -              __entry->dev    = inode->i_sb->s_dev;
- -              __entry->ino    = inode->i_ino;
- -              __entry->pos    = pos;
- -              __entry->len    = len;
- -              __entry->copied = copied;
- -      ),
- -
- -      TP_printk("dev %s ino %lu pos %llu len %u copied %u",
- -                jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
- -                __entry->pos, __entry->len, __entry->copied)
- -);
- -
   TRACE_EVENT(ext4_discard_blocks,
         TP_PROTO(struct super_block *sb, unsigned long long blk,
                         unsigned long long count),
@@@ -591,30 -648,32 +589,32 @@@ TRACE_EVENT(ext4_allocate_blocks
   
   TRACE_EVENT(ext4_free_blocks,
         TP_PROTO(struct inode *inode, __u64 block, unsigned long count,
-                       int metadata),
+                int flags),
   
-       TP_ARGS(inode, block, count, metadata),
+       TP_ARGS(inode, block, count, flags),
   
         TP_STRUCT__entry(
                 __field(        dev_t,  dev                     )
                 __field(        ino_t,  ino                     )
+               __field(      umode_t, mode                     )
                 __field(        __u64,  block                   )
                 __field(        unsigned long,  count           )
-               __field(        int,    metadata                )
- 
+               __field(         int,   flags                   )
         ),
   
         TP_fast_assign(
                 __entry->dev            = inode->i_sb->s_dev;
                 __entry->ino            = inode->i_ino;
+               __entry->mode           = inode->i_mode;
                 __entry->block          = block;
                 __entry->count          = count;
-               __entry->metadata       = metadata;
+               __entry->flags          = flags;
         ),
   
-       TP_printk("dev %s ino %lu block %llu count %lu metadata %d",
+       TP_printk("dev %s ino %lu mode 0%o block %llu count %lu flags %d",
                   jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
-                 __entry->block, __entry->count, __entry->metadata)
+                 __entry->mode, __entry->block, __entry->count,
+                 __entry->flags)
   );
   
   TRACE_EVENT(ext4_sync_file,
@@@ -848,6 -907,32 +848,32 @@@ TRACE_EVENT(ext4_mballoc_free
                   __entry->result_len, __entry->result_logical)
   );
   
+ TRACE_EVENT(ext4_forget,
+       TP_PROTO(struct inode *inode, int is_metadata, __u64 block),
+ 
+       TP_ARGS(inode, is_metadata, block),
+ 
+       TP_STRUCT__entry(
+               __field(        dev_t,  dev                     )
+               __field(        ino_t,  ino                     )
+               __field(        umode_t, mode                   )
+               __field(        int,    is_metadata             )
+               __field(        __u64,  block                   )
+       ),
+ 
+       TP_fast_assign(
+               __entry->dev    = inode->i_sb->s_dev;
+               __entry->ino    = inode->i_ino;
+               __entry->mode   = inode->i_mode;
+               __entry->is_metadata = is_metadata;
+               __entry->block  = block;
+       ),
+ 
+       TP_printk("dev %s ino %lu mode 0%o is_metadata %d block %llu",
+                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
+                 __entry->mode, __entry->is_metadata, __entry->block)
+ );
+ 
   #endif /* _TRACE_EXT4_H */
   
   /* This part must be outside protection */
author	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 10 Dec 2009 17:33:29 +0000 (09:33 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 10 Dec 2009 17:33:29 +0000 (09:33 -0800)
		1	2
fs/ext4/inode.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/ext4/mballoc.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/trace/events/ext4.h	patch \|	diff1 \|	diff2 \|	blob \| history