Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso...

author Linus Torvalds <torvalds@linux-foundation.org>

Sun, 7 Oct 2012 21:36:39 +0000 (06:36 +0900)

committer Linus Torvalds <torvalds@linux-foundation.org>

Sun, 7 Oct 2012 21:36:39 +0000 (06:36 +0900)
author Linus Torvalds <torvalds@linux-foundation.org>
Sun, 7 Oct 2012 21:36:39 +0000 (06:36 +0900)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sun, 7 Oct 2012 21:36:39 +0000 (06:36 +0900)
diff --git a/Documentation/ABI/testing/sysfs-fs-ext4 b/Documentation/ABI/testing/sysfs-fs-ext4

index f22ac08..c631253 100644 (file)
--- a/Documentation/ABI/testing/sysfs-fs-ext4
+++ b/Documentation/ABI/testing/sysfs-fs-ext4
@@ -96,3 +96,16 @@ Contact:     "Theodore Ts'o" <tytso@mit.edu>
  Description:
                 The maximum number of megabytes the writeback code will
                 try to write out before move on to another inode.
+
+What:          /sys/fs/ext4/<disk>/extent_max_zeroout_kb
+Date:          August 2012
+Contact:       "Theodore Ts'o" <tytso@mit.edu>
+Description:
+               The maximum number of kilobytes which will be zeroed
+               out in preference to creating a new uninitialized
+               extent when manipulating an inode's extent tree.  Note
+               that using a larger value will increase the
+               variability of time necessary to complete a random
+               write operation (since a 4k random write might turn
+               into a much larger write due to the zeroout
+               operation).
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt

index 1b7f9ac..104322b 100644 (file)
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -375,6 +375,16 @@ dioread_nolock             locking. If the dioread_nolock option is specified
                         Because of the restrictions this options comprises
                         it is off by default (e.g. dioread_lock).
  
+max_dir_size_kb=n      This limits the size of directories so that any
+                       attempt to expand them beyond the specified
+                       limit in kilobytes will cause an ENOSPC error.
+                       This is useful in memory constrained
+                       environments, where a very large directory can
+                       cause severe performance problems or even
+                       provoke the Out Of Memory killer.  (For example,
+                       if there is only 512MB of memory available, a 176MB
+                       directory may seriously cramp the system's style.)
+
  i_version              Enable 64-bit inode version support. This option is
                         off by default.
  
diff --git a/fs/buffer.c b/fs/buffer.c

index 58e2e7b..b5f0442 100644 (file)
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2312,12 +2312,6 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
         loff_t size;
         int ret;
  
-       /*
-        * Update file times before taking page lock. We may end up failing the
-        * fault so this update may be superfluous but who really cares...
-        */
-       file_update_time(vma->vm_file);
-
         lock_page(page);
         size = i_size_read(inode);
         if ((page->mapping != inode->i_mapping) ||
@@ -2355,6 +2349,13 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
         struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb;
  
         sb_start_pagefault(sb);
+
+       /*
+        * Update file times before taking page lock. We may end up failing the
+        * fault so this update may be superfluous but who really cares...
+        */
+       file_update_time(vma->vm_file);
+
         ret = __block_page_mkwrite(vma, vmf, get_block);
         sb_end_pagefault(sb);
         return block_page_mkwrite_return(ret);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h

index c3411d4..3ab2539 100644 (file)
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -186,7 +186,6 @@ struct mpage_da_data {
  #define EXT4_IO_END_ERROR      0x0002
  #define EXT4_IO_END_QUEUED     0x0004
  #define EXT4_IO_END_DIRECT     0x0008
-#define EXT4_IO_END_IN_FSYNC   0x0010
  
  struct ext4_io_page {
         struct page     *p_page;
@@ -912,9 +911,7 @@ struct ext4_inode_info {
         struct list_head i_completed_io_list;
         spinlock_t i_completed_io_lock;
         atomic_t i_ioend_count; /* Number of outstanding io_end structs */
-       /* current io_end structure for async DIO write*/
-       ext4_io_end_t *cur_aio_dio;
-       atomic_t i_aiodio_unwritten; /* Nr. of inflight conversions pending */
+       atomic_t i_unwritten; /* Nr. of inflight conversions pending */
  
         spinlock_t i_block_reservation_lock;
  
@@ -1233,6 +1230,7 @@ struct ext4_sb_info {
         spinlock_t s_md_lock;
         unsigned short *s_mb_offsets;
         unsigned int *s_mb_maxs;
+       unsigned int s_group_info_size;
  
         /* tunables */
         unsigned long s_stripe;
@@ -1243,6 +1241,7 @@ struct ext4_sb_info {
         unsigned int s_mb_order2_reqs;
         unsigned int s_mb_group_prealloc;
         unsigned int s_max_writeback_mb_bump;
+       unsigned int s_max_dir_size_kb;
         /* where last allocation was done - for stream allocation */
         unsigned long s_mb_last_group;
         unsigned long s_mb_last_start;
@@ -1270,8 +1269,12 @@ struct ext4_sb_info {
         unsigned long s_sectors_written_start;
         u64 s_kbytes_written;
  
+       /* maximum size of a zero-out chunk, in kilobytes */
+       unsigned int s_extent_max_zeroout_kb;
+
         unsigned int s_log_groups_per_flex;
         struct flex_groups *s_flex_groups;
+       ext4_group_t s_flex_groups_allocated;
  
         /* workqueue for dio unwritten */
         struct workqueue_struct *dio_unwritten_wq;
@@ -1328,10 +1331,20 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode,
  {
         if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
                 io_end->flag |= EXT4_IO_END_UNWRITTEN;
-               atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
+               atomic_inc(&EXT4_I(inode)->i_unwritten);
         }
  }
  
+static inline ext4_io_end_t *ext4_inode_aio(struct inode *inode)
+{
+       return inode->i_private;
+}
+
+static inline void ext4_inode_aio_set(struct inode *inode, ext4_io_end_t *io)
+{
+       inode->i_private = io;
+}
+
  /*
   * Inode dynamic state flags
   */
@@ -1345,6 +1358,8 @@ enum {
         EXT4_STATE_DIO_UNWRITTEN,       /* need convert on dio done*/
         EXT4_STATE_NEWENTRY,            /* File just added to dir */
         EXT4_STATE_DELALLOC_RESERVED,   /* blks already reserved for delalloc */
+       EXT4_STATE_DIOREAD_LOCK,        /* Disable support for dio read
+                                          nolocking */
  };
  
  #define EXT4_INODE_BIT_FNS(name, field, offset)                                \
@@ -1932,7 +1947,7 @@ extern void ext4_htree_free_dir_info(struct dir_private_info *p);
  
  /* fsync.c */
  extern int ext4_sync_file(struct file *, loff_t, loff_t, int);
-extern int ext4_flush_completed_IO(struct inode *);
+extern int ext4_flush_unwritten_io(struct inode *);
  
  /* hash.c */
  extern int ext4fs_dirhash(const char *name, int len, struct
@@ -1966,6 +1981,8 @@ extern void ext4_exit_mballoc(void);
  extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
                              struct buffer_head *bh, ext4_fsblk_t block,
                              unsigned long count, int flags);
+extern int ext4_mb_alloc_groupinfo(struct super_block *sb,
+                                  ext4_group_t ngroups);
  extern int ext4_mb_add_groupinfo(struct super_block *sb,
                 ext4_group_t i, struct ext4_group_desc *desc);
  extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
@@ -2051,6 +2068,8 @@ extern void ext4_superblock_csum_set(struct super_block *sb,
  extern void *ext4_kvmalloc(size_t size, gfp_t flags);
  extern void *ext4_kvzalloc(size_t size, gfp_t flags);
  extern void ext4_kvfree(void *ptr);
+extern int ext4_alloc_flex_bg_array(struct super_block *sb,
+                                   ext4_group_t ngroup);
  extern __printf(4, 5)
  void __ext4_error(struct super_block *, const char *, unsigned int,
                   const char *, ...);
@@ -2352,6 +2371,7 @@ extern const struct file_operations ext4_dir_operations;
  extern const struct inode_operations ext4_file_inode_operations;
  extern const struct file_operations ext4_file_operations;
  extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
+extern void ext4_unwritten_wait(struct inode *inode);
  
  /* namei.c */
  extern const struct inode_operations ext4_dir_inode_operations;
@@ -2400,11 +2420,11 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
  
  /* page-io.c */
  extern int __init ext4_init_pageio(void);
+extern void ext4_add_complete_io(ext4_io_end_t *io_end);
  extern void ext4_exit_pageio(void);
  extern void ext4_ioend_wait(struct inode *);
  extern void ext4_free_io_end(ext4_io_end_t *io);
  extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
-extern int ext4_end_io_nolock(ext4_io_end_t *io);
  extern void ext4_io_submit(struct ext4_io_submit *io);
  extern int ext4_bio_write_page(struct ext4_io_submit *io,
                                struct page *page,
@@ -2452,6 +2472,21 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
         set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
  }
  
+/*
+ * Disable DIO read nolock optimization, so new dioreaders will be forced
+ * to grab i_mutex
+ */
+static inline void ext4_inode_block_unlocked_dio(struct inode *inode)
+{
+       ext4_set_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
+       smp_mb();
+}
+static inline void ext4_inode_resume_unlocked_dio(struct inode *inode)
+{
+       smp_mb();
+       ext4_clear_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
+}
+
  #define in_range(b, first, len)        ((b) >= (first) && (b) <= (first) + (len) - 1)
  
  /* For ioend & aio unwritten conversion wait queues */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c

index aabbb3f..1c94cca 100644 (file)
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1177,7 +1177,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
                   le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
                   ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
  
-       neh->eh_depth = cpu_to_le16(le16_to_cpu(neh->eh_depth) + 1);
+       le16_add_cpu(&neh->eh_depth, 1);
         ext4_mark_inode_dirty(handle, inode);
  out:
         brelse(bh);
@@ -1656,16 +1656,60 @@ static int ext4_ext_try_to_merge_right(struct inode *inode,
  }
  
  /*
+ * This function does a very simple check to see if we can collapse
+ * an extent tree with a single extent tree leaf block into the inode.
+ */
+static void ext4_ext_try_to_merge_up(handle_t *handle,
+                                    struct inode *inode,
+                                    struct ext4_ext_path *path)
+{
+       size_t s;
+       unsigned max_root = ext4_ext_space_root(inode, 0);
+       ext4_fsblk_t blk;
+
+       if ((path[0].p_depth != 1) ||
+           (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) ||
+           (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root))
+               return;
+
+       /*
+        * We need to modify the block allocation bitmap and the block
+        * group descriptor to release the extent tree block.  If we
+        * can't get the journal credits, give up.
+        */
+       if (ext4_journal_extend(handle, 2))
+               return;
+
+       /*
+        * Copy the extent data up to the inode
+        */
+       blk = ext4_idx_pblock(path[0].p_idx);
+       s = le16_to_cpu(path[1].p_hdr->eh_entries) *
+               sizeof(struct ext4_extent_idx);
+       s += sizeof(struct ext4_extent_header);
+
+       memcpy(path[0].p_hdr, path[1].p_hdr, s);
+       path[0].p_depth = 0;
+       path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) +
+               (path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr));
+       path[0].p_hdr->eh_max = cpu_to_le16(max_root);
+
+       brelse(path[1].p_bh);
+       ext4_free_blocks(handle, inode, NULL, blk, 1,
+                        EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
+}
+
+/*
   * This function tries to merge the @ex extent to neighbours in the tree.
   * return 1 if merge left else 0.
   */
-static int ext4_ext_try_to_merge(struct inode *inode,
+static void ext4_ext_try_to_merge(handle_t *handle,
+                                 struct inode *inode,
                                   struct ext4_ext_path *path,
                                   struct ext4_extent *ex) {
         struct ext4_extent_header *eh;
         unsigned int depth;
         int merge_done = 0;
-       int ret = 0;
  
         depth = ext_depth(inode);
         BUG_ON(path[depth].p_hdr == NULL);
@@ -1675,9 +1719,9 @@ static int ext4_ext_try_to_merge(struct inode *inode,
                 merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
  
         if (!merge_done)
-               ret = ext4_ext_try_to_merge_right(inode, path, ex);
+               (void) ext4_ext_try_to_merge_right(inode, path, ex);
  
-       return ret;
+       ext4_ext_try_to_merge_up(handle, inode, path);
  }
  
  /*
@@ -1893,7 +1937,7 @@ has_space:
  merge:
         /* try to merge extents */
         if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
-               ext4_ext_try_to_merge(inode, path, nearex);
+               ext4_ext_try_to_merge(handle, inode, path, nearex);
  
  
         /* time to correct all indexes above */
@@ -1901,7 +1945,7 @@ merge:
         if (err)
                 goto cleanup;
  
-       err = ext4_ext_dirty(handle, inode, path + depth);
+       err = ext4_ext_dirty(handle, inode, path + path->p_depth);
  
  cleanup:
         if (npath) {
@@ -2092,13 +2136,10 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
  }
  
  /*
- * ext4_ext_check_cache()
+ * ext4_ext_in_cache()
   * Checks to see if the given block is in the cache.
   * If it is, the cached extent is stored in the given
- * cache extent pointer.  If the cached extent is a hole,
- * this routine should be used instead of
- * ext4_ext_in_cache if the calling function needs to
- * know the size of the hole.
+ * cache extent pointer.
   *
   * @inode: The files inode
   * @block: The block to look for in the cache
@@ -2107,8 +2148,10 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
   *
   * Return 0 if cache is invalid; 1 if the cache is valid
   */
-static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block,
-       struct ext4_ext_cache *ex){
+static int
+ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
+                 struct ext4_extent *ex)
+{
         struct ext4_ext_cache *cex;
         struct ext4_sb_info *sbi;
         int ret = 0;
@@ -2125,7 +2168,9 @@ static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block,
                 goto errout;
  
         if (in_range(block, cex->ec_block, cex->ec_len)) {
-               memcpy(ex, cex, sizeof(struct ext4_ext_cache));
+               ex->ee_block = cpu_to_le32(cex->ec_block);
+               ext4_ext_store_pblock(ex, cex->ec_start);
+               ex->ee_len = cpu_to_le16(cex->ec_len);
                 ext_debug("%u cached by %u:%u:%llu\n",
                                 block,
                                 cex->ec_block, cex->ec_len, cex->ec_start);
@@ -2138,37 +2183,6 @@ errout:
  }
  
  /*
- * ext4_ext_in_cache()
- * Checks to see if the given block is in the cache.
- * If it is, the cached extent is stored in the given
- * extent pointer.
- *
- * @inode: The files inode
- * @block: The block to look for in the cache
- * @ex:    Pointer where the cached extent will be stored
- *         if it contains block
- *
- * Return 0 if cache is invalid; 1 if the cache is valid
- */
-static int
-ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
-                       struct ext4_extent *ex)
-{
-       struct ext4_ext_cache cex;
-       int ret = 0;
-
-       if (ext4_ext_check_cache(inode, block, &cex)) {
-               ex->ee_block = cpu_to_le32(cex.ec_block);
-               ext4_ext_store_pblock(ex, cex.ec_start);
-               ex->ee_len = cpu_to_le16(cex.ec_len);
-               ret = 1;
-       }
-
-       return ret;
-}
-
-
-/*
   * ext4_ext_rm_idx:
   * removes index from the index block.
   */
@@ -2274,10 +2288,13 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
         unsigned short ee_len =  ext4_ext_get_actual_len(ex);
         ext4_fsblk_t pblk;
-       int flags = EXT4_FREE_BLOCKS_FORGET;
+       int flags = 0;
  
         if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-               flags |= EXT4_FREE_BLOCKS_METADATA;
+               flags |= EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
+       else if (ext4_should_journal_data(inode))
+               flags |= EXT4_FREE_BLOCKS_FORGET;
+
         /*
          * For bigalloc file systems, we never free a partial cluster
          * at the beginning of the extent.  Instead, we make a note
@@ -2572,7 +2589,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
         struct ext4_ext_path *path = NULL;
         ext4_fsblk_t partial_cluster = 0;
         handle_t *handle;
-       int i = 0, err;
+       int i = 0, err = 0;
  
         ext_debug("truncate since %u to %u\n", start, end);
  
@@ -2604,12 +2621,16 @@ again:
                         return PTR_ERR(path);
                 }
                 depth = ext_depth(inode);
+               /* Leaf may be absent only if the inode has no blocks at all */
                 ex = path[depth].p_ext;
                 if (!ex) {
-                       ext4_ext_drop_refs(path);
-                       kfree(path);
-                       path = NULL;
-                       goto cont;
+                       if (depth) {
+                               EXT4_ERROR_INODE(inode,
+                                                "path[%d].p_hdr == NULL",
+                                                depth);
+                               err = -EIO;
+                       }
+                       goto out;
                 }
  
                 ee_block = le32_to_cpu(ex->ee_block);
@@ -2641,8 +2662,6 @@ again:
                                 goto out;
                 }
         }
-cont:
-
         /*
          * We start scanning from right side, freeing all the blocks
          * after i_size and walking into the tree depth-wise.
@@ -2924,9 +2943,9 @@ static int ext4_split_extent_at(handle_t *handle,
                         ext4_ext_mark_initialized(ex);
  
                 if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
-                       ext4_ext_try_to_merge(inode, path, ex);
+                       ext4_ext_try_to_merge(handle, inode, path, ex);
  
-               err = ext4_ext_dirty(handle, inode, path + depth);
+               err = ext4_ext_dirty(handle, inode, path + path->p_depth);
                 goto out;
         }
  
@@ -2958,8 +2977,8 @@ static int ext4_split_extent_at(handle_t *handle,
                         goto fix_extent_len;
                 /* update the extent length and mark as initialized */
                 ex->ee_len = cpu_to_le16(ee_len);
-               ext4_ext_try_to_merge(inode, path, ex);
-               err = ext4_ext_dirty(handle, inode, path + depth);
+               ext4_ext_try_to_merge(handle, inode, path, ex);
+               err = ext4_ext_dirty(handle, inode, path + path->p_depth);
                 goto out;
         } else if (err)
                 goto fix_extent_len;
@@ -3041,7 +3060,6 @@ out:
         return err ? err : map->m_len;
  }
  
-#define EXT4_EXT_ZERO_LEN 7
  /*
   * This function is called by ext4_ext_map_blocks() if someone tries to write
   * to an uninitialized extent. It may result in splitting the uninitialized
@@ -3067,13 +3085,14 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                                            struct ext4_map_blocks *map,
                                            struct ext4_ext_path *path)
  {
+       struct ext4_sb_info *sbi;
         struct ext4_extent_header *eh;
         struct ext4_map_blocks split_map;
         struct ext4_extent zero_ex;
         struct ext4_extent *ex;
         ext4_lblk_t ee_block, eof_block;
         unsigned int ee_len, depth;
-       int allocated;
+       int allocated, max_zeroout = 0;
         int err = 0;
         int split_flag = 0;
  
@@ -3081,6 +3100,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                 "block %llu, max_blocks %u\n", inode->i_ino,
                 (unsigned long long)map->m_lblk, map->m_len);
  
+       sbi = EXT4_SB(inode->i_sb);
         eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
                 inode->i_sb->s_blocksize_bits;
         if (eof_block < map->m_lblk + map->m_len)
@@ -3180,9 +3200,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
          */
         split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
  
-       /* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */
-       if (ee_len <= 2*EXT4_EXT_ZERO_LEN &&
-           (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+       if (EXT4_EXT_MAY_ZEROOUT & split_flag)
+               max_zeroout = sbi->s_extent_max_zeroout_kb >>
+                       inode->i_sb->s_blocksize_bits;
+
+       /* Zero out directly if extent fits in s_extent_max_zeroout_kb */
+       if (max_zeroout && (ee_len <= max_zeroout)) {
                 err = ext4_ext_zeroout(inode, ex);
                 if (err)
                         goto out;
@@ -3191,8 +3214,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                 if (err)
                         goto out;
                 ext4_ext_mark_initialized(ex);
-               ext4_ext_try_to_merge(inode, path, ex);
-               err = ext4_ext_dirty(handle, inode, path + depth);
+               ext4_ext_try_to_merge(handle, inode, path, ex);
+               err = ext4_ext_dirty(handle, inode, path + path->p_depth);
                 goto out;
         }
  
@@ -3206,9 +3229,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
         split_map.m_lblk = map->m_lblk;
         split_map.m_len = map->m_len;
  
-       if (allocated > map->m_len) {
-               if (allocated <= EXT4_EXT_ZERO_LEN &&
-                   (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+       if (max_zeroout && (allocated > map->m_len)) {
+               if (allocated <= max_zeroout) {
                         /* case 3 */
                         zero_ex.ee_block =
                                          cpu_to_le32(map->m_lblk);
@@ -3220,9 +3242,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                                 goto out;
                         split_map.m_lblk = map->m_lblk;
                         split_map.m_len = allocated;
-               } else if ((map->m_lblk - ee_block + map->m_len <
-                          EXT4_EXT_ZERO_LEN) &&
-                          (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+               } else if (map->m_lblk - ee_block + map->m_len < max_zeroout) {
                         /* case 2 */
                         if (map->m_lblk != ee_block) {
                                 zero_ex.ee_block = ex->ee_block;
@@ -3242,7 +3262,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
         }
  
         allocated = ext4_split_extent(handle, inode, path,
-                                      &split_map, split_flag, 0);
+                                     &split_map, split_flag, 0);
         if (allocated < 0)
                 err = allocated;
  
@@ -3256,7 +3276,7 @@ out:
   * to an uninitialized extent.
   *
   * Writing to an uninitialized extent may result in splitting the uninitialized
- * extent into multiple /initialized uninitialized extents (up to three)
+ * extent into multiple initialized/uninitialized extents (up to three)
   * There are three possibilities:
   *   a> There is no split required: Entire extent should be uninitialized
   *   b> Splits in two extents: Write is happening at either end of the extent
@@ -3333,10 +3353,10 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
         /* note: ext4_ext_correct_indexes() isn't needed here because
          * borders are not changed
          */
-       ext4_ext_try_to_merge(inode, path, ex);
+       ext4_ext_try_to_merge(handle, inode, path, ex);
  
         /* Mark modified extent as dirty */
-       err = ext4_ext_dirty(handle, inode, path + depth);
+       err = ext4_ext_dirty(handle, inode, path + path->p_depth);
  out:
         ext4_ext_show_leaf(inode, path);
         return err;
@@ -3600,7 +3620,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
  {
         int ret = 0;
         int err = 0;
-       ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
+       ext4_io_end_t *io = ext4_inode_aio(inode);
  
         ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical "
                   "block %llu, max_blocks %u, flags %x, allocated %u\n",
@@ -3615,6 +3635,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
         if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
                 ret = ext4_split_unwritten_extents(handle, inode, map,
                                                    path, flags);
+               if (ret <= 0)
+                       goto out;
                 /*
                  * Flag the inode(non aio case) or end_io struct (aio case)
                  * that this IO needs to conversion to written when IO is
@@ -3858,8 +3880,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
         unsigned int allocated = 0, offset = 0;
         unsigned int allocated_clusters = 0;
         struct ext4_allocation_request ar;
-       ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
+       ext4_io_end_t *io = ext4_inode_aio(inode);
         ext4_lblk_t cluster_offset;
+       int set_unwritten = 0;
  
         ext_debug("blocks %u/%u requested for inode %lu\n",
                   map->m_lblk, map->m_len, inode->i_ino);
@@ -4082,13 +4105,8 @@ got_allocated_blocks:
                  * For non asycn direct IO case, flag the inode state
                  * that we need to perform conversion when IO is done.
                  */
-               if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
-                       if (io)
-                               ext4_set_io_unwritten_flag(inode, io);
-                       else
-                               ext4_set_inode_state(inode,
-                                                    EXT4_STATE_DIO_UNWRITTEN);
-               }
+               if ((flags & EXT4_GET_BLOCKS_PRE_IO))
+                       set_unwritten = 1;
                 if (ext4_should_dioread_nolock(inode))
                         map->m_flags |= EXT4_MAP_UNINIT;
         }
@@ -4100,6 +4118,15 @@ got_allocated_blocks:
         if (!err)
                 err = ext4_ext_insert_extent(handle, inode, path,
                                              &newex, flags);
+
+       if (!err && set_unwritten) {
+               if (io)
+                       ext4_set_io_unwritten_flag(inode, io);
+               else
+                       ext4_set_inode_state(inode,
+                                            EXT4_STATE_DIO_UNWRITTEN);
+       }
+
         if (err && free_on_err) {
                 int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
                         EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
@@ -4241,7 +4268,7 @@ void ext4_ext_truncate(struct inode *inode)
          * finish any pending end_io work so we won't run the risk of
          * converting any truncated blocks to initialized later
          */
-       ext4_flush_completed_IO(inode);
+       ext4_flush_unwritten_io(inode);
  
         /*
          * probably first extent we're gonna free will be last in block
@@ -4769,9 +4796,32 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
         loff_t first_page_offset, last_page_offset;
         int credits, err = 0;
  
+       /*
+        * Write out all dirty pages to avoid race conditions
+        * Then release them.
+        */
+       if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+               err = filemap_write_and_wait_range(mapping,
+                       offset, offset + length - 1);
+
+               if (err)
+                       return err;
+       }
+
+       mutex_lock(&inode->i_mutex);
+       /* Cannot punch a hole in an append-only or immutable file */
+       if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
+               err = -EPERM;
+               goto out_mutex;
+       }
+       if (IS_SWAPFILE(inode)) {
+               err = -ETXTBSY;
+               goto out_mutex;
+       }
+
         /* No need to punch hole beyond i_size */
         if (offset >= inode->i_size)
-               return 0;
+               goto out_mutex;
  
         /*
          * If the hole extends beyond i_size, set the hole
@@ -4789,35 +4839,26 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
         first_page_offset = first_page << PAGE_CACHE_SHIFT;
         last_page_offset = last_page << PAGE_CACHE_SHIFT;
  
-       /*
-        * Write out all dirty pages to avoid race conditions
-        * Then release them.
-        */
-       if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
-               err = filemap_write_and_wait_range(mapping,
-                       offset, offset + length - 1);
-
-               if (err)
-                       return err;
-       }
-
         /* Now release the pages */
         if (last_page_offset > first_page_offset) {
                 truncate_pagecache_range(inode, first_page_offset,
                                          last_page_offset - 1);
         }
  
-       /* finish any pending end_io work */
-       ext4_flush_completed_IO(inode);
+       /* Wait for existing dio workers; newcomers will block on i_mutex */
+       ext4_inode_block_unlocked_dio(inode);
+       err = ext4_flush_unwritten_io(inode);
+       if (err)
+               goto out_dio;
+       inode_dio_wait(inode);
  
         credits = ext4_writepage_trans_blocks(inode);
         handle = ext4_journal_start(inode, credits);
-       if (IS_ERR(handle))
-               return PTR_ERR(handle);
+       if (IS_ERR(handle)) {
+               err = PTR_ERR(handle);
+               goto out_dio;
+       }
  
-       err = ext4_orphan_add(handle, inode);
-       if (err)
-               goto out;
  
         /*
          * Now we need to zero out the non-page-aligned data in the
@@ -4903,10 +4944,13 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
         up_write(&EXT4_I(inode)->i_data_sem);
  
  out:
-       ext4_orphan_del(handle, inode);
         inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
         ext4_mark_inode_dirty(handle, inode);
         ext4_journal_stop(handle);
+out_dio:
+       ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
+       mutex_unlock(&inode->i_mutex);
         return err;
  }
  int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
diff --git a/fs/ext4/file.c b/fs/ext4/file.c

index 3b0e3bd..ca6f07a 100644 (file)
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -55,11 +55,11 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
         return 0;
  }
  
-static void ext4_aiodio_wait(struct inode *inode)
+void ext4_unwritten_wait(struct inode *inode)
  {
         wait_queue_head_t *wq = ext4_ioend_wq(inode);
  
-       wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_aiodio_unwritten) == 0));
+       wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0));
  }
  
  /*
@@ -116,7 +116,7 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
                                  "performance will be poor.",
                                  inode->i_ino, current->comm);
                 mutex_lock(ext4_aio_mutex(inode));
-               ext4_aiodio_wait(inode);
+               ext4_unwritten_wait(inode);
         }
  
         BUG_ON(iocb->ki_pos != pos);
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c

index 2a1dcea..be1d89f 100644 (file)
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -34,87 +34,6 @@
  
  #include <trace/events/ext4.h>
  
-static void dump_completed_IO(struct inode * inode)
-{
-#ifdef EXT4FS_DEBUG
-       struct list_head *cur, *before, *after;
-       ext4_io_end_t *io, *io0, *io1;
-       unsigned long flags;
-
-       if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
-               ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
-               return;
-       }
-
-       ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
-       spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
-       list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
-               cur = &io->list;
-               before = cur->prev;
-               io0 = container_of(before, ext4_io_end_t, list);
-               after = cur->next;
-               io1 = container_of(after, ext4_io_end_t, list);
-
-               ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
-                           io, inode->i_ino, io0, io1);
-       }
-       spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
-#endif
-}
-
-/*
- * This function is called from ext4_sync_file().
- *
- * When IO is completed, the work to convert unwritten extents to
- * written is queued on workqueue but may not get immediately
- * scheduled. When fsync is called, we need to ensure the
- * conversion is complete before fsync returns.
- * The inode keeps track of a list of pending/completed IO that
- * might needs to do the conversion. This function walks through
- * the list and convert the related unwritten extents for completed IO
- * to written.
- * The function return the number of pending IOs on success.
- */
-int ext4_flush_completed_IO(struct inode *inode)
-{
-       ext4_io_end_t *io;
-       struct ext4_inode_info *ei = EXT4_I(inode);
-       unsigned long flags;
-       int ret = 0;
-       int ret2 = 0;
-
-       dump_completed_IO(inode);
-       spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-       while (!list_empty(&ei->i_completed_io_list)){
-               io = list_entry(ei->i_completed_io_list.next,
-                               ext4_io_end_t, list);
-               list_del_init(&io->list);
-               io->flag |= EXT4_IO_END_IN_FSYNC;
-               /*
-                * Calling ext4_end_io_nolock() to convert completed
-                * IO to written.
-                *
-                * When ext4_sync_file() is called, run_queue() may already
-                * about to flush the work corresponding to this io structure.
-                * It will be upset if it founds the io structure related
-                * to the work-to-be schedule is freed.
-                *
-                * Thus we need to keep the io structure still valid here after
-                * conversion finished. The io structure has a flag to
-                * avoid double converting from both fsync and background work
-                * queue work.
-                */
-               spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-               ret = ext4_end_io_nolock(io);
-               if (ret < 0)
-                       ret2 = ret;
-               spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-               io->flag &= ~EXT4_IO_END_IN_FSYNC;
-       }
-       spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-       return (ret2 < 0) ? ret2 : 0;
-}
-
  /*
   * If we're not journaling and this is a just-created file, we have to
   * sync our parent directory (if it was freshly created) since
@@ -203,7 +122,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
         struct inode *inode = file->f_mapping->host;
         struct ext4_inode_info *ei = EXT4_I(inode);
         journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
-       int ret;
+       int ret, err;
         tid_t commit_tid;
         bool needs_barrier = false;
  
@@ -219,7 +138,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
         if (inode->i_sb->s_flags & MS_RDONLY)
                 goto out;
  
-       ret = ext4_flush_completed_IO(inode);
+       ret = ext4_flush_unwritten_io(inode);
         if (ret < 0)
                 goto out;
  
@@ -255,8 +174,11 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
                 needs_barrier = true;
         jbd2_log_start_commit(journal, commit_tid);
         ret = jbd2_log_wait_commit(journal, commit_tid);
-       if (needs_barrier)
-               blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+       if (needs_barrier) {
+               err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+               if (!ret)
+                       ret = err;
+       }
   out:
         mutex_unlock(&inode->i_mutex);
         trace_ext4_sync_file_exit(inode, ret);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c

index 26154b8..fa36372 100644 (file)
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -697,6 +697,15 @@ got_group:
                 if (!gdp)
                         goto fail;
  
+               /*
+                * Check free inodes count before loading bitmap.
+                */
+               if (ext4_free_inodes_count(sb, gdp) == 0) {
+                       if (++group == ngroups)
+                               group = 0;
+                       continue;
+               }
+
                 brelse(inode_bitmap_bh);
                 inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
                 if (!inode_bitmap_bh)
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c

index 830e1b2..792e388 100644 (file)
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -807,16 +807,30 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
  
  retry:
         if (rw == READ && ext4_should_dioread_nolock(inode)) {
-               if (unlikely(!list_empty(&ei->i_completed_io_list))) {
+               if (unlikely(atomic_read(&EXT4_I(inode)->i_unwritten))) {
                         mutex_lock(&inode->i_mutex);
-                       ext4_flush_completed_IO(inode);
+                       ext4_flush_unwritten_io(inode);
                         mutex_unlock(&inode->i_mutex);
                 }
+               /*
+                * Nolock dioread optimization may be dynamically disabled
+                * via ext4_inode_block_unlocked_dio(). Check inode's state
+                * while holding extra i_dio_count ref.
+                */
+               atomic_inc(&inode->i_dio_count);
+               smp_mb();
+               if (unlikely(ext4_test_inode_state(inode,
+                                                   EXT4_STATE_DIOREAD_LOCK))) {
+                       inode_dio_done(inode);
+                       goto locked;
+               }
                 ret = __blockdev_direct_IO(rw, iocb, inode,
                                  inode->i_sb->s_bdev, iov,
                                  offset, nr_segs,
                                  ext4_get_block, NULL, NULL, 0);
+               inode_dio_done(inode);
         } else {
+locked:
                 ret = blockdev_direct_IO(rw, iocb, inode, iov,
                                  offset, nr_segs, ext4_get_block);
  
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c

index c862ee5..b3c243b 100644 (file)
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -732,11 +732,13 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
         err = ext4_map_blocks(handle, inode, &map,
                               create ? EXT4_GET_BLOCKS_CREATE : 0);
  
+       /* ensure we send some value back into *errp */
+       *errp = 0;
+
         if (err < 0)
                 *errp = err;
         if (err <= 0)
                 return NULL;
-       *errp = 0;
  
         bh = sb_getblk(inode->i_sb, map.m_pblk);
         if (!bh) {
@@ -1954,9 +1956,6 @@ out:
         return ret;
  }
  
-static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
-static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
-
  /*
   * Note that we don't need to start a transaction unless we're journaling data
   * because we should have holes filled from ext4_page_mkwrite(). We even don't
@@ -2463,6 +2462,16 @@ static int ext4_nonda_switch(struct super_block *sb)
         free_blocks  = EXT4_C2B(sbi,
                 percpu_counter_read_positive(&sbi->s_freeclusters_counter));
         dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter);
+       /*
+        * Start pushing delalloc when 1/2 of free blocks are dirty.
+        */
+       if (dirty_blocks && (free_blocks < 2 * dirty_blocks) &&
+           !writeback_in_progress(sb->s_bdi) &&
+           down_read_trylock(&sb->s_umount)) {
+               writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
+               up_read(&sb->s_umount);
+       }
+
         if (2 * free_blocks < 3 * dirty_blocks ||
                 free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) {
                 /*
@@ -2471,13 +2480,6 @@ static int ext4_nonda_switch(struct super_block *sb)
                  */
                 return 1;
         }
-       /*
-        * Even if we don't switch but are nearing capacity,
-        * start pushing delalloc when 1/2 of free blocks are dirty.
-        */
-       if (free_blocks < 2 * dirty_blocks)
-               writeback_inodes_sb_if_idle(sb, WB_REASON_FS_FREE_SPACE);
-
         return 0;
  }
  
@@ -2879,9 +2881,6 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
  {
         struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
          ext4_io_end_t *io_end = iocb->private;
-       struct workqueue_struct *wq;
-       unsigned long flags;
-       struct ext4_inode_info *ei;
  
         /* if not async direct IO or dio with 0 bytes write, just return */
         if (!io_end || !size)
@@ -2910,24 +2909,14 @@ out:
                 io_end->iocb = iocb;
                 io_end->result = ret;
         }
-       wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
  
-       /* Add the io_end to per-inode completed aio dio list*/
-       ei = EXT4_I(io_end->inode);
-       spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-       list_add_tail(&io_end->list, &ei->i_completed_io_list);
-       spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-
-       /* queue the work to convert unwritten extents to written */
-       queue_work(wq, &io_end->work);
+       ext4_add_complete_io(io_end);
  }
  
  static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
  {
         ext4_io_end_t *io_end = bh->b_private;
-       struct workqueue_struct *wq;
         struct inode *inode;
-       unsigned long flags;
  
         if (!test_clear_buffer_uninit(bh) || !io_end)
                 goto out;
@@ -2946,15 +2935,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
          */
         inode = io_end->inode;
         ext4_set_io_unwritten_flag(inode, io_end);
-
-       /* Add the io_end to per-inode completed io list*/
-       spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
-       list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
-       spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
-
-       wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
-       /* queue the work to convert unwritten extents to written */
-       queue_work(wq, &io_end->work);
+       ext4_add_complete_io(io_end);
  out:
         bh->b_private = NULL;
         bh->b_end_io = NULL;
@@ -3029,6 +3010,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
                 overwrite = *((int *)iocb->private);
  
                 if (overwrite) {
+                       atomic_inc(&inode->i_dio_count);
                         down_read(&EXT4_I(inode)->i_data_sem);
                         mutex_unlock(&inode->i_mutex);
                 }
@@ -3054,7 +3036,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
                  * hook to the iocb.
                  */
                 iocb->private = NULL;
-               EXT4_I(inode)->cur_aio_dio = NULL;
+               ext4_inode_aio_set(inode, NULL);
                 if (!is_sync_kiocb(iocb)) {
                         ext4_io_end_t *io_end =
                                 ext4_init_io_end(inode, GFP_NOFS);
@@ -3071,7 +3053,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
                          * is a unwritten extents needs to be converted
                          * when IO is completed.
                          */
-                       EXT4_I(inode)->cur_aio_dio = iocb->private;
+                       ext4_inode_aio_set(inode, io_end);
                 }
  
                 if (overwrite)
@@ -3091,7 +3073,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
                                                  NULL,
                                                  DIO_LOCKING);
                 if (iocb->private)
-                       EXT4_I(inode)->cur_aio_dio = NULL;
+                       ext4_inode_aio_set(inode, NULL);
                 /*
                  * The io_end structure takes a reference to the inode,
                  * that structure needs to be destroyed and the
@@ -3126,6 +3108,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
         retake_lock:
                 /* take i_mutex locking again if we do a ovewrite dio */
                 if (overwrite) {
+                       inode_dio_done(inode);
                         up_read(&EXT4_I(inode)->i_data_sem);
                         mutex_lock(&inode->i_mutex);
                 }
@@ -4052,6 +4035,7 @@ static int ext4_do_update_inode(handle_t *handle,
         struct ext4_inode_info *ei = EXT4_I(inode);
         struct buffer_head *bh = iloc->bh;
         int err = 0, rc, block;
+       int need_datasync = 0;
         uid_t i_uid;
         gid_t i_gid;
  
@@ -4102,7 +4086,10 @@ static int ext4_do_update_inode(handle_t *handle,
                 raw_inode->i_file_acl_high =
                         cpu_to_le16(ei->i_file_acl >> 32);
         raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
-       ext4_isize_set(raw_inode, ei->i_disksize);
+       if (ei->i_disksize != ext4_isize(raw_inode)) {
+               ext4_isize_set(raw_inode, ei->i_disksize);
+               need_datasync = 1;
+       }
         if (ei->i_disksize > 0x7fffffffULL) {
                 struct super_block *sb = inode->i_sb;
                 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
@@ -4155,7 +4142,7 @@ static int ext4_do_update_inode(handle_t *handle,
                 err = rc;
         ext4_clear_inode_state(inode, EXT4_STATE_NEW);
  
-       ext4_update_inode_fsync_trans(handle, inode, 0);
+       ext4_update_inode_fsync_trans(handle, inode, need_datasync);
  out_brelse:
         brelse(bh);
         ext4_std_error(inode->i_sb, err);
@@ -4298,7 +4285,6 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
         }
  
         if (attr->ia_valid & ATTR_SIZE) {
-               inode_dio_wait(inode);
  
                 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
                         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -4347,8 +4333,17 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
         }
  
         if (attr->ia_valid & ATTR_SIZE) {
-               if (attr->ia_size != i_size_read(inode))
+               if (attr->ia_size != i_size_read(inode)) {
                         truncate_setsize(inode, attr->ia_size);
+                       /* Inode size will be reduced, wait for dio in flight.
+                        * Temporarily disable dioread_nolock to prevent
+                        * livelock. */
+                       if (orphan) {
+                               ext4_inode_block_unlocked_dio(inode);
+                               inode_dio_wait(inode);
+                               ext4_inode_resume_unlocked_dio(inode);
+                       }
+               }
                 ext4_truncate(inode);
         }
  
@@ -4727,6 +4722,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
                         return err;
         }
  
+       /* Wait for all existing dio workers */
+       ext4_inode_block_unlocked_dio(inode);
+       inode_dio_wait(inode);
+
         jbd2_journal_lock_updates(journal);
  
         /*
@@ -4746,6 +4745,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
         ext4_set_aops(inode);
  
         jbd2_journal_unlock_updates(journal);
+       ext4_inode_resume_unlocked_dio(inode);
  
         /* Finally we can mark the inode as dirty. */
  
@@ -4780,6 +4780,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
         int retries = 0;
  
         sb_start_pagefault(inode->i_sb);
+       file_update_time(vma->vm_file);
         /* Delalloc case is easy... */
         if (test_opt(inode->i_sb, DELALLOC) &&
             !ext4_should_journal_data(inode) &&
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c

index 5439d6a..5747f52 100644 (file)
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -366,26 +366,11 @@ group_add_out:
                         return -EOPNOTSUPP;
                 }
  
-               if (EXT4_HAS_INCOMPAT_FEATURE(sb,
-                              EXT4_FEATURE_INCOMPAT_META_BG)) {
-                       ext4_msg(sb, KERN_ERR,
-                                "Online resizing not (yet) supported with meta_bg");
-                       return -EOPNOTSUPP;
-               }
-
                 if (copy_from_user(&n_blocks_count, (__u64 __user *)arg,
                                    sizeof(__u64))) {
                         return -EFAULT;
                 }
  
-               if (n_blocks_count > MAX_32_NUM &&
-                   !EXT4_HAS_INCOMPAT_FEATURE(sb,
-                                              EXT4_FEATURE_INCOMPAT_64BIT)) {
-                       ext4_msg(sb, KERN_ERR,
-                                "File system only supports 32-bit block numbers");
-                       return -EOPNOTSUPP;
-               }
-
                 err = ext4_resize_begin(sb);
                 if (err)
                         return err;
@@ -420,13 +405,6 @@ resizefs_out:
                 if (!blk_queue_discard(q))
                         return -EOPNOTSUPP;
  
-               if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
-                              EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
-                       ext4_msg(sb, KERN_ERR,
-                                "FITRIM not supported with bigalloc");
-                       return -EOPNOTSUPP;
-               }
-
                 if (copy_from_user(&range, (struct fstrim_range __user *)arg,
                     sizeof(range)))
                         return -EFAULT;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c

index 08778f6..f8b27bf 100644 (file)
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -24,6 +24,7 @@
  #include "ext4_jbd2.h"
  #include "mballoc.h"
  #include <linux/debugfs.h>
+#include <linux/log2.h>
  #include <linux/slab.h>
  #include <trace/events/ext4.h>
  
@@ -1338,17 +1339,17 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
         mb_check_buddy(e4b);
  }
  
-static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
+static int mb_find_extent(struct ext4_buddy *e4b, int block,
                                 int needed, struct ext4_free_extent *ex)
  {
         int next = block;
-       int max;
+       int max, order;
         void *buddy;
  
         assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
         BUG_ON(ex == NULL);
  
-       buddy = mb_find_buddy(e4b, order, &max);
+       buddy = mb_find_buddy(e4b, 0, &max);
         BUG_ON(buddy == NULL);
         BUG_ON(block >= max);
         if (mb_test_bit(block, buddy)) {
@@ -1358,12 +1359,9 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
                 return 0;
         }
  
-       /* FIXME dorp order completely ? */
-       if (likely(order == 0)) {
-               /* find actual order */
-               order = mb_find_order_for_block(e4b, block);
-               block = block >> order;
-       }
+       /* find actual order */
+       order = mb_find_order_for_block(e4b, block);
+       block = block >> order;
  
         ex->fe_len = 1 << order;
         ex->fe_start = block << order;
@@ -1549,7 +1547,7 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
                 /* recheck chunk's availability - we don't know
                  * when it was found (within this lock-unlock
                  * period or not) */
-               max = mb_find_extent(e4b, 0, bex->fe_start, gex->fe_len, &ex);
+               max = mb_find_extent(e4b, bex->fe_start, gex->fe_len, &ex);
                 if (max >= gex->fe_len) {
                         ext4_mb_use_best_found(ac, e4b);
                         return;
@@ -1641,7 +1639,7 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
                 return err;
  
         ext4_lock_group(ac->ac_sb, group);
-       max = mb_find_extent(e4b, 0, ex.fe_start, ex.fe_len, &ex);
+       max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex);
  
         if (max > 0) {
                 ac->ac_b_ex = ex;
@@ -1662,17 +1660,20 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
         int max;
         int err;
         struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+       struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
         struct ext4_free_extent ex;
  
         if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
                 return 0;
+       if (grp->bb_free == 0)
+               return 0;
  
         err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
         if (err)
                 return err;
  
         ext4_lock_group(ac->ac_sb, group);
-       max = mb_find_extent(e4b, 0, ac->ac_g_ex.fe_start,
+       max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
                              ac->ac_g_ex.fe_len, &ex);
  
         if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
@@ -1788,7 +1789,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
                         break;
                 }
  
-               mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex);
+               mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex);
                 BUG_ON(ex.fe_len <= 0);
                 if (free < ex.fe_len) {
                         ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
@@ -1840,7 +1841,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
  
         while (i < EXT4_CLUSTERS_PER_GROUP(sb)) {
                 if (!mb_test_bit(i, bitmap)) {
-                       max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex);
+                       max = mb_find_extent(e4b, i, sbi->s_stripe, &ex);
                         if (max >= sbi->s_stripe) {
                                 ac->ac_found++;
                                 ac->ac_b_ex = ex;
@@ -1862,6 +1863,12 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
  
         BUG_ON(cr < 0 || cr >= 4);
  
+       free = grp->bb_free;
+       if (free == 0)
+               return 0;
+       if (cr <= 2 && free < ac->ac_g_ex.fe_len)
+               return 0;
+
         /* We only do this if the grp has never been initialized */
         if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
                 int ret = ext4_mb_init_group(ac->ac_sb, group);
@@ -1869,10 +1876,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
                         return 0;
         }
  
-       free = grp->bb_free;
         fragments = grp->bb_fragments;
-       if (free == 0)
-               return 0;
         if (fragments == 0)
                 return 0;
  
@@ -2163,6 +2167,39 @@ static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
         return cachep;
  }
  
+/*
+ * Allocate the top-level s_group_info array for the specified number
+ * of groups
+ */
+int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       unsigned size;
+       struct ext4_group_info ***new_groupinfo;
+
+       size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >>
+               EXT4_DESC_PER_BLOCK_BITS(sb);
+       if (size <= sbi->s_group_info_size)
+               return 0;
+
+       size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size);
+       new_groupinfo = ext4_kvzalloc(size, GFP_KERNEL);
+       if (!new_groupinfo) {
+               ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
+               return -ENOMEM;
+       }
+       if (sbi->s_group_info) {
+               memcpy(new_groupinfo, sbi->s_group_info,
+                      sbi->s_group_info_size * sizeof(*sbi->s_group_info));
+               ext4_kvfree(sbi->s_group_info);
+       }
+       sbi->s_group_info = new_groupinfo;
+       sbi->s_group_info_size = size / sizeof(*sbi->s_group_info);
+       ext4_debug("allocated s_groupinfo array for %d meta_bg's\n", 
+                  sbi->s_group_info_size);
+       return 0;
+}
+
  /* Create and initialize ext4_group_info data for the given group. */
  int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
                           struct ext4_group_desc *desc)
@@ -2195,12 +2232,11 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
                 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
         i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
  
-       meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL);
+       meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_KERNEL);
         if (meta_group_info[i] == NULL) {
                 ext4_msg(sb, KERN_ERR, "can't allocate buddy mem");
                 goto exit_group_info;
         }
-       memset(meta_group_info[i], 0, kmem_cache_size(cachep));
         set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
                 &(meta_group_info[i]->bb_state));
  
@@ -2252,49 +2288,14 @@ static int ext4_mb_init_backend(struct super_block *sb)
         ext4_group_t ngroups = ext4_get_groups_count(sb);
         ext4_group_t i;
         struct ext4_sb_info *sbi = EXT4_SB(sb);
-       struct ext4_super_block *es = sbi->s_es;
-       int num_meta_group_infos;
-       int num_meta_group_infos_max;
-       int array_size;
+       int err;
         struct ext4_group_desc *desc;
         struct kmem_cache *cachep;
  
-       /* This is the number of blocks used by GDT */
-       num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) -
-                               1) >> EXT4_DESC_PER_BLOCK_BITS(sb);
-
-       /*
-        * This is the total number of blocks used by GDT including
-        * the number of reserved blocks for GDT.
-        * The s_group_info array is allocated with this value
-        * to allow a clean online resize without a complex
-        * manipulation of pointer.
-        * The drawback is the unused memory when no resize
-        * occurs but it's very low in terms of pages
-        * (see comments below)
-        * Need to handle this properly when META_BG resizing is allowed
-        */
-       num_meta_group_infos_max = num_meta_group_infos +
-                               le16_to_cpu(es->s_reserved_gdt_blocks);
+       err = ext4_mb_alloc_groupinfo(sb, ngroups);
+       if (err)
+               return err;
  
-       /*
-        * array_size is the size of s_group_info array. We round it
-        * to the next power of two because this approximation is done
-        * internally by kmalloc so we can have some more memory
-        * for free here (e.g. may be used for META_BG resize).
-        */
-       array_size = 1;
-       while (array_size < sizeof(*sbi->s_group_info) *
-              num_meta_group_infos_max)
-               array_size = array_size << 1;
-       /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
-        * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
-        * So a two level scheme suffices for now. */
-       sbi->s_group_info = ext4_kvzalloc(array_size, GFP_KERNEL);
-       if (sbi->s_group_info == NULL) {
-               ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
-               return -ENOMEM;
-       }
         sbi->s_buddy_cache = new_inode(sb);
         if (sbi->s_buddy_cache == NULL) {
                 ext4_msg(sb, KERN_ERR, "can't get new inode");
@@ -2322,7 +2323,7 @@ err_freebuddy:
         cachep = get_groupinfo_cache(sb->s_blocksize_bits);
         while (i-- > 0)
                 kmem_cache_free(cachep, ext4_get_group_info(sb, i));
-       i = num_meta_group_infos;
+       i = sbi->s_group_info_size;
         while (i-- > 0)
                 kfree(sbi->s_group_info[i]);
         iput(sbi->s_buddy_cache);
@@ -4008,7 +4009,6 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
         ext4_get_group_no_and_offset(sb, goal, &group, &block);
  
         /* set up allocation goals */
-       memset(ac, 0, sizeof(struct ext4_allocation_context));
         ac->ac_b_ex.fe_logical = ar->logical & ~(sbi->s_cluster_ratio - 1);
         ac->ac_status = AC_STATUS_CONTINUE;
         ac->ac_sb = sb;
@@ -4291,7 +4291,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
                 }
         }
  
-       ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
+       ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS);
         if (!ac) {
                 ar->len = 0;
                 *errp = -ENOMEM;
@@ -4657,6 +4657,8 @@ do_more:
                  * with group lock held. generate_buddy look at
                  * them with group lock_held
                  */
+               if (test_opt(sb, DISCARD))
+                       ext4_issue_discard(sb, block_group, bit, count);
                 ext4_lock_group(sb, block_group);
                 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
                 mb_free_blocks(inode, &e4b, bit, count_clusters);
@@ -4988,7 +4990,8 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
  
         start = range->start >> sb->s_blocksize_bits;
         end = start + (range->len >> sb->s_blocksize_bits) - 1;
-       minlen = range->minlen >> sb->s_blocksize_bits;
+       minlen = EXT4_NUM_B2C(EXT4_SB(sb),
+                             range->minlen >> sb->s_blocksize_bits);
  
         if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb)) ||
             unlikely(start >= max_blks))
@@ -5048,6 +5051,6 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
                 atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen);
  
  out:
-       range->len = trimmed * sb->s_blocksize;
+       range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits;
         return ret;
  }
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h

index c070618..3ccd889 100644 (file)
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -65,11 +65,6 @@ extern u8 mb_enable_debug;
  #define MB_DEFAULT_MIN_TO_SCAN         10
  
  /*
- * How many groups mballoc will scan looking for the best chunk
- */
-#define MB_DEFAULT_MAX_GROUPS_TO_SCAN  5
-
-/*
   * with 'ext4_mb_stats' allocator will collect stats that will be
   * shown at umount. The collecting costs though!
   */
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c

index c5826c6..292daee 100644 (file)
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -141,55 +141,21 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
  }
  
  /**
- * mext_check_null_inode - NULL check for two inodes
- *
- * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
- */
-static int
-mext_check_null_inode(struct inode *inode1, struct inode *inode2,
-                     const char *function, unsigned int line)
-{
-       int ret = 0;
-
-       if (inode1 == NULL) {
-               __ext4_error(inode2->i_sb, function, line,
-                       "Both inodes should not be NULL: "
-                       "inode1 NULL inode2 %lu", inode2->i_ino);
-               ret = -EIO;
-       } else if (inode2 == NULL) {
-               __ext4_error(inode1->i_sb, function, line,
-                       "Both inodes should not be NULL: "
-                       "inode1 %lu inode2 NULL", inode1->i_ino);
-               ret = -EIO;
-       }
-       return ret;
-}
-
-/**
   * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem
   *
- * @orig_inode:                original inode structure
- * @donor_inode:       donor inode structure
- * Acquire write lock of i_data_sem of the two inodes (orig and donor) by
- * i_ino order.
+ * Acquire write lock of i_data_sem of the two inodes
   */
  static void
-double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
+double_down_write_data_sem(struct inode *first, struct inode *second)
  {
-       struct inode *first = orig_inode, *second = donor_inode;
+       if (first < second) {
+               down_write(&EXT4_I(first)->i_data_sem);
+               down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING);
+       } else {
+               down_write(&EXT4_I(second)->i_data_sem);
+               down_write_nested(&EXT4_I(first)->i_data_sem, SINGLE_DEPTH_NESTING);
  
-       /*
-        * Use the inode number to provide the stable locking order instead
-        * of its address, because the C language doesn't guarantee you can
-        * compare pointers that don't come from the same array.
-        */
-       if (donor_inode->i_ino < orig_inode->i_ino) {
-               first = donor_inode;
-               second = orig_inode;
         }
-
-       down_write(&EXT4_I(first)->i_data_sem);
-       down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING);
  }
  
  /**
@@ -604,9 +570,8 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
         diff = donor_off - le32_to_cpu(tmp_dext->ee_block);
  
         ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff);
-       tmp_dext->ee_block =
-                       cpu_to_le32(le32_to_cpu(tmp_dext->ee_block) + diff);
-       tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_dext->ee_len) - diff);
+       le32_add_cpu(&tmp_dext->ee_block, diff);
+       le16_add_cpu(&tmp_dext->ee_len, -diff);
  
         if (max_count < ext4_ext_get_actual_len(tmp_dext))
                 tmp_dext->ee_len = cpu_to_le16(max_count);
@@ -629,6 +594,43 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
  }
  
  /**
+ * mext_check_coverage - Check that all extents in range have the same type
+ *
+ * @inode:             inode in question
+ * @from:              block offset of inode
+ * @count:             block count to be checked
+ * @uninit:            extents expected to be uninitialized
+ * @err:               pointer to save error value
+ *
+ * Return 1 if all extents in range have the expected type, and zero otherwise.
+ */
+static int
+mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count,
+                         int uninit, int *err)
+{
+       struct ext4_ext_path *path = NULL;
+       struct ext4_extent *ext;
+       ext4_lblk_t last = from + count;
+       while (from < last) {
+               *err = get_ext_path(inode, from, &path);
+               if (*err)
+                       return 0;
+               ext = path[ext_depth(inode)].p_ext;
+               if (!ext) {
+                       ext4_ext_drop_refs(path);
+                       return 0;
+               }
+               if (uninit != ext4_ext_is_uninitialized(ext)) {
+                       ext4_ext_drop_refs(path);
+                       return 0;
+               }
+               from += ext4_ext_get_actual_len(ext);
+               ext4_ext_drop_refs(path);
+       }
+       return 1;
+}
+
+/**
   * mext_replace_branches - Replace original extents with new extents
   *
   * @handle:            journal handle
@@ -663,9 +665,6 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
         int replaced_count = 0;
         int dext_alen;
  
-       /* Protect extent trees against block allocations via delalloc */
-       double_down_write_data_sem(orig_inode, donor_inode);
-
         /* Get the original extent for the block "orig_off" */
         *err = get_ext_path(orig_inode, orig_off, &orig_path);
         if (*err)
@@ -764,12 +763,122 @@ out:
         ext4_ext_invalidate_cache(orig_inode);
         ext4_ext_invalidate_cache(donor_inode);
  
-       double_up_write_data_sem(orig_inode, donor_inode);
-
         return replaced_count;
  }
  
  /**
+ * mext_page_double_lock - Grab and lock pages on both @inode1 and @inode2
+ *
+ * @inode1:    the inode structure
+ * @inode2:    the inode structure
+ * @index:     page index
+ * @page:      result page vector
+ *
+ * Grab two locked pages, one for each inode, in inode order
+ */
+static int
+mext_page_double_lock(struct inode *inode1, struct inode *inode2,
+                     pgoff_t index, struct page *page[2])
+{
+       struct address_space *mapping[2];
+       unsigned fl = AOP_FLAG_NOFS;
+
+       BUG_ON(!inode1 || !inode2);
+       if (inode1 < inode2) {
+               mapping[0] = inode1->i_mapping;
+               mapping[1] = inode2->i_mapping;
+       } else {
+               mapping[0] = inode2->i_mapping;
+               mapping[1] = inode1->i_mapping;
+       }
+
+       page[0] = grab_cache_page_write_begin(mapping[0], index, fl);
+       if (!page[0])
+               return -ENOMEM;
+
+       page[1] = grab_cache_page_write_begin(mapping[1], index, fl);
+       if (!page[1]) {
+               unlock_page(page[0]);
+               page_cache_release(page[0]);
+               return -ENOMEM;
+       }
+
+       if (inode1 > inode2) {
+               struct page *tmp;
+               tmp = page[0];
+               page[0] = page[1];
+               page[1] = tmp;
+       }
+       return 0;
+}
+
+/* Force page buffers uptodate w/o dropping page's lock */
+static int
+mext_page_mkuptodate(struct page *page, unsigned from, unsigned to)
+{
+       struct inode *inode = page->mapping->host;
+       sector_t block;
+       struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
+       unsigned int blocksize, block_start, block_end;
+       int i, err,  nr = 0, partial = 0;
+       BUG_ON(!PageLocked(page));
+       BUG_ON(PageWriteback(page));
+
+       if (PageUptodate(page))
+               return 0;
+
+       blocksize = 1 << inode->i_blkbits;
+       if (!page_has_buffers(page))
+               create_empty_buffers(page, blocksize, 0);
+
+       head = page_buffers(page);
+       block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+       for (bh = head, block_start = 0; bh != head || !block_start;
+            block++, block_start = block_end, bh = bh->b_this_page) {
+               block_end = block_start + blocksize;
+               if (block_end <= from || block_start >= to) {
+                       if (!buffer_uptodate(bh))
+                               partial = 1;
+                       continue;
+               }
+               if (buffer_uptodate(bh))
+                       continue;
+               if (!buffer_mapped(bh)) {
+                       int err = 0;
+                       err = ext4_get_block(inode, block, bh, 0);
+                       if (err) {
+                               SetPageError(page);
+                               return err;
+                       }
+                       if (!buffer_mapped(bh)) {
+                               zero_user(page, block_start, blocksize);
+                               if (!err)
+                                       set_buffer_uptodate(bh);
+                               continue;
+                       }
+               }
+               BUG_ON(nr >= MAX_BUF_PER_PAGE);
+               arr[nr++] = bh;
+       }
+       /* No io required */
+       if (!nr)
+               goto out;
+
+       for (i = 0; i < nr; i++) {
+               bh = arr[i];
+               if (!bh_uptodate_or_lock(bh)) {
+                       err = bh_submit_read(bh);
+                       if (err)
+                               return err;
+               }
+       }
+out:
+       if (!partial)
+               SetPageUptodate(page);
+       return 0;
+}
+
+/**
   * move_extent_per_page - Move extent data per page
   *
   * @o_filp:                    file structure of original file
@@ -791,26 +900,24 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
                   int block_len_in_page, int uninit, int *err)
  {
         struct inode *orig_inode = o_filp->f_dentry->d_inode;
-       struct address_space *mapping = orig_inode->i_mapping;
-       struct buffer_head *bh;
-       struct page *page = NULL;
-       const struct address_space_operations *a_ops = mapping->a_ops;
+       struct page *pagep[2] = {NULL, NULL};
         handle_t *handle;
         ext4_lblk_t orig_blk_offset;
         long long offs = orig_page_offset << PAGE_CACHE_SHIFT;
         unsigned long blocksize = orig_inode->i_sb->s_blocksize;
         unsigned int w_flags = 0;
         unsigned int tmp_data_size, data_size, replaced_size;
-       void *fsdata;
-       int i, jblocks;
-       int err2 = 0;
+       int err2, jblocks, retries = 0;
         int replaced_count = 0;
+       int from = data_offset_in_page << orig_inode->i_blkbits;
         int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
  
         /*
          * It needs twice the amount of ordinary journal buffers because
          * inode and donor_inode may change each different metadata blocks.
          */
+again:
+       *err = 0;
         jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
         handle = ext4_journal_start(orig_inode, jblocks);
         if (IS_ERR(handle)) {
@@ -824,19 +931,6 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
         orig_blk_offset = orig_page_offset * blocks_per_page +
                 data_offset_in_page;
  
-       /*
-        * If orig extent is uninitialized one,
-        * it's not necessary force the page into memory
-        * and then force it to be written out again.
-        * Just swap data blocks between orig and donor.
-        */
-       if (uninit) {
-               replaced_count = mext_replace_branches(handle, orig_inode,
-                                               donor_inode, orig_blk_offset,
-                                               block_len_in_page, err);
-               goto out2;
-       }
-
         offs = (long long)orig_blk_offset << orig_inode->i_blkbits;
  
         /* Calculate data_size */
@@ -858,75 +952,120 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
  
         replaced_size = data_size;
  
-       *err = a_ops->write_begin(o_filp, mapping, offs, data_size, w_flags,
-                                &page, &fsdata);
+       *err = mext_page_double_lock(orig_inode, donor_inode, orig_page_offset,
+                                    pagep);
         if (unlikely(*err < 0))
-               goto out;
-
-       if (!PageUptodate(page)) {
-               mapping->a_ops->readpage(o_filp, page);
-               lock_page(page);
-       }
-
+               goto stop_journal;
         /*
-        * try_to_release_page() doesn't call releasepage in writeback mode.
-        * We should care about the order of writing to the same file
-        * by multiple move extent processes.
-        * It needs to call wait_on_page_writeback() to wait for the
-        * writeback of the page.
+        * If orig extent was uninitialized it can become initialized
+        * at any time after i_data_sem was dropped. To serialize with
+        * delalloc we have to recheck the extent while we hold the
+        * page's lock. If it is still uninitialized, data copy is not
+        * necessary, just swap data blocks between orig and donor.
          */
-       wait_on_page_writeback(page);
+       if (uninit) {
+               double_down_write_data_sem(orig_inode, donor_inode);
+               /* If any of the extents in range became initialized we have
+                * to fall back to data copying */
+               uninit = mext_check_coverage(orig_inode, orig_blk_offset,
+                                            block_len_in_page, 1, err);
+               if (*err)
+                       goto drop_data_sem;
  
-       /* Release old bh and drop refs */
-       try_to_release_page(page, 0);
+               uninit &= mext_check_coverage(donor_inode, orig_blk_offset,
+                                             block_len_in_page, 1, err);
+               if (*err)
+                       goto drop_data_sem;
+
+               if (!uninit) {
+                       double_up_write_data_sem(orig_inode, donor_inode);
+                       goto data_copy;
+               }
+               if ((page_has_private(pagep[0]) &&
+                    !try_to_release_page(pagep[0], 0)) ||
+                   (page_has_private(pagep[1]) &&
+                    !try_to_release_page(pagep[1], 0))) {
+                       *err = -EBUSY;
+                       goto drop_data_sem;
+               }
+               replaced_count = mext_replace_branches(handle, orig_inode,
+                                               donor_inode, orig_blk_offset,
+                                               block_len_in_page, err);
+       drop_data_sem:
+               double_up_write_data_sem(orig_inode, donor_inode);
+               goto unlock_pages;
+       }
+data_copy:
+       *err = mext_page_mkuptodate(pagep[0], from, from + replaced_size);
+       if (*err)
+               goto unlock_pages;
+
+       /* At this point all buffers in range are uptodate, old mapping layout
+        * is no longer required, try to drop it now. */
+       if ((page_has_private(pagep[0]) && !try_to_release_page(pagep[0], 0)) ||
+           (page_has_private(pagep[1]) && !try_to_release_page(pagep[1], 0))) {
+               *err = -EBUSY;
+               goto unlock_pages;
+       }
  
         replaced_count = mext_replace_branches(handle, orig_inode, donor_inode,
-                                       orig_blk_offset, block_len_in_page,
-                                       &err2);
-       if (err2) {
+                                              orig_blk_offset,
+                                              block_len_in_page, err);
+       if (*err) {
                 if (replaced_count) {
                         block_len_in_page = replaced_count;
                         replaced_size =
                                 block_len_in_page << orig_inode->i_blkbits;
                 } else
-                       goto out;
+                       goto unlock_pages;
         }
+       /* Perform all necessary steps similar to write_begin()/write_end(),
+        * but keeping in mind that i_size will not change */
+       *err = __block_write_begin(pagep[0], from, from + replaced_size,
+                                  ext4_get_block);
+       if (!*err)
+               *err = block_commit_write(pagep[0], from, from + replaced_size);
  
-       if (!page_has_buffers(page))
-               create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0);
-
-       bh = page_buffers(page);
-       for (i = 0; i < data_offset_in_page; i++)
-               bh = bh->b_this_page;
-
-       for (i = 0; i < block_len_in_page; i++) {
-               *err = ext4_get_block(orig_inode,
-                               (sector_t)(orig_blk_offset + i), bh, 0);
-               if (*err < 0)
-                       goto out;
-
-               if (bh->b_this_page != NULL)
-                       bh = bh->b_this_page;
-       }
-
-       *err = a_ops->write_end(o_filp, mapping, offs, data_size, replaced_size,
-                              page, fsdata);
-       page = NULL;
-
-out:
-       if (unlikely(page)) {
-               if (PageLocked(page))
-                       unlock_page(page);
-               page_cache_release(page);
-               ext4_journal_stop(handle);
-       }
-out2:
+       if (unlikely(*err < 0))
+               goto repair_branches;
+
+       /* Even in case of data=writeback it is reasonable to pin
+        * inode to transaction, to prevent unexpected data loss */
+       *err = ext4_jbd2_file_inode(handle, orig_inode);
+
+unlock_pages:
+       unlock_page(pagep[0]);
+       page_cache_release(pagep[0]);
+       unlock_page(pagep[1]);
+       page_cache_release(pagep[1]);
+stop_journal:
         ext4_journal_stop(handle);
-
-       if (err2)
-               *err = err2;
-
+       /* Buffer was busy, probably because it is pinned to a journal
+        * transaction; forcing a transaction commit may help to free it. */
+       if (*err == -EBUSY && ext4_should_retry_alloc(orig_inode->i_sb,
+                                                     &retries))
+               goto again;
         return replaced_count;
+
+repair_branches:
+       /*
+        * This should never ever happen!
+        * Extents are swapped already, but we are not able to copy data.
+        * Try to swap extents back to their original places
+        */
+       double_down_write_data_sem(orig_inode, donor_inode);
+       replaced_count = mext_replace_branches(handle, donor_inode, orig_inode,
+                                              orig_blk_offset,
+                                              block_len_in_page, &err2);
+       double_up_write_data_sem(orig_inode, donor_inode);
+       if (replaced_count != block_len_in_page) {
+               EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset),
+                                      "Unable to copy data block,"
+                                      " data will be lost.");
+               *err = -EIO;
+       }
+       replaced_count = 0;
+       goto unlock_pages;
  }
  
  /**
@@ -969,14 +1108,6 @@ mext_check_arguments(struct inode *orig_inode,
                 return -EINVAL;
         }
  
-       /* Files should be in the same ext4 FS */
-       if (orig_inode->i_sb != donor_inode->i_sb) {
-               ext4_debug("ext4 move extent: The argument files "
-                       "should be in same FS [ino:orig %lu, donor %lu]\n",
-                       orig_inode->i_ino, donor_inode->i_ino);
-               return -EINVAL;
-       }
-
         /* Ext4 move extent supports only extent based file */
         if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) {
                 ext4_debug("ext4 move extent: orig file is not extents "
@@ -1002,7 +1133,6 @@ mext_check_arguments(struct inode *orig_inode,
         }
  
         if ((orig_start >= EXT_MAX_BLOCKS) ||
-           (donor_start >= EXT_MAX_BLOCKS) ||
             (*len > EXT_MAX_BLOCKS) ||
             (orig_start + *len >= EXT_MAX_BLOCKS))  {
                 ext4_debug("ext4 move extent: Can't handle over [%u] blocks "
@@ -1072,35 +1202,19 @@ mext_check_arguments(struct inode *orig_inode,
   * @inode1:    the inode structure
   * @inode2:    the inode structure
   *
- * Lock two inodes' i_mutex by i_ino order.
- * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
+ * Lock two inodes' i_mutex
   */
-static int
+static void
  mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
  {
-       int ret = 0;
-
-       BUG_ON(inode1 == NULL && inode2 == NULL);
-
-       ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__);
-       if (ret < 0)
-               goto out;
-
-       if (inode1 == inode2) {
-               mutex_lock(&inode1->i_mutex);
-               goto out;
-       }
-
-       if (inode1->i_ino < inode2->i_ino) {
+       BUG_ON(inode1 == inode2);
+       if (inode1 < inode2) {
                 mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
                 mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
         } else {
                 mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT);
                 mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD);
         }
-
-out:
-       return ret;
  }
  
  /**
@@ -1109,28 +1223,13 @@ out:
   * @inode1:     the inode that is released first
   * @inode2:     the inode that is released second
   *
- * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
   */
  
-static int
+static void
  mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
  {
-       int ret = 0;
-
-       BUG_ON(inode1 == NULL && inode2 == NULL);
-
-       ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__);
-       if (ret < 0)
-               goto out;
-
-       if (inode1)
-               mutex_unlock(&inode1->i_mutex);
-
-       if (inode2 && inode2 != inode1)
-               mutex_unlock(&inode2->i_mutex);
-
-out:
-       return ret;
+       mutex_unlock(&inode1->i_mutex);
+       mutex_unlock(&inode2->i_mutex);
  }
  
  /**
@@ -1187,16 +1286,23 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
         ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
         ext4_lblk_t rest_blocks;
         pgoff_t orig_page_offset = 0, seq_end_page;
-       int ret1, ret2, depth, last_extent = 0;
+       int ret, depth, last_extent = 0;
         int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
         int data_offset_in_page;
         int block_len_in_page;
         int uninit;
  
-       /* orig and donor should be different file */
-       if (orig_inode->i_ino == donor_inode->i_ino) {
+       if (orig_inode->i_sb != donor_inode->i_sb) {
+               ext4_debug("ext4 move extent: The argument files "
+                       "should be in same FS [ino:orig %lu, donor %lu]\n",
+                       orig_inode->i_ino, donor_inode->i_ino);
+               return -EINVAL;
+       }
+
+       /* orig and donor should be different inodes */
+       if (orig_inode == donor_inode) {
                 ext4_debug("ext4 move extent: The argument files should not "
-                       "be same file [ino:orig %lu, donor %lu]\n",
+                       "be same inode [ino:orig %lu, donor %lu]\n",
                         orig_inode->i_ino, donor_inode->i_ino);
                 return -EINVAL;
         }
@@ -1208,18 +1314,27 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
                         orig_inode->i_ino, donor_inode->i_ino);
                 return -EINVAL;
         }
-
+       /* TODO: It is a non-obvious task to swap blocks for inodes with full
+          journaling enabled */
+       if (ext4_should_journal_data(orig_inode) ||
+           ext4_should_journal_data(donor_inode)) {
+               return -EINVAL;
+       }
         /* Protect orig and donor inodes against a truncate */
-       ret1 = mext_inode_double_lock(orig_inode, donor_inode);
-       if (ret1 < 0)
-               return ret1;
+       mext_inode_double_lock(orig_inode, donor_inode);
+
+       /* Wait for all existing dio workers */
+       ext4_inode_block_unlocked_dio(orig_inode);
+       ext4_inode_block_unlocked_dio(donor_inode);
+       inode_dio_wait(orig_inode);
+       inode_dio_wait(donor_inode);
  
         /* Protect extent tree against block allocations via delalloc */
         double_down_write_data_sem(orig_inode, donor_inode);
         /* Check the filesystem environment whether move_extent can be done */
-       ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start,
+       ret = mext_check_arguments(orig_inode, donor_inode, orig_start,
                                     donor_start, &len);
-       if (ret1)
+       if (ret)
                 goto out;
  
         file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits;
@@ -1227,13 +1342,13 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
         if (file_end < block_end)
                 len -= block_end - file_end;
  
-       ret1 = get_ext_path(orig_inode, block_start, &orig_path);
-       if (ret1)
+       ret = get_ext_path(orig_inode, block_start, &orig_path);
+       if (ret)
                 goto out;
  
         /* Get path structure to check the hole */
-       ret1 = get_ext_path(orig_inode, block_start, &holecheck_path);
-       if (ret1)
+       ret = get_ext_path(orig_inode, block_start, &holecheck_path);
+       if (ret)
                 goto out;
  
         depth = ext_depth(orig_inode);
@@ -1252,13 +1367,13 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
                 last_extent = mext_next_extent(orig_inode,
                                         holecheck_path, &ext_cur);
                 if (last_extent < 0) {
-                       ret1 = last_extent;
+                       ret = last_extent;
                         goto out;
                 }
                 last_extent = mext_next_extent(orig_inode, orig_path,
                                                         &ext_dummy);
                 if (last_extent < 0) {
-                       ret1 = last_extent;
+                       ret = last_extent;
                         goto out;
                 }
                 seq_start = le32_to_cpu(ext_cur->ee_block);
@@ -1272,7 +1387,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
         if (le32_to_cpu(ext_cur->ee_block) > block_end) {
                 ext4_debug("ext4 move extent: The specified range of file "
                                                         "may be the hole\n");
-               ret1 = -EINVAL;
+               ret = -EINVAL;
                 goto out;
         }
  
@@ -1292,7 +1407,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
                 last_extent = mext_next_extent(orig_inode, holecheck_path,
                                                 &ext_cur);
                 if (last_extent < 0) {
-                       ret1 = last_extent;
+                       ret = last_extent;
                         break;
                 }
                 add_blocks = ext4_ext_get_actual_len(ext_cur);
@@ -1349,18 +1464,18 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
                                                 orig_page_offset,
                                                 data_offset_in_page,
                                                 block_len_in_page, uninit,
-                                               &ret1);
+                                               &ret);
  
                         /* Count how many blocks we have exchanged */
                         *moved_len += block_len_in_page;
-                       if (ret1 < 0)
+                       if (ret < 0)
                                 break;
                         if (*moved_len > len) {
                                 EXT4_ERROR_INODE(orig_inode,
                                         "We replaced blocks too much! "
                                         "sum of replaced: %llu requested: %llu",
                                         *moved_len, len);
-                               ret1 = -EIO;
+                               ret = -EIO;
                                 break;
                         }
  
@@ -1374,22 +1489,22 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
                 }
  
                 double_down_write_data_sem(orig_inode, donor_inode);
-               if (ret1 < 0)
+               if (ret < 0)
                         break;
  
                 /* Decrease buffer counter */
                 if (holecheck_path)
                         ext4_ext_drop_refs(holecheck_path);
-               ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path);
-               if (ret1)
+               ret = get_ext_path(orig_inode, seq_start, &holecheck_path);
+               if (ret)
                         break;
                 depth = holecheck_path->p_depth;
  
                 /* Decrease buffer counter */
                 if (orig_path)
                         ext4_ext_drop_refs(orig_path);
-               ret1 = get_ext_path(orig_inode, seq_start, &orig_path);
-               if (ret1)
+               ret = get_ext_path(orig_inode, seq_start, &orig_path);
+               if (ret)
                         break;
  
                 ext_cur = holecheck_path[depth].p_ext;
@@ -1412,12 +1527,9 @@ out:
                 kfree(holecheck_path);
         }
         double_up_write_data_sem(orig_inode, donor_inode);
-       ret2 = mext_inode_double_unlock(orig_inode, donor_inode);
-
-       if (ret1)
-               return ret1;
-       else if (ret2)
-               return ret2;
+       ext4_inode_resume_unlocked_dio(orig_inode);
+       ext4_inode_resume_unlocked_dio(donor_inode);
+       mext_inode_double_unlock(orig_inode, donor_inode);
  
-       return 0;
+       return ret;
  }
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c

index 2a42cc0..6d600a6 100644 (file)
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -55,6 +55,13 @@ static struct buffer_head *ext4_append(handle_t *handle,
  {
         struct buffer_head *bh;
  
+       if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb &&
+                    ((inode->i_size >> 10) >=
+                     EXT4_SB(inode->i_sb)->s_max_dir_size_kb))) {
+               *err = -ENOSPC;
+               return NULL;
+       }
+
         *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
  
         bh = ext4_bread(handle, inode, *block, 1, err);
@@ -67,6 +74,12 @@ static struct buffer_head *ext4_append(handle_t *handle,
                         bh = NULL;
                 }
         }
+       if (!bh && !(*err)) {
+               *err = -EIO;
+               ext4_error(inode->i_sb,
+                          "Directory hole detected on inode %lu\n",
+                          inode->i_ino);
+       }
         return bh;
  }
  
@@ -594,8 +607,11 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
         u32 hash;
  
         frame->bh = NULL;
-       if (!(bh = ext4_bread (NULL,dir, 0, 0, err)))
+       if (!(bh = ext4_bread(NULL, dir, 0, 0, err))) {
+               if (*err == 0)
+                       *err = ERR_BAD_DX_DIR;
                 goto fail;
+       }
         root = (struct dx_root *) bh->b_data;
         if (root->info.hash_version != DX_HASH_TEA &&
             root->info.hash_version != DX_HASH_HALF_MD4 &&
@@ -696,8 +712,11 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
                 frame->entries = entries;
                 frame->at = at;
                 if (!indirect--) return frame;
-               if (!(bh = ext4_bread (NULL,dir, dx_get_block(at), 0, err)))
+               if (!(bh = ext4_bread(NULL, dir, dx_get_block(at), 0, err))) {
+                       if (!(*err))
+                               *err = ERR_BAD_DX_DIR;
                         goto fail2;
+               }
                 at = entries = ((struct dx_node *) bh->b_data)->entries;
  
                 if (!buffer_verified(bh) &&
@@ -807,8 +826,15 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
          */
         while (num_frames--) {
                 if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at),
-                                     0, &err)))
+                                     0, &err))) {
+                       if (!err) {
+                               ext4_error(dir->i_sb,
+                                          "Directory hole detected on inode %lu\n",
+                                          dir->i_ino);
+                               return -EIO;
+                       }
                         return err; /* Failure */
+               }
  
                 if (!buffer_verified(bh) &&
                     !ext4_dx_csum_verify(dir,
@@ -839,12 +865,19 @@ static int htree_dirblock_to_tree(struct file *dir_file,
  {
         struct buffer_head *bh;
         struct ext4_dir_entry_2 *de, *top;
-       int err, count = 0;
+       int err = 0, count = 0;
  
         dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n",
                                                         (unsigned long)block));
-       if (!(bh = ext4_bread (NULL, dir, block, 0, &err)))
+       if (!(bh = ext4_bread(NULL, dir, block, 0, &err))) {
+               if (!err) {
+                       err = -EIO;
+                       ext4_error(dir->i_sb,
+                                  "Directory hole detected on inode %lu\n",
+                                  dir->i_ino);
+               }
                 return err;
+       }
  
         if (!buffer_verified(bh) &&
             !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data))
@@ -1267,8 +1300,15 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
                 return NULL;
         do {
                 block = dx_get_block(frame->at);
-               if (!(bh = ext4_bread(NULL, dir, block, 0, err)))
+               if (!(bh = ext4_bread(NULL, dir, block, 0, err))) {
+                       if (!(*err)) {
+                               *err = -EIO;
+                               ext4_error(dir->i_sb,
+                                          "Directory hole detected on inode %lu\n",
+                                          dir->i_ino);
+                       }
                         goto errout;
+               }
  
                 if (!buffer_verified(bh) &&
                     !ext4_dirent_csum_verify(dir,
@@ -1801,9 +1841,15 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
         }
         blocks = dir->i_size >> sb->s_blocksize_bits;
         for (block = 0; block < blocks; block++) {
-               bh = ext4_bread(handle, dir, block, 0, &retval);
-               if(!bh)
+               if (!(bh = ext4_bread(handle, dir, block, 0, &retval))) {
+                       if (!retval) {
+                               retval = -EIO;
+                               /* the block was read from @dir: report that inode */
+                               ext4_error(dir->i_sb,
+                                          "Directory hole detected on inode %lu\n",
+                                          dir->i_ino);
+                       }
                         return retval;
+               }
                 if (!buffer_verified(bh) &&
                     !ext4_dirent_csum_verify(dir,
                                 (struct ext4_dir_entry *)bh->b_data))
@@ -1860,8 +1906,15 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
         entries = frame->entries;
         at = frame->at;
  
-       if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
+       if (!(bh = ext4_bread(handle, dir, dx_get_block(frame->at), 0, &err))) {
+               if (!err) {
+                       err = -EIO;
+                       ext4_error(dir->i_sb,
+                                  "Directory hole detected on inode %lu\n",
+                                  dir->i_ino);
+               }
                 goto cleanup;
+       }
  
         if (!buffer_verified(bh) &&
             !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data))
@@ -2149,9 +2202,7 @@ retry:
         err = PTR_ERR(inode);
         if (!IS_ERR(inode)) {
                 init_special_inode(inode, inode->i_mode, rdev);
-#ifdef CONFIG_EXT4_FS_XATTR
                 inode->i_op = &ext4_special_inode_operations;
-#endif
                 err = ext4_add_nondir(handle, dentry, inode);
         }
         ext4_journal_stop(handle);
@@ -2199,9 +2250,15 @@ retry:
         inode->i_op = &ext4_dir_inode_operations;
         inode->i_fop = &ext4_dir_operations;
         inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
-       dir_block = ext4_bread(handle, inode, 0, 1, &err);
-       if (!dir_block)
+       if (!(dir_block = ext4_bread(handle, inode, 0, 1, &err))) {
+               if (!err) {
+                       err = -EIO;
+                       ext4_error(inode->i_sb,
+                                  "Directory hole detected on inode %lu\n",
+                                  inode->i_ino);
+               }
                 goto out_clear_inode;
+       }
         BUFFER_TRACE(dir_block, "get_write_access");
         err = ext4_journal_get_write_access(handle, dir_block);
         if (err)
@@ -2318,6 +2375,11 @@ static int empty_dir(struct inode *inode)
                                         EXT4_ERROR_INODE(inode,
                                                 "error %d reading directory "
                                                 "lblock %u", err, lblock);
+                               else
+                                       ext4_warning(inode->i_sb,
+                                               "bad directory (dir #%lu) - no data block",
+                                               inode->i_ino);
+
                                 offset += sb->s_blocksize;
                                 continue;
                         }
@@ -2362,7 +2424,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
         struct ext4_iloc iloc;
         int err = 0, rc;
  
-       if (!ext4_handle_valid(handle))
+       if (!EXT4_SB(sb)->s_journal)
                 return 0;
  
         mutex_lock(&EXT4_SB(sb)->s_orphan_lock);
@@ -2436,8 +2498,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
         struct ext4_iloc iloc;
         int err = 0;
  
-       /* ext4_handle_valid() assumes a valid handle_t pointer */
-       if (handle && !ext4_handle_valid(handle))
+       if (!EXT4_SB(inode->i_sb)->s_journal)
                 return 0;
  
         mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
@@ -2456,7 +2517,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
          * transaction handle with which to update the orphan list on
          * disk, but we still need to remove the inode from the linked
          * list in memory. */
-       if (sbi->s_journal && !handle)
+       if (!handle)
                 goto out;
  
         err = ext4_reserve_inode_write(handle, inode, &iloc);
@@ -2826,9 +2887,15 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
                                 goto end_rename;
                 }
                 retval = -EIO;
-               dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval);
-               if (!dir_bh)
+               if (!(dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval))) {
+                       if (!retval) {
+                               retval = -EIO;
+                               ext4_error(old_inode->i_sb,
+                                          "Directory hole detected on inode %lu\n",
+                                          old_inode->i_ino);
+                       }
                         goto end_rename;
+               }
                 if (!buffer_verified(dir_bh) &&
                     !ext4_dirent_csum_verify(old_inode,
                                 (struct ext4_dir_entry *)dir_bh->b_data))
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c

index dcdeef1..68e896e 100644 (file)
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -71,6 +71,9 @@ void ext4_free_io_end(ext4_io_end_t *io)
         int i;
  
         BUG_ON(!io);
+       BUG_ON(!list_empty(&io->list));
+       BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN);
+
         if (io->page)
                 put_page(io->page);
         for (i = 0; i < io->num_io_pages; i++)
@@ -81,13 +84,8 @@ void ext4_free_io_end(ext4_io_end_t *io)
         kmem_cache_free(io_end_cachep, io);
  }
  
-/*
- * check a range of space and convert unwritten extents to written.
- *
- * Called with inode->i_mutex; we depend on this when we manipulate
- * io->flag, since we could otherwise race with ext4_flush_completed_IO()
- */
-int ext4_end_io_nolock(ext4_io_end_t *io)
+/* check a range of space and convert unwritten extents to written. */
+static int ext4_end_io(ext4_io_end_t *io)
  {
         struct inode *inode = io->inode;
         loff_t offset = io->offset;
@@ -106,63 +104,136 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
                          "(inode %lu, offset %llu, size %zd, error %d)",
                          inode->i_ino, offset, size, ret);
         }
-
         if (io->iocb)
                 aio_complete(io->iocb, io->result, 0);
  
         if (io->flag & EXT4_IO_END_DIRECT)
                 inode_dio_done(inode);
         /* Wake up anyone waiting on unwritten extent conversion */
-       if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten))
+       if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
                 wake_up_all(ext4_ioend_wq(io->inode));
         return ret;
  }
  
-/*
- * work on completed aio dio IO, to convert unwritten extents to extents
- */
-static void ext4_end_io_work(struct work_struct *work)
+/*
+ * Debug helper: log every io_end on @inode's completed-IO list along with
+ * the entries linked before and after it.  The body is compiled only when
+ * EXT4FS_DEBUG is defined; otherwise this is an empty function.
+ */
+static void dump_completed_IO(struct inode *inode)
+{
+#ifdef EXT4FS_DEBUG
+       struct list_head *cur, *before, *after;
+       ext4_io_end_t *io, *io0, *io1;
+
+       if (list_empty(&EXT4_I(inode)->i_completed_io_list)) {
+               ext4_debug("inode %lu completed_io list is empty\n",
+                          inode->i_ino);
+               return;
+       }
+
+       ext4_debug("Dump inode %lu completed_io list\n", inode->i_ino);
+       list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list) {
+               /*
+                * For the first/last entry the neighbour is the list head,
+                * so io0/io1 are only printed, never dereferenced.
+                */
+               cur = &io->list;
+               before = cur->prev;
+               io0 = container_of(before, ext4_io_end_t, list);
+               after = cur->next;
+               io1 = container_of(after, ext4_io_end_t, list);
+
+               ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
+                           io, inode->i_ino, io0, io1);
+       }
+#endif
+}
+
+/*
+ * Add the io_end to per-inode completed end_io list.
+ *
+ * The io_end must have EXT4_IO_END_UNWRITTEN set.  Its work item is queued
+ * on the dio_unwritten_wq workqueue (and the io_end marked
+ * EXT4_IO_END_QUEUED) only when the list was empty beforehand; otherwise
+ * the io_end is just appended.  Everything after the BUG_ON is done under
+ * i_completed_io_lock with interrupts disabled.
+ */
+void ext4_add_complete_io(ext4_io_end_t *io_end)
  {
-       ext4_io_end_t           *io = container_of(work, ext4_io_end_t, work);
-       struct inode            *inode = io->inode;
-       struct ext4_inode_info  *ei = EXT4_I(inode);
-       unsigned long           flags;
+       struct ext4_inode_info *ei = EXT4_I(io_end->inode);
+       struct workqueue_struct *wq;
+       unsigned long flags;
+
+       BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
+       wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
  
         spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-       if (io->flag & EXT4_IO_END_IN_FSYNC)
-               goto requeue;
-       if (list_empty(&io->list)) {
-               spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-               goto free;
+       if (list_empty(&ei->i_completed_io_list)) {
+               io_end->flag |= EXT4_IO_END_QUEUED;
+               queue_work(wq, &io_end->work);
         }
+       list_add_tail(&io_end->list, &ei->i_completed_io_list);
+       spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+}
  
-       if (!mutex_trylock(&inode->i_mutex)) {
-               bool was_queued;
-requeue:
-               was_queued = !!(io->flag & EXT4_IO_END_QUEUED);
-               io->flag |= EXT4_IO_END_QUEUED;
-               spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-               /*
-                * Requeue the work instead of waiting so that the work
-                * items queued after this can be processed.
-                */
-               queue_work(EXT4_SB(inode->i_sb)->dio_unwritten_wq, &io->work);
-               /*
-                * To prevent the ext4-dio-unwritten thread from keeping
-                * requeueing end_io requests and occupying cpu for too long,
-                * yield the cpu if it sees an end_io request that has already
-                * been requeued.
-                */
-               if (was_queued)
-                       yield();
-               return;
+/*
+ * Convert every io_end currently on @inode's completed-IO list.
+ *
+ * The list is detached under i_completed_io_lock, ext4_end_io() is run on
+ * each entry with the lock dropped, and the lock is then retaken to clear
+ * EXT4_IO_END_UNWRITTEN and decide which io_ends can be freed here.
+ *
+ * @work_io is the io_end whose work item is running when called from
+ * ext4_end_io_work(), or NULL when called from ext4_flush_unwritten_io().
+ *
+ * Returns 0, or the first non-zero value returned by ext4_end_io().
+ */
+static int ext4_do_flush_completed_IO(struct inode *inode,
+                                     ext4_io_end_t *work_io)
+{
+       ext4_io_end_t *io;
+       struct list_head unwritten, complete, to_free;
+       unsigned long flags;
+       struct ext4_inode_info *ei = EXT4_I(inode);
+       int err, ret = 0;
+
+       INIT_LIST_HEAD(&complete);
+       INIT_LIST_HEAD(&to_free);
+
+       /* Move the whole per-inode list onto the local "unwritten" head. */
+       spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+       dump_completed_IO(inode);
+       list_replace_init(&ei->i_completed_io_list, &unwritten);
+       spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+
+       /* Run the conversions without holding the spinlock. */
+       while (!list_empty(&unwritten)) {
+               io = list_entry(unwritten.next, ext4_io_end_t, list);
+               BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN));
+               list_del_init(&io->list);
+
+               err = ext4_end_io(io);
+               if (unlikely(!ret && err))
+                       ret = err;
+
+               list_add_tail(&io->list, &complete);
+       }
+       spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+       while (!list_empty(&complete)) {
+               io = list_entry(complete.next, ext4_io_end_t, list);
+               io->flag &= ~EXT4_IO_END_UNWRITTEN;
+               /* end_io context can not be destroyed now because it is still
+                * used by the queued worker. Worker thread will destroy it later */
+               if (io->flag & EXT4_IO_END_QUEUED)
+                       list_del_init(&io->list);
+               else
+                       list_move(&io->list, &to_free);
+       }
+       /* If we are called from worker context, it is time to clear queued
+        * flag, and destroy its end_io if it was converted already */
+       if (work_io) {
+               work_io->flag &= ~EXT4_IO_END_QUEUED;
+               if (!(work_io->flag & EXT4_IO_END_UNWRITTEN))
+                       list_add_tail(&work_io->list, &to_free);
         }
-       list_del_init(&io->list);
         spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-       (void) ext4_end_io_nolock(io);
-       mutex_unlock(&inode->i_mutex);
-free:
-       ext4_free_io_end(io);
+
+       /* Free the collected io_ends after the lock has been dropped. */
+       while (!list_empty(&to_free)) {
+               io = list_entry(to_free.next, ext4_io_end_t, list);
+               list_del_init(&io->list);
+               ext4_free_io_end(io);
+       }
+       return ret;
+}
+
+/*
+ * Workqueue callback for an io_end's work item: process the completed-IO
+ * list of the inode that owns the io_end, passing the io_end itself so the
+ * flush can clear its queued state.
+ */
+static void ext4_end_io_work(struct work_struct *work)
+{
+       ext4_io_end_t *io_end;
+
+       io_end = container_of(work, ext4_io_end_t, work);
+       ext4_do_flush_completed_IO(io_end->inode, io_end);
+}
+
+/*
+ * Process @inode's completed-IO list from the caller's context, then call
+ * ext4_unwritten_wait().  Warns once if i_mutex is not held and the inode
+ * is not in I_FREEING state.  Returns the result of the flush.
+ */
+int ext4_flush_unwritten_io(struct inode *inode)
+{
+       int err;
+
+       WARN_ON_ONCE(!(mutex_is_locked(&inode->i_mutex) ||
+                      (inode->i_state & I_FREEING)));
+       err = ext4_do_flush_completed_IO(inode, NULL);
+       ext4_unwritten_wait(inode);
+
+       return err;
  }
  
  ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
@@ -195,9 +266,7 @@ static void buffer_io_error(struct buffer_head *bh)
  static void ext4_end_bio(struct bio *bio, int error)
  {
         ext4_io_end_t *io_end = bio->bi_private;
-       struct workqueue_struct *wq;
         struct inode *inode;
-       unsigned long flags;
         int i;
         sector_t bi_sector = bio->bi_sector;
  
@@ -255,14 +324,7 @@ static void ext4_end_bio(struct bio *bio, int error)
                 return;
         }
  
-       /* Add the io_end to per-inode completed io list*/
-       spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
-       list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
-       spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
-
-       wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
-       /* queue the work to convert unwritten extents to written */
-       queue_work(wq, &io_end->work);
+       ext4_add_complete_io(io_end);
  }
  
  void ext4_io_submit(struct ext4_io_submit *io)
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c

index 41f6ef6..7a75e10 100644 (file)
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -45,6 +45,28 @@ void ext4_resize_end(struct super_block *sb)
         smp_mb__after_clear_bit();
  }
  
+/*
+ * Round @group down to a multiple of 2^EXT4_DESC_PER_BLOCK_BITS(sb), which
+ * (going by the name) is the first group of its meta block group.
+ */
+static ext4_group_t ext4_meta_bg_first_group(struct super_block *sb,
+                                            ext4_group_t group) {
+       const int desc_bits = EXT4_DESC_PER_BLOCK_BITS(sb);
+
+       return (group >> desc_bits) << desc_bits;
+}
+
+/* First block number of the group returned by ext4_meta_bg_first_group(). */
+static ext4_fsblk_t ext4_meta_bg_first_block_no(struct super_block *sb,
+                                            ext4_group_t group) {
+       ext4_group_t first = ext4_meta_bg_first_group(sb, group);
+
+       return ext4_group_first_block_no(sb, first);
+}
+
+/*
+ * Number of blocks in @group counted as overhead: its group descriptor
+ * blocks and, when the group carries a superblock copy, one block for that
+ * copy plus the reserved GDT blocks.
+ */
+static ext4_grpblk_t ext4_group_overhead_blocks(struct super_block *sb,
+                                               ext4_group_t group) {
+       ext4_grpblk_t nr = ext4_bg_num_gdb(sb, group);
+
+       if (!ext4_bg_has_super(sb, group))
+               return nr;
+
+       return nr + 1 + le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks);
+}
+
  #define outside(b, first, last)        ((b) < (first) || (b) >= (last))
  #define inside(b, first, last) ((b) >= (first) && (b) < (last))
  
@@ -57,9 +79,7 @@ static int verify_group_input(struct super_block *sb,
         ext4_fsblk_t end = start + input->blocks_count;
         ext4_group_t group = input->group;
         ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group;
-       unsigned overhead = ext4_bg_has_super(sb, group) ?
-               (1 + ext4_bg_num_gdb(sb, group) +
-                le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
+       unsigned overhead = ext4_group_overhead_blocks(sb, group);
         ext4_fsblk_t metaend = start + overhead;
         struct buffer_head *bh = NULL;
         ext4_grpblk_t free_blocks_count, offset;
@@ -200,13 +220,15 @@ static void free_flex_gd(struct ext4_new_flex_group_data *flex_gd)
   * be a partial of a flex group.
   *
   * @sb: super block of fs to which the groups belongs
+ *
+ * Returns 0 on a successful allocation of the metadata blocks in the
+ * block group.
   */
-static void ext4_alloc_group_tables(struct super_block *sb,
+static int ext4_alloc_group_tables(struct super_block *sb,
                                 struct ext4_new_flex_group_data *flex_gd,
                                 int flexbg_size)
  {
         struct ext4_new_group_data *group_data = flex_gd->groups;
-       struct ext4_super_block *es = EXT4_SB(sb)->s_es;
         ext4_fsblk_t start_blk;
         ext4_fsblk_t last_blk;
         ext4_group_t src_group;
@@ -226,23 +248,24 @@ static void ext4_alloc_group_tables(struct super_block *sb,
                (last_group & ~(flexbg_size - 1))));
  next_group:
         group = group_data[0].group;
+       if (src_group >= group_data[0].group + flex_gd->count)
+               return -ENOSPC;
         start_blk = ext4_group_first_block_no(sb, src_group);
         last_blk = start_blk + group_data[src_group - group].blocks_count;
  
-       overhead = ext4_bg_has_super(sb, src_group) ?
-                  (1 + ext4_bg_num_gdb(sb, src_group) +
-                   le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
+       overhead = ext4_group_overhead_blocks(sb, src_group);
  
         start_blk += overhead;
  
-       BUG_ON(src_group >= group_data[0].group + flex_gd->count);
         /* We collect contiguous blocks as much as possible. */
         src_group++;
-       for (; src_group <= last_group; src_group++)
-               if (!ext4_bg_has_super(sb, src_group))
+       for (; src_group <= last_group; src_group++) {
+               /* only a group with no metadata overhead extends the range */
+               overhead = ext4_group_overhead_blocks(sb, src_group);
+               if (overhead == 0)
                         last_blk += group_data[src_group - group].blocks_count;
                 else
                         break;
+       }
  
         /* Allocate block bitmaps */
         for (; bb_index < flex_gd->count; bb_index++) {
@@ -300,6 +323,7 @@ next_group:
                                group_data[i].free_blocks_count);
                 }
         }
+       return 0;
  }
  
  static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
@@ -433,11 +457,13 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
         ext4_group_t group, count;
         struct buffer_head *bh = NULL;
         int reserved_gdb, i, j, err = 0, err2;
+       int meta_bg;
  
         BUG_ON(!flex_gd->count || !group_data ||
                group_data[0].group != sbi->s_groups_count);
  
         reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks);
+       meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
  
         /* This transaction may be extended/restarted along the way */
         handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA);
@@ -447,12 +473,25 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
         group = group_data[0].group;
         for (i = 0; i < flex_gd->count; i++, group++) {
                 unsigned long gdblocks;
+               ext4_grpblk_t overhead;
  
                 gdblocks = ext4_bg_num_gdb(sb, group);
                 start = ext4_group_first_block_no(sb, group);
  
+               if (meta_bg == 0 && !ext4_bg_has_super(sb, group))
+                       goto handle_itb;
+
+               if (meta_bg == 1) {
+                       ext4_group_t first_group;
+                       first_group = ext4_meta_bg_first_group(sb, group);
+                       if (first_group != group + 1 &&
+                           first_group != group + EXT4_DESC_PER_BLOCK(sb) - 1)
+                               goto handle_itb;
+               }
+
+               block = start + ext4_bg_has_super(sb, group);
                 /* Copy all of the GDT blocks into the backup in this group */
-               for (j = 0, block = start + 1; j < gdblocks; j++, block++) {
+               for (j = 0; j < gdblocks; j++, block++) {
                         struct buffer_head *gdb;
  
                         ext4_debug("update backup group %#04llx\n", block);
@@ -493,6 +532,7 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
                                 goto out;
                 }
  
+handle_itb:
                 /* Initialize group tables of the grop @group */
                 if (!(bg_flags[i] & EXT4_BG_INODE_ZEROED))
                         goto handle_bb;
@@ -521,11 +561,11 @@ handle_bb:
                         err = PTR_ERR(bh);
                         goto out;
                 }
-               if (ext4_bg_has_super(sb, group)) {
+               overhead = ext4_group_overhead_blocks(sb, group);
+               if (overhead != 0) {
                         ext4_debug("mark backup superblock %#04llx (+0)\n",
                                    start);
-                       ext4_set_bits(bh->b_data, 0, gdblocks + reserved_gdb +
-                                                    1);
+                       ext4_set_bits(bh->b_data, 0, overhead);
                 }
                 ext4_mark_bitmap_end(group_data[i].blocks_count,
                                      sb->s_blocksize * 8, bh->b_data);
@@ -822,6 +862,45 @@ exit_bh:
  }
  
  /*
+ * add_new_gdb_meta_bg is the sister of add_new_gdb.
+ */
+static int add_new_gdb_meta_bg(struct super_block *sb,
+                              handle_t *handle, ext4_group_t group) {
+       ext4_fsblk_t gdblock;
+       struct buffer_head *gdb_bh;
+       struct buffer_head **o_group_desc, **n_group_desc;
+       unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
+       int err;
+
+       gdblock = ext4_meta_bg_first_block_no(sb, group) +
+                  ext4_bg_has_super(sb, group);
+       gdb_bh = sb_bread(sb, gdblock);
+       if (!gdb_bh)
+               return -EIO;
+       n_group_desc = ext4_kvmalloc((gdb_num + 1) *
+                                    sizeof(struct buffer_head *),
+                                    GFP_NOFS);
+       if (!n_group_desc) {
+               err = -ENOMEM;
+               ext4_warning(sb, "not enough memory for %lu groups",
+                            gdb_num + 1);
+               return err;
+       }
+
+       o_group_desc = EXT4_SB(sb)->s_group_desc;
+       memcpy(n_group_desc, o_group_desc,
+              EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
+       n_group_desc[gdb_num] = gdb_bh;
+       EXT4_SB(sb)->s_group_desc = n_group_desc;
+       EXT4_SB(sb)->s_gdb_count++;
+       ext4_kvfree(o_group_desc);
+       err = ext4_journal_get_write_access(handle, gdb_bh);
+       if (unlikely(err))
+               brelse(gdb_bh);
+       return err;
+}
+
+/*
   * Called when we are adding a new group which has a backup copy of each of
   * the GDT blocks (i.e. sparse group) and there are reserved GDT blocks.
   * We need to add these reserved backup GDT blocks to the resize inode, so
@@ -949,16 +1028,16 @@ exit_free:
   * do not copy the full number of backups at this time.  The resize
   * which changed s_groups_count will backup again.
   */
-static void update_backups(struct super_block *sb,
-                          int blk_off, char *data, int size)
+static void update_backups(struct super_block *sb, int blk_off, char *data,
+                          int size, int meta_bg)
  {
         struct ext4_sb_info *sbi = EXT4_SB(sb);
-       const ext4_group_t last = sbi->s_groups_count;
+       ext4_group_t last;
         const int bpg = EXT4_BLOCKS_PER_GROUP(sb);
         unsigned three = 1;
         unsigned five = 5;
         unsigned seven = 7;
-       ext4_group_t group;
+       ext4_group_t group = 0;
         int rest = sb->s_blocksize - size;
         handle_t *handle;
         int err = 0, err2;
@@ -970,10 +1049,17 @@ static void update_backups(struct super_block *sb,
                 goto exit_err;
         }
  
-       ext4_superblock_csum_set(sb, (struct ext4_super_block *)data);
+       if (meta_bg == 0) {
+               group = ext4_list_backups(sb, &three, &five, &seven);
+               last = sbi->s_groups_count;
+       } else {
+               group = ext4_meta_bg_first_group(sb, group) + 1;
+               last = (ext4_group_t)(group + EXT4_DESC_PER_BLOCK(sb) - 2);
+       }
  
-       while ((group = ext4_list_backups(sb, &three, &five, &seven)) < last) {
+       while (group < sbi->s_groups_count) {
                 struct buffer_head *bh;
+               ext4_fsblk_t backup_block;
  
                 /* Out of journal space, and can't get more - abort - so sad */
                 if (ext4_handle_valid(handle) &&
@@ -982,13 +1068,20 @@ static void update_backups(struct super_block *sb,
                     (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA)))
                         break;
  
-               bh = sb_getblk(sb, group * bpg + blk_off);
+               if (meta_bg == 0)
+                       backup_block = group * bpg + blk_off;
+               else
+                       backup_block = (ext4_group_first_block_no(sb, group) +
+                                       ext4_bg_has_super(sb, group));
+
+               bh = sb_getblk(sb, backup_block);
                 if (!bh) {
                         err = -EIO;
                         break;
                 }
-               ext4_debug("update metadata backup %#04lx\n",
-                         (unsigned long)bh->b_blocknr);
+               ext4_debug("update metadata backup %llu(+%llu)\n",
+                          backup_block, backup_block -
+                          ext4_group_first_block_no(sb, group));
                 if ((err = ext4_journal_get_write_access(handle, bh)))
                         break;
                 lock_buffer(bh);
@@ -1001,6 +1094,13 @@ static void update_backups(struct super_block *sb,
                 if (unlikely(err))
                         ext4_std_error(sb, err);
                 brelse(bh);
+
+               if (meta_bg == 0)
+                       group = ext4_list_backups(sb, &three, &five, &seven);
+               else if (group == last)
+                       break;
+               else
+                       group = last;
         }
         if ((err2 = ext4_journal_stop(handle)) && !err)
                 err = err2;
@@ -1043,7 +1143,9 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,
         struct ext4_super_block *es = sbi->s_es;
         struct buffer_head *gdb_bh;
         int i, gdb_off, gdb_num, err = 0;
+       int meta_bg;
  
+       meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
         for (i = 0; i < count; i++, group++) {
                 int reserved_gdb = ext4_bg_has_super(sb, group) ?
                         le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
@@ -1063,8 +1165,11 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,
  
                         if (!err && reserved_gdb && ext4_bg_num_gdb(sb, group))
                                 err = reserve_backup_gdb(handle, resize_inode, group);
-               } else
+               } else if (meta_bg != 0) {
+                       err = add_new_gdb_meta_bg(sb, handle, group);
+               } else {
                         err = add_new_gdb(handle, resize_inode, group);
+               }
                 if (err)
                         break;
         }
@@ -1076,17 +1181,12 @@ static struct buffer_head *ext4_get_bitmap(struct super_block *sb, __u64 block)
         struct buffer_head *bh = sb_getblk(sb, block);
         if (!bh)
                 return NULL;
-
-       if (bitmap_uptodate(bh))
-               return bh;
-
-       lock_buffer(bh);
-       if (bh_submit_read(bh) < 0) {
-               unlock_buffer(bh);
-               brelse(bh);
-               return NULL;
+       if (!bh_uptodate_or_lock(bh)) {
+               if (bh_submit_read(bh) < 0) {
+                       brelse(bh);
+                       return NULL;
+               }
         }
-       unlock_buffer(bh);
  
         return bh;
  }
@@ -1161,6 +1261,9 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb,
                 ext4_free_group_clusters_set(sb, gdp,
                                              EXT4_B2C(sbi, group_data->free_blocks_count));
                 ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
+               if (ext4_has_group_desc_csum(sb))
+                       ext4_itable_unused_set(sb, gdp,
+                                              EXT4_INODES_PER_GROUP(sb));
                 gdp->bg_flags = cpu_to_le16(*bg_flags);
                 ext4_group_desc_csum_set(sb, group, gdp);
  
@@ -1216,7 +1319,7 @@ static void ext4_update_super(struct super_block *sb,
         }
  
         reserved_blocks = ext4_r_blocks_count(es) * 100;
-       do_div(reserved_blocks, ext4_blocks_count(es));
+       reserved_blocks = div64_u64(reserved_blocks, ext4_blocks_count(es));
         reserved_blocks *= blocks_count;
         do_div(reserved_blocks, 100);
  
@@ -1227,6 +1330,7 @@ static void ext4_update_super(struct super_block *sb,
         le32_add_cpu(&es->s_free_inodes_count, EXT4_INODES_PER_GROUP(sb) *
                      flex_gd->count);
  
+       ext4_debug("free blocks count %llu", ext4_free_blocks_count(es));
         /*
          * We need to protect s_groups_count against other CPUs seeing
          * inconsistent state in the superblock.
@@ -1261,6 +1365,8 @@ static void ext4_update_super(struct super_block *sb,
         percpu_counter_add(&sbi->s_freeinodes_counter,
                            EXT4_INODES_PER_GROUP(sb) * flex_gd->count);
  
+       ext4_debug("free blocks count %llu",
+                  percpu_counter_read(&sbi->s_freeclusters_counter));
         if (EXT4_HAS_INCOMPAT_FEATURE(sb,
                                       EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
             sbi->s_log_groups_per_flex) {
@@ -1349,16 +1455,24 @@ exit_journal:
                 err = err2;
  
         if (!err) {
-               int i;
+               int gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
+               int gdb_num_end = ((group + flex_gd->count - 1) /
+                                  EXT4_DESC_PER_BLOCK(sb));
+               int meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb,
+                               EXT4_FEATURE_INCOMPAT_META_BG);
+               sector_t old_gdb = 0;
+
                 update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
-                              sizeof(struct ext4_super_block));
-               for (i = 0; i < flex_gd->count; i++, group++) {
+                              sizeof(struct ext4_super_block), 0);
+               for (; gdb_num <= gdb_num_end; gdb_num++) {
                         struct buffer_head *gdb_bh;
-                       int gdb_num;
-                       gdb_num = group / EXT4_BLOCKS_PER_GROUP(sb);
+
                         gdb_bh = sbi->s_group_desc[gdb_num];
+                       if (old_gdb == gdb_bh->b_blocknr)
+                               continue;
                         update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data,
-                                      gdb_bh->b_size);
+                                      gdb_bh->b_size, meta_bg);
+                       old_gdb = gdb_bh->b_blocknr;
                 }
         }
  exit:
@@ -1402,9 +1516,7 @@ static int ext4_setup_next_flex_gd(struct super_block *sb,
  
                 group_data[i].group = group + i;
                 group_data[i].blocks_count = blocks_per_group;
-               overhead = ext4_bg_has_super(sb, group + i) ?
-                          (1 + ext4_bg_num_gdb(sb, group + i) +
-                           le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
+               overhead = ext4_group_overhead_blocks(sb, group + i);
                 group_data[i].free_blocks_count = blocks_per_group - overhead;
                 if (ext4_has_group_desc_csum(sb))
                         flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT |
@@ -1492,6 +1604,14 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
         if (err)
                 goto out;
  
+       err = ext4_alloc_flex_bg_array(sb, input->group + 1);
+       if (err)
+               return err;
+
+       err = ext4_mb_alloc_groupinfo(sb, input->group + 1);
+       if (err)
+               goto out;
+
         flex_gd.count = 1;
         flex_gd.groups = input;
         flex_gd.bg_flags = &bg_flags;
@@ -1544,11 +1664,13 @@ errout:
                 err = err2;
  
         if (!err) {
+               ext4_fsblk_t first_block;
+               first_block = ext4_group_first_block_no(sb, 0);
                 if (test_opt(sb, DEBUG))
                         printk(KERN_DEBUG "EXT4-fs: extended group to %llu "
                                "blocks\n", ext4_blocks_count(es));
-               update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es,
-                              sizeof(struct ext4_super_block));
+               update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr - first_block,
+                              (char *)es, sizeof(struct ext4_super_block), 0);
         }
         return err;
  }
@@ -1631,6 +1753,94 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
         return err;
  } /* ext4_group_extend */
  
+
+/*
+ * Return how many descriptor blocks are needed to hold @groups entries,
+ * i.e. @groups divided by EXT4_DESC_PER_BLOCK(sb), rounded up.
+ */
+static int num_desc_blocks(struct super_block *sb, ext4_group_t groups)
+{
+       return (groups + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb);
+}
+
+/*
+ * Release the resize inode and drop the resize_inode feature if there
+ * are no more reserved gdt blocks, and then convert the file system
+ * to enable meta_bg.
+ *
+ * @sb:    super block of the file system being converted
+ * @inode: the resize inode, or NULL when there is no resize inode to
+ *         release
+ *
+ * When @inode is given it is sanity checked first: s_reserved_gdt_blocks
+ * must be zero (-EPERM otherwise), and the inode must own exactly one
+ * block, referenced only from i_data[EXT4_DIND_BLOCK] (-EINVAL otherwise).
+ *
+ * Returns 0 on success or a negative errno.  The first error hit while
+ * the handle is open takes precedence over the result of
+ * ext4_journal_stop().
+ */
+static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode)
+{
+       handle_t *handle;
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       struct ext4_super_block *es = sbi->s_es;
+       struct ext4_inode_info *ei = EXT4_I(inode);
+       ext4_fsblk_t nr;
+       int i, ret, err = 0;
+       int credits = 1;        /* the superblock */
+
+       ext4_msg(sb, KERN_INFO, "Converting file system to meta_bg");
+       if (inode) {
+               if (es->s_reserved_gdt_blocks) {
+                       ext4_error(sb, "Unexpected non-zero "
+                                  "s_reserved_gdt_blocks");
+                       return -EPERM;
+               }
+
+               /* Do a quick sanity check of the resize inode */
+               if (inode->i_blocks != 1 << (inode->i_blkbits - 9))
+                       goto invalid_resize_inode;
+               /* Only the double-indirect slot may (and must) be in use */
+               for (i = 0; i < EXT4_N_BLOCKS; i++) {
+                       if (i == EXT4_DIND_BLOCK) {
+                               if (ei->i_data[i])
+                                       continue;
+                               else
+                                       goto invalid_resize_inode;
+                       }
+                       if (ei->i_data[i])
+                               goto invalid_resize_inode;
+               }
+               credits += 3;   /* block bitmap, bg descriptor, resize inode */
+       }
+
+       handle = ext4_journal_start_sb(sb, credits);
+       if (IS_ERR(handle))
+               return PTR_ERR(handle);
+
+       err = ext4_journal_get_write_access(handle, sbi->s_sbh);
+       if (err)
+               goto errout;
+
+       EXT4_CLEAR_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE);
+       EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
+       sbi->s_es->s_first_meta_bg =
+               cpu_to_le32(num_desc_blocks(sb, sbi->s_groups_count));
+
+       err = ext4_handle_dirty_super(handle, sb);
+       if (err) {
+               ext4_std_error(sb, err);
+               goto errout;
+       }
+
+       if (inode) {
+               /* Give back the resize inode's only block and empty it */
+               nr = le32_to_cpu(ei->i_data[EXT4_DIND_BLOCK]);
+               ext4_free_blocks(handle, inode, NULL, nr, 1,
+                                EXT4_FREE_BLOCKS_METADATA |
+                                EXT4_FREE_BLOCKS_FORGET);
+               ei->i_data[EXT4_DIND_BLOCK] = 0;
+               inode->i_blocks = 0;
+
+               err = ext4_mark_inode_dirty(handle, inode);
+               if (err)
+                       ext4_std_error(sb, err);
+       }
+
+errout:
+       ret = ext4_journal_stop(handle);
+       if (!err)
+               err = ret;
+       /* Return the combined status, not just the journal-stop result */
+       return err;
+
+invalid_resize_inode:
+       ext4_error(sb, "corrupted/inconsistent resize inode");
+       return -EINVAL;
+}
+
  /*
   * ext4_resize_fs() resizes a fs to new size specified by @n_blocks_count
   *
@@ -1643,21 +1853,31 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
         struct ext4_sb_info *sbi = EXT4_SB(sb);
         struct ext4_super_block *es = sbi->s_es;
         struct buffer_head *bh;
-       struct inode *resize_inode;
-       ext4_fsblk_t o_blocks_count;
-       ext4_group_t o_group;
-       ext4_group_t n_group;
-       ext4_grpblk_t offset, add;
+       struct inode *resize_inode = NULL;
+       ext4_grpblk_t add, offset;
         unsigned long n_desc_blocks;
         unsigned long o_desc_blocks;
-       unsigned long desc_blocks;
-       int err = 0, flexbg_size = 1;
+       ext4_group_t o_group;
+       ext4_group_t n_group;
+       ext4_fsblk_t o_blocks_count;
+       ext4_fsblk_t n_blocks_count_retry = 0;
+       unsigned long last_update_time = 0;
+       int err = 0, flexbg_size = 1 << sbi->s_log_groups_per_flex;
+       int meta_bg;
  
+       /* See if the device is actually as big as what was requested */
+       bh = sb_bread(sb, n_blocks_count - 1);
+       if (!bh) {
+               ext4_warning(sb, "can't read last block, resize aborted");
+               return -ENOSPC;
+       }
+       brelse(bh);
+
+retry:
         o_blocks_count = ext4_blocks_count(es);
  
-       if (test_opt(sb, DEBUG))
-               ext4_msg(sb, KERN_DEBUG, "resizing filesystem from %llu "
-                      "to %llu blocks", o_blocks_count, n_blocks_count);
+       ext4_msg(sb, KERN_INFO, "resizing filesystem from %llu "
+                "to %llu blocks", o_blocks_count, n_blocks_count);
  
         if (n_blocks_count < o_blocks_count) {
                 /* On-line shrinking not supported */
@@ -1672,32 +1892,49 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
         ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset);
         ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset);
  
-       n_desc_blocks = (n_group + EXT4_DESC_PER_BLOCK(sb)) /
-                       EXT4_DESC_PER_BLOCK(sb);
-       o_desc_blocks = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
-                       EXT4_DESC_PER_BLOCK(sb);
-       desc_blocks = n_desc_blocks - o_desc_blocks;
+       n_desc_blocks = num_desc_blocks(sb, n_group + 1);
+       o_desc_blocks = num_desc_blocks(sb, sbi->s_groups_count);
  
-       if (desc_blocks &&
-           (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE) ||
-            le16_to_cpu(es->s_reserved_gdt_blocks) < desc_blocks)) {
-               ext4_warning(sb, "No reserved GDT blocks, can't resize");
-               return -EPERM;
-       }
+       meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
  
-       resize_inode = ext4_iget(sb, EXT4_RESIZE_INO);
-       if (IS_ERR(resize_inode)) {
-               ext4_warning(sb, "Error opening resize inode");
-               return PTR_ERR(resize_inode);
+       if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE)) {
+               if (meta_bg) {
+                       ext4_error(sb, "resize_inode and meta_bg enabled "
+                                  "simultaneously");
+                       return -EINVAL;
+               }
+               if (n_desc_blocks > o_desc_blocks +
+                   le16_to_cpu(es->s_reserved_gdt_blocks)) {
+                       n_blocks_count_retry = n_blocks_count;
+                       n_desc_blocks = o_desc_blocks +
+                               le16_to_cpu(es->s_reserved_gdt_blocks);
+                       n_group = n_desc_blocks * EXT4_DESC_PER_BLOCK(sb);
+                       n_blocks_count = n_group * EXT4_BLOCKS_PER_GROUP(sb);
+                       n_group--; /* set to last group number */
+               }
+
+               if (!resize_inode)
+                       resize_inode = ext4_iget(sb, EXT4_RESIZE_INO);
+               if (IS_ERR(resize_inode)) {
+                       ext4_warning(sb, "Error opening resize inode");
+                       return PTR_ERR(resize_inode);
+               }
         }
  
-       /* See if the device is actually as big as what was requested */
-       bh = sb_bread(sb, n_blocks_count - 1);
-       if (!bh) {
-               ext4_warning(sb, "can't read last block, resize aborted");
-               return -ENOSPC;
+       if ((!resize_inode && !meta_bg) || n_blocks_count == o_blocks_count) {
+               err = ext4_convert_meta_bg(sb, resize_inode);
+               if (err)
+                       goto out;
+               if (resize_inode) {
+                       iput(resize_inode);
+                       resize_inode = NULL;
+               }
+               if (n_blocks_count_retry) {
+                       n_blocks_count = n_blocks_count_retry;
+                       n_blocks_count_retry = 0;
+                       goto retry;
+               }
         }
-       brelse(bh);
  
         /* extend the last group */
         if (n_group == o_group)
@@ -1710,12 +1947,15 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
                         goto out;
         }
  
-       if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
-           es->s_log_groups_per_flex)
-               flexbg_size = 1 << es->s_log_groups_per_flex;
+       if (ext4_blocks_count(es) == n_blocks_count)
+               goto out;
  
-       o_blocks_count = ext4_blocks_count(es);
-       if (o_blocks_count == n_blocks_count)
+       err = ext4_alloc_flex_bg_array(sb, n_group + 1);
+       if (err)
+               return err;
+
+       err = ext4_mb_alloc_groupinfo(sb, n_group + 1);
+       if (err)
                 goto out;
  
         flex_gd = alloc_flex_gd(flexbg_size);
@@ -1729,19 +1969,33 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
          */
         while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count,
                                               flexbg_size)) {
-               ext4_alloc_group_tables(sb, flex_gd, flexbg_size);
+               if (jiffies - last_update_time > HZ * 10) {
+                       if (last_update_time)
+                               ext4_msg(sb, KERN_INFO,
+                                        "resized to %llu blocks",
+                                        ext4_blocks_count(es));
+                       last_update_time = jiffies;
+               }
+               if (ext4_alloc_group_tables(sb, flex_gd, flexbg_size) != 0)
+                       break;
                 err = ext4_flex_group_add(sb, resize_inode, flex_gd);
                 if (unlikely(err))
                         break;
         }
  
+       if (!err && n_blocks_count_retry) {
+               n_blocks_count = n_blocks_count_retry;
+               n_blocks_count_retry = 0;
+               free_flex_gd(flex_gd);
+               flex_gd = NULL;
+               goto retry;
+       }
+
  out:
         if (flex_gd)
                 free_flex_gd(flex_gd);
-
-       iput(resize_inode);
-       if (test_opt(sb, DEBUG))
-               ext4_msg(sb, KERN_DEBUG, "resized filesystem from %llu "
-                      "upto %llu blocks", o_blocks_count, n_blocks_count);
+       if (resize_inode != NULL)
+               iput(resize_inode);
+       ext4_msg(sb, KERN_INFO, "resized filesystem to %llu", n_blocks_count);
         return err;
  }
diff --git a/fs/ext4/super.c b/fs/ext4/super.c

index 69c55d4..7265a03 100644 (file)
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -420,7 +420,7 @@ static void __save_error_info(struct super_block *sb, const char *func,
          */
         if (!es->s_error_count)
                 mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ);
-       es->s_error_count = cpu_to_le32(le32_to_cpu(es->s_error_count) + 1);
+       le32_add_cpu(&es->s_error_count, 1);
  }
  
  static void save_error_info(struct super_block *sb, const char *func,
@@ -850,7 +850,6 @@ static void ext4_put_super(struct super_block *sb)
         flush_workqueue(sbi->dio_unwritten_wq);
         destroy_workqueue(sbi->dio_unwritten_wq);
  
-       lock_super(sb);
         if (sbi->s_journal) {
                 err = jbd2_journal_destroy(sbi->s_journal);
                 sbi->s_journal = NULL;
@@ -917,7 +916,6 @@ static void ext4_put_super(struct super_block *sb)
          * Now that we are completely done shutting down the
          * superblock, we need to actually destroy the kobject.
          */
-       unlock_super(sb);
         kobject_put(&sbi->s_kobj);
         wait_for_completion(&sbi->s_kobj_unregister);
         if (sbi->s_chksum_driver)
@@ -956,11 +954,10 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
         ei->jinode = NULL;
         INIT_LIST_HEAD(&ei->i_completed_io_list);
         spin_lock_init(&ei->i_completed_io_lock);
-       ei->cur_aio_dio = NULL;
         ei->i_sync_tid = 0;
         ei->i_datasync_tid = 0;
         atomic_set(&ei->i_ioend_count, 0);
-       atomic_set(&ei->i_aiodio_unwritten, 0);
+       atomic_set(&ei->i_unwritten, 0);
  
         return &ei->vfs_inode;
  }
@@ -1224,6 +1221,7 @@ enum {
         Opt_inode_readahead_blks, Opt_journal_ioprio,
         Opt_dioread_nolock, Opt_dioread_lock,
         Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
+       Opt_max_dir_size_kb,
  };
  
  static const match_table_t tokens = {
@@ -1297,6 +1295,7 @@ static const match_table_t tokens = {
         {Opt_init_itable, "init_itable=%u"},
         {Opt_init_itable, "init_itable"},
         {Opt_noinit_itable, "noinit_itable"},
+       {Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
         {Opt_removed, "check=none"},    /* mount option from ext2/3 */
         {Opt_removed, "nocheck"},       /* mount option from ext2/3 */
         {Opt_removed, "reservation"},   /* mount option from ext2/3 */
@@ -1477,6 +1476,7 @@ static const struct mount_opts {
         {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT},
         {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
         {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
+       {Opt_max_dir_size_kb, 0, MOPT_GTE0},
         {Opt_err, 0, 0}
  };
  
@@ -1592,6 +1592,8 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
                         if (!args->from)
                                 arg = EXT4_DEF_LI_WAIT_MULT;
                         sbi->s_li_wait_mult = arg;
+               } else if (token == Opt_max_dir_size_kb) {
+                       sbi->s_max_dir_size_kb = arg;
                 } else if (token == Opt_stripe) {
                         sbi->s_stripe = arg;
                 } else if (m->flags & MOPT_DATAJ) {
@@ -1664,7 +1666,7 @@ static int parse_options(char *options, struct super_block *sb,
                  * Initialize args struct so we know whether arg was
                  * found; some options take optional arguments.
                  */
-               args[0].to = args[0].from = 0;
+               args[0].to = args[0].from = NULL;
                 token = match_token(p, tokens, args);
                 if (handle_mount_opt(sb, p, token, args, journal_devnum,
                                      journal_ioprio, is_remount) < 0)
@@ -1740,7 +1742,7 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
  
  static const char *token2str(int token)
  {
-       static const struct match_token *t;
+       const struct match_token *t;
  
         for (t = tokens; t->token != Opt_err; t++)
                 if (t->token == token && !strchr(t->pattern, '='))
@@ -1823,6 +1825,8 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
         if (nodefs || (test_opt(sb, INIT_INODE_TABLE) &&
                        (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)))
                 SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult);
+       if (nodefs || sbi->s_max_dir_size_kb)
+               SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb);
  
         ext4_show_quota_options(seq, sb);
         return 0;
@@ -1914,15 +1918,45 @@ done:
         return res;
  }
  
+/*
+ * Make sure sbi->s_flex_groups has an entry for the flex group that
+ * contains group @ngroup - 1, growing the array if it is too small.
+ *
+ * Returns 0 when nothing needs to be done or the array was grown, and
+ * -ENOMEM when the larger array cannot be allocated (the existing array
+ * is left untouched in that case).
+ */
+int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       struct flex_groups *grown;
+       int wanted, bytes;
+
+       /* No array is set up when s_log_groups_per_flex is zero */
+       if (sbi->s_log_groups_per_flex == 0)
+               return 0;
+
+       /* Number of entries needed to cover the last requested group */
+       wanted = ext4_flex_group(sbi, ngroup - 1) + 1;
+       if (sbi->s_flex_groups_allocated >= wanted)
+               return 0;
+
+       /* Allocate a power-of-two number of bytes */
+       bytes = roundup_pow_of_two(wanted * sizeof(struct flex_groups));
+       grown = ext4_kvzalloc(bytes, GFP_KERNEL);
+       if (grown == NULL) {
+               ext4_msg(sb, KERN_ERR, "not enough memory for %d flex groups",
+                        bytes / (int) sizeof(struct flex_groups));
+               return -ENOMEM;
+       }
+
+       /* Carry the existing entries over before dropping the old array */
+       if (sbi->s_flex_groups != NULL) {
+               memcpy(grown, sbi->s_flex_groups,
+                      (sbi->s_flex_groups_allocated *
+                       sizeof(struct flex_groups)));
+               ext4_kvfree(sbi->s_flex_groups);
+       }
+       sbi->s_flex_groups = grown;
+       sbi->s_flex_groups_allocated = bytes / sizeof(struct flex_groups);
+       return 0;
+}
+
  static int ext4_fill_flex_info(struct super_block *sb)
  {
         struct ext4_sb_info *sbi = EXT4_SB(sb);
         struct ext4_group_desc *gdp = NULL;
-       ext4_group_t flex_group_count;
         ext4_group_t flex_group;
         unsigned int groups_per_flex = 0;
-       size_t size;
-       int i;
+       int i, err;
  
         sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
         if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) {
@@ -1931,17 +1965,9 @@ static int ext4_fill_flex_info(struct super_block *sb)
         }
         groups_per_flex = 1 << sbi->s_log_groups_per_flex;
  
-       /* We allocate both existing and potentially added groups */
-       flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
-                       ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) <<
-                             EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex;
-       size = flex_group_count * sizeof(struct flex_groups);
-       sbi->s_flex_groups = ext4_kvzalloc(size, GFP_KERNEL);
-       if (sbi->s_flex_groups == NULL) {
-               ext4_msg(sb, KERN_ERR, "not enough memory for %u flex groups",
-                        flex_group_count);
+       err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count);
+       if (err)
                 goto failed;
-       }
  
         for (i = 0; i < sbi->s_groups_count; i++) {
                 gdp = ext4_get_group_desc(sb, i, NULL);
@@ -2144,10 +2170,12 @@ static void ext4_orphan_cleanup(struct super_block *sb,
         }
  
         if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
-               if (es->s_last_orphan)
+               /* don't clear list on RO mount w/ errors */
+               if (es->s_last_orphan && !(s_flags & MS_RDONLY)) {
                         jbd_debug(1, "Errors on filesystem, "
                                   "clearing orphan list.\n");
-               es->s_last_orphan = 0;
+                       es->s_last_orphan = 0;
+               }
                 jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
                 return;
         }
@@ -2528,6 +2556,7 @@ EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
  EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
  EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
  EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
+EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
  EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
  
  static struct attribute *ext4_attrs[] = {
@@ -2543,6 +2572,7 @@ static struct attribute *ext4_attrs[] = {
         ATTR_LIST(mb_stream_req),
         ATTR_LIST(mb_group_prealloc),
         ATTR_LIST(max_writeback_mb_bump),
+       ATTR_LIST(extent_max_zeroout_kb),
         ATTR_LIST(trigger_fs_error),
         NULL,
  };
@@ -2550,10 +2580,12 @@ static struct attribute *ext4_attrs[] = {
  /* Features this copy of ext4 supports */
  EXT4_INFO_ATTR(lazy_itable_init);
  EXT4_INFO_ATTR(batched_discard);
+EXT4_INFO_ATTR(meta_bg_resize);
  
  static struct attribute *ext4_feat_attrs[] = {
         ATTR_LIST(lazy_itable_init),
         ATTR_LIST(batched_discard),
+       ATTR_LIST(meta_bg_resize),
         NULL,
  };
  
@@ -3374,7 +3406,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
          * enable delayed allocation by default
          * Use -o nodelalloc to turn it off
          */
-       if (!IS_EXT3_SB(sb) &&
+       if (!IS_EXT3_SB(sb) && !IS_EXT2_SB(sb) &&
             ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
                 set_opt(sb, DELALLOC);
  
@@ -3743,6 +3775,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
  
         sbi->s_stripe = ext4_get_stripe_size(sbi);
         sbi->s_max_writeback_mb_bump = 128;
+       sbi->s_extent_max_zeroout_kb = 32;
  
         /*
          * set up enough so that it can read an inode
@@ -4519,11 +4552,9 @@ static int ext4_unfreeze(struct super_block *sb)
         if (sb->s_flags & MS_RDONLY)
                 return 0;
  
-       lock_super(sb);
         /* Reset the needs_recovery flag before the fs is unlocked. */
         EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
         ext4_commit_super(sb, 1);
-       unlock_super(sb);
         return 0;
  }
  
@@ -4559,7 +4590,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
         char *orig_data = kstrdup(data, GFP_KERNEL);
  
         /* Store the original options */
-       lock_super(sb);
         old_sb_flags = sb->s_flags;
         old_opts.s_mount_opt = sbi->s_mount_opt;
         old_opts.s_mount_opt2 = sbi->s_mount_opt2;
@@ -4701,7 +4731,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
         if (sbi->s_journal == NULL)
                 ext4_commit_super(sb, 1);
  
-       unlock_super(sb);
  #ifdef CONFIG_QUOTA
         /* Release old quota file names */
         for (i = 0; i < MAXQUOTAS; i++)
@@ -4714,10 +4743,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
                 else if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
                                         EXT4_FEATURE_RO_COMPAT_QUOTA)) {
                         err = ext4_enable_quotas(sb);
-                       if (err) {
-                               lock_super(sb);
+                       if (err)
                                 goto restore_opts;
-                       }
                 }
         }
  #endif
@@ -4744,7 +4771,6 @@ restore_opts:
                 sbi->s_qf_names[i] = old_opts.s_qf_names[i];
         }
  #endif
-       unlock_super(sb);
         kfree(orig_data);
         return err;
  }
@@ -5269,8 +5295,10 @@ static int __init ext4_init_fs(void)
         if (err)
                 goto out6;
         ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
-       if (!ext4_kset)
+       if (!ext4_kset) {
+               err = -ENOMEM;
                 goto out5;
+       }
         ext4_proc_root = proc_mkdir("fs/ext4", NULL);
  
         err = ext4_init_feat_adverts();
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c

index 6d46c0d..8e1d7b9 100644 (file)
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -63,6 +63,7 @@ int writeback_in_progress(struct backing_dev_info *bdi)
  {
         return test_bit(BDI_writeback_running, &bdi->state);
  }
+EXPORT_SYMBOL(writeback_in_progress);
  
  static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
  {
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c

index af5280f..3091d42 100644 (file)
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -1014,17 +1014,35 @@ restart_loop:
                  * there's no point in keeping a checkpoint record for
                  * it. */
  
-               /* A buffer which has been freed while still being
-                * journaled by a previous transaction may end up still
-                * being dirty here, but we want to avoid writing back
-                * that buffer in the future after the "add to orphan"
-                * operation been committed,  That's not only a performance
-                * gain, it also stops aliasing problems if the buffer is
-                * left behind for writeback and gets reallocated for another
-                * use in a different page. */
-               if (buffer_freed(bh) && !jh->b_next_transaction) {
-                       clear_buffer_freed(bh);
-                       clear_buffer_jbddirty(bh);
+               /*
+               * A buffer which has been freed while still being journaled by
+               * a previous transaction.
+               */
+               if (buffer_freed(bh)) {
+                       /*
+                        * If the running transaction is the one containing
+                        * "add to orphan" operation (b_next_transaction !=
+                        * NULL), we have to wait for that transaction to
+                        * commit before we can really get rid of the buffer.
+                        * So just clear b_modified to not confuse transaction
+                        * credit accounting and refile the buffer to
+                        * BJ_Forget of the running transaction. If the just
+                        * committed transaction contains "add to orphan"
+                        * operation, we can completely invalidate the buffer
+                        * now. We are rather thorough in that since the
+                        * buffer may be still accessible when blocksize <
+                        * pagesize and it is attached to the last partial
+                        * page.
+                        */
+                       jh->b_modified = 0;
+                       if (!jh->b_next_transaction) {
+                               clear_buffer_freed(bh);
+                               clear_buffer_jbddirty(bh);
+                               clear_buffer_mapped(bh);
+                               clear_buffer_new(bh);
+                               clear_buffer_req(bh);
+                               bh->b_bdev = NULL;
+                       }
                 }
  
                 if (buffer_jbddirty(bh)) {
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c

index e149b99..484b8d1 100644 (file)
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1354,6 +1354,11 @@ static void jbd2_mark_journal_empty(journal_t *journal)
  
         BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
         read_lock(&journal->j_state_lock);
+       /* Is it already empty? */
+       if (sb->s_start == 0) {
+               read_unlock(&journal->j_state_lock);
+               return;
+       }
         jbd_debug(1, "JBD2: Marking journal as empty (seq %d)\n",
                   journal->j_tail_sequence);
  
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c

index 0131e43..626846b 100644 (file)
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -289,8 +289,11 @@ int jbd2_journal_recover(journal_t *journal)
         if (!err)
                 err = err2;
         /* Make sure all replayed data is on permanent storage */
-       if (journal->j_flags & JBD2_BARRIER)
-               blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
+       if (journal->j_flags & JBD2_BARRIER) {
+               err2 = blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
+               if (!err)
+                       err = err2;
+       }
         return err;
  }
  
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c

index fb1ab95..a74ba46 100644 (file)
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1841,15 +1841,16 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
   * We're outside-transaction here.  Either or both of j_running_transaction
   * and j_committing_transaction may be NULL.
   */
-static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
+static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
+                               int partial_page)
  {
         transaction_t *transaction;
         struct journal_head *jh;
         int may_free = 1;
-       int ret;
  
         BUFFER_TRACE(bh, "entry");
  
+retry:
         /*
          * It is safe to proceed here without the j_list_lock because the
          * buffers cannot be stolen by try_to_free_buffers as long as we are
@@ -1878,10 +1879,18 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
          * clear the buffer dirty bit at latest at the moment when the
          * transaction marking the buffer as freed in the filesystem
          * structures is committed because from that moment on the
-        * buffer can be reallocated and used by a different page.
+        * block can be reallocated and used by a different page.
          * Since the block hasn't been freed yet but the inode has
          * already been added to orphan list, it is safe for us to add
          * the buffer to BJ_Forget list of the newest transaction.
+        *
+        * Also, we have to clear the buffer_mapped flag of a truncated buffer
+        * because the buffer_head may be attached to the page straddling
+        * i_size (can happen only when blocksize < pagesize) and thus the
+        * buffer_head can be reused when the file is extended again. So we end
+        * up keeping around invalidated buffers attached to transactions'
+        * BJ_Forget list just to stop checkpointing code from cleaning up
+        * the transaction this buffer was modified in.
          */
         transaction = jh->b_transaction;
         if (transaction == NULL) {
@@ -1908,13 +1917,9 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
                          * committed, the buffer won't be needed any
                          * longer. */
                         JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget");
-                       ret = __dispose_buffer(jh,
+                       may_free = __dispose_buffer(jh,
                                         journal->j_running_transaction);
-                       jbd2_journal_put_journal_head(jh);
-                       spin_unlock(&journal->j_list_lock);
-                       jbd_unlock_bh_state(bh);
-                       write_unlock(&journal->j_state_lock);
-                       return ret;
+                       goto zap_buffer;
                 } else {
                         /* There is no currently-running transaction. So the
                          * orphan record which we wrote for this file must have
@@ -1922,13 +1927,9 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
                          * the committing transaction, if it exists. */
                         if (journal->j_committing_transaction) {
                                 JBUFFER_TRACE(jh, "give to committing trans");
-                               ret = __dispose_buffer(jh,
+                               may_free = __dispose_buffer(jh,
                                         journal->j_committing_transaction);
-                               jbd2_journal_put_journal_head(jh);
-                               spin_unlock(&journal->j_list_lock);
-                               jbd_unlock_bh_state(bh);
-                               write_unlock(&journal->j_state_lock);
-                               return ret;
+                               goto zap_buffer;
                         } else {
                                 /* The orphan record's transaction has
                                  * committed.  We can cleanse this buffer */
@@ -1940,10 +1941,24 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
                 JBUFFER_TRACE(jh, "on committing transaction");
                 /*
                  * The buffer is committing, we simply cannot touch
-                * it. So we just set j_next_transaction to the
-                * running transaction (if there is one) and mark
-                * buffer as freed so that commit code knows it should
-                * clear dirty bits when it is done with the buffer.
+                * it. If the page is straddling i_size we have to wait
+                * for commit and try again.
+                */
+               if (partial_page) {
+                       tid_t tid = journal->j_committing_transaction->t_tid;
+
+                       jbd2_journal_put_journal_head(jh);
+                       spin_unlock(&journal->j_list_lock);
+                       jbd_unlock_bh_state(bh);
+                       write_unlock(&journal->j_state_lock);
+                       jbd2_log_wait_commit(journal, tid);
+                       goto retry;
+               }
+               /*
+                * OK, buffer won't be reachable after truncate. We just set
+                * j_next_transaction to the running transaction (if there is
+                * one) and mark buffer as freed so that commit code knows it
+                * should clear dirty bits when it is done with the buffer.
                  */
                 set_buffer_freed(bh);
                 if (journal->j_running_transaction && buffer_jbddirty(bh))
@@ -1966,6 +1981,15 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
         }
  
  zap_buffer:
+       /*
+        * This is tricky. Although the buffer is truncated, it may be reused
+        * if blocksize < pagesize and it is attached to the page straddling
+        * EOF. Since the buffer might have been added to BJ_Forget list of the
+        * running transaction, journal_get_write_access() won't clear
+        * b_modified and credit accounting gets confused. So clear b_modified
+        * here.
+        */
+       jh->b_modified = 0;
         jbd2_journal_put_journal_head(jh);
  zap_buffer_no_jh:
         spin_unlock(&journal->j_list_lock);
@@ -2017,7 +2041,8 @@ void jbd2_journal_invalidatepage(journal_t *journal,
                 if (offset <= curr_off) {
                         /* This block is wholly outside the truncation point */
                         lock_buffer(bh);
-                       may_free &= journal_unmap_buffer(journal, bh);
+                       may_free &= journal_unmap_buffer(journal, bh,
+                                                        offset > 0);
                         unlock_buffer(bh);
                 }
                 curr_off = next_off;
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c

index a4d56ac..5b387a4 100644 (file)
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -116,6 +116,7 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
         if (unlikely(ret))
                 goto out;
  
+       file_update_time(vma->vm_file);
         ret = __block_page_mkwrite(vma, vmf, nilfs_get_block);
         if (ret) {
                 nilfs_transaction_abort(inode->i_sb);
diff --git a/include/linux/falloc.h b/include/linux/falloc.h

index 73e0b62..d39b824 100644 (file)
--- a/include/linux/falloc.h
+++ b/include/linux/falloc.h
@@ -3,6 +3,7 @@
  
  #define FALLOC_FL_KEEP_SIZE    0x01 /* default is extend size */
  #define FALLOC_FL_PUNCH_HOLE   0x02 /* de-allocates range */
+#define FALLOC_FL_NO_HIDE_STALE        0x04 /* reserved codepoint */
  
  #ifdef __KERNEL__
  
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h

index 69d8a69..d49b285 100644 (file)
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -26,19 +26,19 @@ TRACE_EVENT(ext4_free_inode,
         TP_STRUCT__entry(
                 __field(        dev_t,  dev                     )
                 __field(        ino_t,  ino                     )
-               __field(        __u16, mode                     )
                 __field(        uid_t,  uid                     )
                 __field(        gid_t,  gid                     )
                 __field(        __u64, blocks                   )
+               __field(        __u16, mode                     )
         ),
  
         TP_fast_assign(
                 __entry->dev    = inode->i_sb->s_dev;
                 __entry->ino    = inode->i_ino;
-               __entry->mode   = inode->i_mode;
                 __entry->uid    = i_uid_read(inode);
                 __entry->gid    = i_gid_read(inode);
                 __entry->blocks = inode->i_blocks;
+               __entry->mode   = inode->i_mode;
         ),
  
         TP_printk("dev %d,%d ino %lu mode 0%o uid %u gid %u blocks %llu",
@@ -300,10 +300,10 @@ TRACE_EVENT(ext4_da_writepages,
                 __field(        long,   pages_skipped           )
                 __field(        loff_t, range_start             )
                 __field(        loff_t, range_end               )
+               __field(       pgoff_t, writeback_index         )
                 __field(        int,    sync_mode               )
                 __field(        char,   for_kupdate             )
                 __field(        char,   range_cyclic            )
-               __field(       pgoff_t, writeback_index         )
         ),
  
         TP_fast_assign(
@@ -313,14 +313,14 @@ TRACE_EVENT(ext4_da_writepages,
                 __entry->pages_skipped  = wbc->pages_skipped;
                 __entry->range_start    = wbc->range_start;
                 __entry->range_end      = wbc->range_end;
+               __entry->writeback_index = inode->i_mapping->writeback_index;
                 __entry->sync_mode      = wbc->sync_mode;
                 __entry->for_kupdate    = wbc->for_kupdate;
                 __entry->range_cyclic   = wbc->range_cyclic;
-               __entry->writeback_index = inode->i_mapping->writeback_index;
         ),
  
         TP_printk("dev %d,%d ino %lu nr_to_write %ld pages_skipped %ld "
-                 "range_start %lld range_end %lld sync_mode %d"
+                 "range_start %lld range_end %lld sync_mode %d "
                   "for_kupdate %d range_cyclic %d writeback_index %lu",
                   MAJOR(__entry->dev), MINOR(__entry->dev),
                   (unsigned long) __entry->ino, __entry->nr_to_write,
@@ -382,8 +382,8 @@ TRACE_EVENT(ext4_da_writepages_result,
                 __field(        int,    ret                     )
                 __field(        int,    pages_written           )
                 __field(        long,   pages_skipped           )
-               __field(        int,    sync_mode               )
                 __field(       pgoff_t, writeback_index         )
+               __field(        int,    sync_mode               )
         ),
  
         TP_fast_assign(
@@ -392,8 +392,8 @@ TRACE_EVENT(ext4_da_writepages_result,
                 __entry->ret            = ret;
                 __entry->pages_written  = pages_written;
                 __entry->pages_skipped  = wbc->pages_skipped;
-               __entry->sync_mode      = wbc->sync_mode;
                 __entry->writeback_index = inode->i_mapping->writeback_index;
+               __entry->sync_mode      = wbc->sync_mode;
         ),
  
         TP_printk("dev %d,%d ino %lu ret %d pages_written %d pages_skipped %ld "
@@ -411,16 +411,16 @@ DECLARE_EVENT_CLASS(ext4__page_op,
         TP_ARGS(page),
  
         TP_STRUCT__entry(
-               __field(        pgoff_t, index                  )
-               __field(        ino_t,  ino                     )
                 __field(        dev_t,  dev                     )
+               __field(        ino_t,  ino                     )
+               __field(        pgoff_t, index                  )
  
         ),
  
         TP_fast_assign(
-               __entry->index  = page->index;
-               __entry->ino    = page->mapping->host->i_ino;
                 __entry->dev    = page->mapping->host->i_sb->s_dev;
+               __entry->ino    = page->mapping->host->i_ino;
+               __entry->index  = page->index;
         ),
  
         TP_printk("dev %d,%d ino %lu page_index %lu",
@@ -456,18 +456,18 @@ TRACE_EVENT(ext4_invalidatepage,
         TP_ARGS(page, offset),
  
         TP_STRUCT__entry(
+               __field(        dev_t,  dev                     )
+               __field(        ino_t,  ino                     )
                 __field(        pgoff_t, index                  )
                 __field(        unsigned long, offset           )
-               __field(        ino_t,  ino                     )
-               __field(        dev_t,  dev                     )
  
         ),
  
         TP_fast_assign(
+               __entry->dev    = page->mapping->host->i_sb->s_dev;
+               __entry->ino    = page->mapping->host->i_ino;
                 __entry->index  = page->index;
                 __entry->offset = offset;
-               __entry->ino    = page->mapping->host->i_ino;
-               __entry->dev    = page->mapping->host->i_sb->s_dev;
         ),
  
         TP_printk("dev %d,%d ino %lu page_index %lu offset %lu",
@@ -510,8 +510,8 @@ DECLARE_EVENT_CLASS(ext4__mb_new_pa,
                 __field(        dev_t,  dev                     )
                 __field(        ino_t,  ino                     )
                 __field(        __u64,  pa_pstart               )
-               __field(        __u32,  pa_len                  )
                 __field(        __u64,  pa_lstart               )
+               __field(        __u32,  pa_len                  )
  
         ),
  
@@ -519,8 +519,8 @@ DECLARE_EVENT_CLASS(ext4__mb_new_pa,
                 __entry->dev            = ac->ac_sb->s_dev;
                 __entry->ino            = ac->ac_inode->i_ino;
                 __entry->pa_pstart      = pa->pa_pstart;
-               __entry->pa_len         = pa->pa_len;
                 __entry->pa_lstart      = pa->pa_lstart;
+               __entry->pa_len         = pa->pa_len;
         ),
  
         TP_printk("dev %d,%d ino %lu pstart %llu len %u lstart %llu",
@@ -645,7 +645,6 @@ TRACE_EVENT(ext4_request_blocks,
         TP_STRUCT__entry(
                 __field(        dev_t,  dev                     )
                 __field(        ino_t,  ino                     )
-               __field(        unsigned int, flags             )
                 __field(        unsigned int, len               )
                 __field(        __u32,  logical                 )
                 __field(        __u32,  lleft                   )
@@ -653,12 +652,12 @@ TRACE_EVENT(ext4_request_blocks,
                 __field(        __u64,  goal                    )
                 __field(        __u64,  pleft                   )
                 __field(        __u64,  pright                  )
+               __field(        unsigned int, flags             )
         ),
  
         TP_fast_assign(
                 __entry->dev    = ar->inode->i_sb->s_dev;
                 __entry->ino    = ar->inode->i_ino;
-               __entry->flags  = ar->flags;
                 __entry->len    = ar->len;
                 __entry->logical = ar->logical;
                 __entry->goal   = ar->goal;
@@ -666,6 +665,7 @@ TRACE_EVENT(ext4_request_blocks,
                 __entry->lright = ar->lright;
                 __entry->pleft  = ar->pleft;
                 __entry->pright = ar->pright;
+               __entry->flags  = ar->flags;
         ),
  
         TP_printk("dev %d,%d ino %lu flags %u len %u lblk %u goal %llu "
@@ -686,7 +686,6 @@ TRACE_EVENT(ext4_allocate_blocks,
                 __field(        dev_t,  dev                     )
                 __field(        ino_t,  ino                     )
                 __field(        __u64,  block                   )
-               __field(        unsigned int, flags             )
                 __field(        unsigned int, len               )
                 __field(        __u32,  logical                 )
                 __field(        __u32,  lleft                   )
@@ -694,13 +693,13 @@ TRACE_EVENT(ext4_allocate_blocks,
                 __field(        __u64,  goal                    )
                 __field(        __u64,  pleft                   )
                 __field(        __u64,  pright                  )
+               __field(        unsigned int, flags             )
         ),
  
         TP_fast_assign(
                 __entry->dev    = ar->inode->i_sb->s_dev;
                 __entry->ino    = ar->inode->i_ino;
                 __entry->block  = block;
-               __entry->flags  = ar->flags;
                 __entry->len    = ar->len;
                 __entry->logical = ar->logical;
                 __entry->goal   = ar->goal;
@@ -708,6 +707,7 @@ TRACE_EVENT(ext4_allocate_blocks,
                 __entry->lright = ar->lright;
                 __entry->pleft  = ar->pleft;
                 __entry->pright = ar->pright;
+               __entry->flags  = ar->flags;
         ),
  
         TP_printk("dev %d,%d ino %lu flags %u len %u block %llu lblk %u "
@@ -728,19 +728,19 @@ TRACE_EVENT(ext4_free_blocks,
         TP_STRUCT__entry(
                 __field(        dev_t,  dev                     )
                 __field(        ino_t,  ino                     )
-               __field(        __u16,  mode                    )
                 __field(        __u64,  block                   )
                 __field(        unsigned long,  count           )
                 __field(        int,    flags                   )
+               __field(        __u16,  mode                    )
         ),
  
         TP_fast_assign(
                 __entry->dev            = inode->i_sb->s_dev;
                 __entry->ino            = inode->i_ino;
-               __entry->mode           = inode->i_mode;
                 __entry->block          = block;
                 __entry->count          = count;
                 __entry->flags          = flags;
+               __entry->mode           = inode->i_mode;
         ),
  
         TP_printk("dev %d,%d ino %lu mode 0%o block %llu count %lu flags %d",
@@ -783,15 +783,15 @@ TRACE_EVENT(ext4_sync_file_exit,
         TP_ARGS(inode, ret),
  
         TP_STRUCT__entry(
-               __field(        int,    ret                     )
-               __field(        ino_t,  ino                     )
                 __field(        dev_t,  dev                     )
+               __field(        ino_t,  ino                     )
+               __field(        int,    ret                     )
         ),
  
         TP_fast_assign(
-               __entry->ret            = ret;
-               __entry->ino            = inode->i_ino;
                 __entry->dev            = inode->i_sb->s_dev;
+               __entry->ino            = inode->i_ino;
+               __entry->ret            = ret;
         ),
  
         TP_printk("dev %d,%d ino %lu ret %d",
@@ -854,12 +854,6 @@ TRACE_EVENT(ext4_mballoc_alloc,
         TP_STRUCT__entry(
                 __field(        dev_t,  dev                     )
                 __field(        ino_t,  ino                     )
-               __field(        __u16,  found                   )
-               __field(        __u16,  groups                  )
-               __field(        __u16,  buddy                   )
-               __field(        __u16,  flags                   )
-               __field(        __u16,  tail                    )
-               __field(        __u8,   cr                      )
                 __field(        __u32,  orig_logical            )
                 __field(          int,  orig_start              )
                 __field(        __u32,  orig_group              )
@@ -872,17 +866,17 @@ TRACE_EVENT(ext4_mballoc_alloc,
                 __field(          int,  result_start            )
                 __field(        __u32,  result_group            )
                 __field(          int,  result_len              )
+               __field(        __u16,  found                   )
+               __field(        __u16,  groups                  )
+               __field(        __u16,  buddy                   )
+               __field(        __u16,  flags                   )
+               __field(        __u16,  tail                    )
+               __field(        __u8,   cr                      )
         ),
  
         TP_fast_assign(
                 __entry->dev            = ac->ac_inode->i_sb->s_dev;
                 __entry->ino            = ac->ac_inode->i_ino;
-               __entry->found          = ac->ac_found;
-               __entry->flags          = ac->ac_flags;
-               __entry->groups         = ac->ac_groups_scanned;
-               __entry->buddy          = ac->ac_buddy;
-               __entry->tail           = ac->ac_tail;
-               __entry->cr             = ac->ac_criteria;
                 __entry->orig_logical   = ac->ac_o_ex.fe_logical;
                 __entry->orig_start     = ac->ac_o_ex.fe_start;
                 __entry->orig_group     = ac->ac_o_ex.fe_group;
@@ -895,6 +889,12 @@ TRACE_EVENT(ext4_mballoc_alloc,
                 __entry->result_start   = ac->ac_f_ex.fe_start;
                 __entry->result_group   = ac->ac_f_ex.fe_group;
                 __entry->result_len     = ac->ac_f_ex.fe_len;
+               __entry->found          = ac->ac_found;
+               __entry->flags          = ac->ac_flags;
+               __entry->groups         = ac->ac_groups_scanned;
+               __entry->buddy          = ac->ac_buddy;
+               __entry->tail           = ac->ac_tail;
+               __entry->cr             = ac->ac_criteria;
         ),
  
         TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u goal %u/%d/%u@%u "
@@ -1015,17 +1015,17 @@ TRACE_EVENT(ext4_forget,
         TP_STRUCT__entry(
                 __field(        dev_t,  dev                     )
                 __field(        ino_t,  ino                     )
-               __field(        __u16,  mode                    )
-               __field(        int,    is_metadata             )
                 __field(        __u64,  block                   )
+               __field(        int,    is_metadata             )
+               __field(        __u16,  mode                    )
         ),
  
         TP_fast_assign(
                 __entry->dev    = inode->i_sb->s_dev;
                 __entry->ino    = inode->i_ino;
-               __entry->mode   = inode->i_mode;
-               __entry->is_metadata = is_metadata;
                 __entry->block  = block;
+               __entry->is_metadata = is_metadata;
+               __entry->mode   = inode->i_mode;
         ),
  
         TP_printk("dev %d,%d ino %lu mode 0%o is_metadata %d block %llu",
@@ -1042,19 +1042,18 @@ TRACE_EVENT(ext4_da_update_reserve_space,
         TP_STRUCT__entry(
                 __field(        dev_t,  dev                     )
                 __field(        ino_t,  ino                     )
-               __field(        __u16,  mode                    )
                 __field(        __u64,  i_blocks                )
                 __field(        int,    used_blocks             )
                 __field(        int,    reserved_data_blocks    )
                 __field(        int,    reserved_meta_blocks    )
                 __field(        int,    allocated_meta_blocks   )
                 __field(        int,    quota_claim             )
+               __field(        __u16,  mode                    )
         ),
  
         TP_fast_assign(
                 __entry->dev    = inode->i_sb->s_dev;
                 __entry->ino    = inode->i_ino;
-               __entry->mode   = inode->i_mode;
                 __entry->i_blocks = inode->i_blocks;
                 __entry->used_blocks = used_blocks;
                 __entry->reserved_data_blocks =
@@ -1064,6 +1063,7 @@ TRACE_EVENT(ext4_da_update_reserve_space,
                 __entry->allocated_meta_blocks =
                                 EXT4_I(inode)->i_allocated_meta_blocks;
                 __entry->quota_claim = quota_claim;
+               __entry->mode   = inode->i_mode;
         ),
  
         TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu used_blocks %d "
@@ -1085,21 +1085,21 @@ TRACE_EVENT(ext4_da_reserve_space,
         TP_STRUCT__entry(
                 __field(        dev_t,  dev                     )
                 __field(        ino_t,  ino                     )
-               __field(        __u16,  mode                    )
                 __field(        __u64,  i_blocks                )
                 __field(        int,    md_needed               )
                 __field(        int,    reserved_data_blocks    )
                 __field(        int,    reserved_meta_blocks    )
+               __field(        __u16,  mode                    )
         ),
  
         TP_fast_assign(
                 __entry->dev    = inode->i_sb->s_dev;
                 __entry->ino    = inode->i_ino;
-               __entry->mode   = inode->i_mode;
                 __entry->i_blocks = inode->i_blocks;
                 __entry->md_needed = md_needed;
                 __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks;
                 __entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks;
+               __entry->mode   = inode->i_mode;
         ),
  
         TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu md_needed %d "
@@ -1119,23 +1119,23 @@ TRACE_EVENT(ext4_da_release_space,
         TP_STRUCT__entry(
                 __field(        dev_t,  dev                     )
                 __field(        ino_t,  ino                     )
-               __field(        __u16,  mode                    )
                 __field(        __u64,  i_blocks                )
                 __field(        int,    freed_blocks            )
                 __field(        int,    reserved_data_blocks    )
                 __field(        int,    reserved_meta_blocks    )
                 __field(        int,    allocated_meta_blocks   )
+               __field(        __u16,  mode                    )
         ),
  
         TP_fast_assign(
                 __entry->dev    = inode->i_sb->s_dev;
                 __entry->ino    = inode->i_ino;
-               __entry->mode   = inode->i_mode;
                 __entry->i_blocks = inode->i_blocks;
                 __entry->freed_blocks = freed_blocks;
                 __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks;
                 __entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks;
                 __entry->allocated_meta_blocks = EXT4_I(inode)->i_allocated_meta_blocks;
+               __entry->mode   = inode->i_mode;
         ),
  
         TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu freed_blocks %d "
@@ -1203,16 +1203,16 @@ TRACE_EVENT(ext4_direct_IO_enter,
         TP_ARGS(inode, offset, len, rw),
  
         TP_STRUCT__entry(
-               __field(        ino_t,  ino                     )
                 __field(        dev_t,  dev                     )
+               __field(        ino_t,  ino                     )
                 __field(        loff_t, pos                     )
                 __field(        unsigned long,  len             )
                 __field(        int,    rw                      )
         ),
  
         TP_fast_assign(
-               __entry->ino    = inode->i_ino;
                 __entry->dev    = inode->i_sb->s_dev;
+               __entry->ino    = inode->i_ino;
                 __entry->pos    = offset;
                 __entry->len    = len;
                 __entry->rw     = rw;
@@ -1231,8 +1231,8 @@ TRACE_EVENT(ext4_direct_IO_exit,
         TP_ARGS(inode, offset, len, rw, ret),
  
         TP_STRUCT__entry(
-               __field(        ino_t,  ino                     )
                 __field(        dev_t,  dev                     )
+               __field(        ino_t,  ino                     )
                 __field(        loff_t, pos                     )
                 __field(        unsigned long,  len             )
                 __field(        int,    rw                      )
@@ -1240,8 +1240,8 @@ TRACE_EVENT(ext4_direct_IO_exit,
         ),
  
         TP_fast_assign(
-               __entry->ino    = inode->i_ino;
                 __entry->dev    = inode->i_sb->s_dev;
+               __entry->ino    = inode->i_ino;
                 __entry->pos    = offset;
                 __entry->len    = len;
                 __entry->rw     = rw;
@@ -1261,16 +1261,16 @@ TRACE_EVENT(ext4_fallocate_enter,
         TP_ARGS(inode, offset, len, mode),
  
         TP_STRUCT__entry(
-               __field(        ino_t,  ino                     )
                 __field(        dev_t,  dev                     )
+               __field(        ino_t,  ino                     )
                 __field(        loff_t, pos                     )
                 __field(        loff_t, len                     )
                 __field(        int,    mode                    )
         ),
  
         TP_fast_assign(
-               __entry->ino    = inode->i_ino;
                 __entry->dev    = inode->i_sb->s_dev;
+               __entry->ino    = inode->i_ino;
                 __entry->pos    = offset;
                 __entry->len    = len;
                 __entry->mode   = mode;
@@ -1289,16 +1289,16 @@ TRACE_EVENT(ext4_fallocate_exit,
         TP_ARGS(inode, offset, max_blocks, ret),
  
         TP_STRUCT__entry(
-               __field(        ino_t,  ino                     )
                 __field(        dev_t,  dev                     )
+               __field(        ino_t,  ino                     )
                 __field(        loff_t, pos                     )
                 __field(        unsigned int,   blocks          )
                 __field(        int,    ret                     )
         ),
  
         TP_fast_assign(
-               __entry->ino    = inode->i_ino;
                 __entry->dev    = inode->i_sb->s_dev;
+               __entry->ino    = inode->i_ino;
                 __entry->pos    = offset;
                 __entry->blocks = max_blocks;
                 __entry->ret    = ret;
@@ -1317,17 +1317,17 @@ TRACE_EVENT(ext4_unlink_enter,
         TP_ARGS(parent, dentry),
  
         TP_STRUCT__entry(
-               __field(        ino_t,  parent                  )
+               __field(        dev_t,  dev                     )
                 __field(        ino_t,  ino                     )
+               __field(        ino_t,  parent                  )
                 __field(        loff_t, size                    )
-               __field(        dev_t,  dev                     )
         ),
  
         TP_fast_assign(
-               __entry->parent         = parent->i_ino;
+               __entry->dev            = dentry->d_inode->i_sb->s_dev;
                 __entry->ino            = dentry->d_inode->i_ino;
+               __entry->parent         = parent->i_ino;
                 __entry->size           = dentry->d_inode->i_size;
-               __entry->dev            = dentry->d_inode->i_sb->s_dev;
         ),
  
         TP_printk("dev %d,%d ino %lu size %lld parent %lu",
@@ -1342,14 +1342,14 @@ TRACE_EVENT(ext4_unlink_exit,
         TP_ARGS(dentry, ret),
  
         TP_STRUCT__entry(
-               __field(        ino_t,  ino                     )
                 __field(        dev_t,  dev                     )
+               __field(        ino_t,  ino                     )
                 __field(        int,    ret                     )
         ),
  
         TP_fast_assign(
-               __entry->ino            = dentry->d_inode->i_ino;
                 __entry->dev            = dentry->d_inode->i_sb->s_dev;
+               __entry->ino            = dentry->d_inode->i_ino;
                 __entry->ret            = ret;
         ),
  
@@ -1365,14 +1365,14 @@ DECLARE_EVENT_CLASS(ext4__truncate,
         TP_ARGS(inode),
  
         TP_STRUCT__entry(
-               __field(        ino_t,          ino             )
-               __field(        dev_t,          dev             )
+               __field(        dev_t,          dev             )
+               __field(        ino_t,          ino             )
                 __field(        __u64,          blocks          )
         ),
  
         TP_fast_assign(
-               __entry->ino    = inode->i_ino;
                 __entry->dev    = inode->i_sb->s_dev;
+               __entry->ino    = inode->i_ino;
                 __entry->blocks = inode->i_blocks;
         ),
  
@@ -1403,8 +1403,8 @@ TRACE_EVENT(ext4_ext_convert_to_initialized_enter,
         TP_ARGS(inode, map, ux),
  
         TP_STRUCT__entry(
-               __field(        ino_t,          ino     )
                 __field(        dev_t,          dev     )
+               __field(        ino_t,          ino     )
                 __field(        ext4_lblk_t,    m_lblk  )
                 __field(        unsigned,       m_len   )
                 __field(        ext4_lblk_t,    u_lblk  )
@@ -1413,8 +1413,8 @@ TRACE_EVENT(ext4_ext_convert_to_initialized_enter,
         ),
  
         TP_fast_assign(
-               __entry->ino            = inode->i_ino;
                 __entry->dev            = inode->i_sb->s_dev;
+               __entry->ino            = inode->i_ino;
                 __entry->m_lblk         = map->m_lblk;
                 __entry->m_len          = map->m_len;
                 __entry->u_lblk         = le32_to_cpu(ux->ee_block);
@@ -1441,8 +1441,8 @@ TRACE_EVENT(ext4_ext_convert_to_initialized_fastpath,
         TP_ARGS(inode, map, ux, ix),
  
         TP_STRUCT__entry(
-               __field(        ino_t,          ino     )
                 __field(        dev_t,          dev     )
+               __field(        ino_t,          ino     )
                 __field(        ext4_lblk_t,    m_lblk  )
                 __field(        unsigned,       m_len   )
                 __field(        ext4_lblk_t,    u_lblk  )
@@ -1454,8 +1454,8 @@ TRACE_EVENT(ext4_ext_convert_to_initialized_fastpath,
         ),
  
         TP_fast_assign(
-               __entry->ino            = inode->i_ino;
                 __entry->dev            = inode->i_sb->s_dev;
+               __entry->ino            = inode->i_ino;
                 __entry->m_lblk         = map->m_lblk;
                 __entry->m_len          = map->m_len;
                 __entry->u_lblk         = le32_to_cpu(ux->ee_block);
@@ -1483,16 +1483,16 @@ DECLARE_EVENT_CLASS(ext4__map_blocks_enter,
         TP_ARGS(inode, lblk, len, flags),
  
         TP_STRUCT__entry(
-               __field(        ino_t,          ino             )
-               __field(        dev_t,          dev             )
+               __field(        dev_t,          dev             )
+               __field(        ino_t,          ino             )
                 __field(        ext4_lblk_t,    lblk            )
                 __field(        unsigned int,   len             )
                 __field(        unsigned int,   flags           )
         ),
  
         TP_fast_assign(
-               __entry->ino    = inode->i_ino;
                 __entry->dev    = inode->i_sb->s_dev;
+               __entry->ino    = inode->i_ino;
                 __entry->lblk   = lblk;
                 __entry->len    = len;
                 __entry->flags  = flags;
@@ -1525,19 +1525,19 @@ DECLARE_EVENT_CLASS(ext4__map_blocks_exit,
         TP_ARGS(inode, lblk, pblk, len, ret),
  
         TP_STRUCT__entry(
-               __field(        ino_t,          ino             )
                 __field(        dev_t,          dev             )
-               __field(        ext4_lblk_t,    lblk            )
+               __field(        ino_t,          ino             )
                 __field(        ext4_fsblk_t,   pblk            )
+               __field(        ext4_lblk_t,    lblk            )
                 __field(        unsigned int,   len             )
                 __field(        int,            ret             )
         ),
  
         TP_fast_assign(
-               __entry->ino    = inode->i_ino;
                 __entry->dev    = inode->i_sb->s_dev;
-               __entry->lblk   = lblk;
+               __entry->ino    = inode->i_ino;
                 __entry->pblk   = pblk;
+               __entry->lblk   = lblk;
                 __entry->len    = len;
                 __entry->ret    = ret;
         ),
@@ -1569,17 +1569,17 @@ TRACE_EVENT(ext4_ext_load_extent,
         TP_ARGS(inode, lblk, pblk),
  
         TP_STRUCT__entry(
-               __field(        ino_t,          ino             )
                 __field(        dev_t,          dev             )
-               __field(        ext4_lblk_t,    lblk            )
+               __field(        ino_t,          ino             )
                 __field(        ext4_fsblk_t,   pblk            )
+               __field(        ext4_lblk_t,    lblk            )
         ),
  
         TP_fast_assign(
-               __entry->ino    = inode->i_ino;
                 __entry->dev    = inode->i_sb->s_dev;
-               __entry->lblk   = lblk;
+               __entry->ino    = inode->i_ino;
                 __entry->pblk   = pblk;
+               __entry->lblk   = lblk;
         ),
  
         TP_printk("dev %d,%d ino %lu lblk %u pblk %llu",
@@ -1594,13 +1594,13 @@ TRACE_EVENT(ext4_load_inode,
         TP_ARGS(inode),
  
         TP_STRUCT__entry(
-               __field(        ino_t,  ino             )
                 __field(        dev_t,  dev             )
+               __field(        ino_t,  ino             )
         ),
  
         TP_fast_assign(
-               __entry->ino            = inode->i_ino;
                 __entry->dev            = inode->i_sb->s_dev;
+               __entry->ino            = inode->i_ino;
         ),
  
         TP_printk("dev %d,%d ino %ld",
@@ -1615,14 +1615,14 @@ TRACE_EVENT(ext4_journal_start,
  
         TP_STRUCT__entry(
                 __field(        dev_t,  dev                     )
-               __field(          int,  nblocks                 )
                 __field(unsigned long,  ip                      )
+               __field(        int,    nblocks                 )
         ),
  
         TP_fast_assign(
                 __entry->dev     = sb->s_dev;
-               __entry->nblocks = nblocks;
                 __entry->ip      = IP;
+               __entry->nblocks = nblocks;
         ),
  
         TP_printk("dev %d,%d nblocks %d caller %pF",
@@ -1686,23 +1686,23 @@ TRACE_EVENT(ext4_ext_handle_uninitialized_extents,
         TP_ARGS(inode, map, allocated, newblock),
  
         TP_STRUCT__entry(
-               __field(        ino_t,          ino             )
                 __field(        dev_t,          dev             )
+               __field(        ino_t,          ino             )
+               __field(        int,            flags           )
                 __field(        ext4_lblk_t,    lblk            )
                 __field(        ext4_fsblk_t,   pblk            )
                 __field(        unsigned int,   len             )
-               __field(        int,            flags           )
                 __field(        unsigned int,   allocated       )
                 __field(        ext4_fsblk_t,   newblk          )
         ),
  
         TP_fast_assign(
-               __entry->ino            = inode->i_ino;
                 __entry->dev            = inode->i_sb->s_dev;
+               __entry->ino            = inode->i_ino;
+               __entry->flags          = map->m_flags;
                 __entry->lblk           = map->m_lblk;
                 __entry->pblk           = map->m_pblk;
                 __entry->len            = map->m_len;
-               __entry->flags          = map->m_flags;
                 __entry->allocated      = allocated;
                 __entry->newblk         = newblock;
         ),
@@ -1724,19 +1724,19 @@ TRACE_EVENT(ext4_get_implied_cluster_alloc_exit,
  
         TP_STRUCT__entry(
                 __field(        dev_t,          dev     )
+               __field(        unsigned int,   flags   )
                 __field(        ext4_lblk_t,    lblk    )
                 __field(        ext4_fsblk_t,   pblk    )
                 __field(        unsigned int,   len     )
-               __field(        unsigned int,   flags   )
                 __field(        int,            ret     )
         ),
  
         TP_fast_assign(
                 __entry->dev    = sb->s_dev;
+               __entry->flags  = map->m_flags;
                 __entry->lblk   = map->m_lblk;
                 __entry->pblk   = map->m_pblk;
                 __entry->len    = map->m_len;
-               __entry->flags  = map->m_flags;
                 __entry->ret    = ret;
         ),
  
@@ -1753,16 +1753,16 @@ TRACE_EVENT(ext4_ext_put_in_cache,
         TP_ARGS(inode, lblk, len, start),
  
         TP_STRUCT__entry(
-               __field(        ino_t,          ino     )
                 __field(        dev_t,          dev     )
+               __field(        ino_t,          ino     )
                 __field(        ext4_lblk_t,    lblk    )
                 __field(        unsigned int,   len     )
                 __field(        ext4_fsblk_t,   start   )
         ),
  
         TP_fast_assign(
-               __entry->ino    = inode->i_ino;
                 __entry->dev    = inode->i_sb->s_dev;
+               __entry->ino    = inode->i_ino;
                 __entry->lblk   = lblk;
                 __entry->len    = len;
                 __entry->start  = start;
@@ -1782,15 +1782,15 @@ TRACE_EVENT(ext4_ext_in_cache,
         TP_ARGS(inode, lblk, ret),
  
         TP_STRUCT__entry(
-               __field(        ino_t,          ino     )
                 __field(        dev_t,          dev     )
+               __field(        ino_t,          ino     )
                 __field(        ext4_lblk_t,    lblk    )
                 __field(        int,            ret     )
         ),
  
         TP_fast_assign(
-               __entry->ino    = inode->i_ino;
                 __entry->dev    = inode->i_sb->s_dev;
+               __entry->ino    = inode->i_ino;
                 __entry->lblk   = lblk;
                 __entry->ret    = ret;
         ),
@@ -1810,8 +1810,8 @@ TRACE_EVENT(ext4_find_delalloc_range,
         TP_ARGS(inode, from, to, reverse, found, found_blk),
  
         TP_STRUCT__entry(
-               __field(        ino_t,          ino             )
                 __field(        dev_t,          dev             )
+               __field(        ino_t,          ino             )
                 __field(        ext4_lblk_t,    from            )
                 __field(        ext4_lblk_t,    to              )
                 __field(        int,            reverse         )
@@ -1820,8 +1820,8 @@ TRACE_EVENT(ext4_find_delalloc_range,
         ),
  
         TP_fast_assign(
-               __entry->ino            = inode->i_ino;
                 __entry->dev            = inode->i_sb->s_dev;
+               __entry->ino            = inode->i_ino;
                 __entry->from           = from;
                 __entry->to             = to;
                 __entry->reverse        = reverse;
@@ -1844,15 +1844,15 @@ TRACE_EVENT(ext4_get_reserved_cluster_alloc,
         TP_ARGS(inode, lblk, len),
  
         TP_STRUCT__entry(
-               __field(        ino_t,          ino     )
                 __field(        dev_t,          dev     )
+               __field(        ino_t,          ino     )
                 __field(        ext4_lblk_t,    lblk    )
                 __field(        unsigned int,   len     )
         ),
  
         TP_fast_assign(
-               __entry->ino    = inode->i_ino;
                 __entry->dev    = inode->i_sb->s_dev;
+               __entry->ino    = inode->i_ino;
                 __entry->lblk   = lblk;
                 __entry->len    = len;
         ),
@@ -1871,18 +1871,18 @@ TRACE_EVENT(ext4_ext_show_extent,
         TP_ARGS(inode, lblk, pblk, len),
  
         TP_STRUCT__entry(
-               __field(        ino_t,          ino     )
                 __field(        dev_t,          dev     )
-               __field(        ext4_lblk_t,    lblk    )
+               __field(        ino_t,          ino     )
                 __field(        ext4_fsblk_t,   pblk    )
+               __field(        ext4_lblk_t,    lblk    )
                 __field(        unsigned short, len     )
         ),
  
         TP_fast_assign(
-               __entry->ino    = inode->i_ino;
                 __entry->dev    = inode->i_sb->s_dev;
-               __entry->lblk   = lblk;
+               __entry->ino    = inode->i_ino;
                 __entry->pblk   = pblk;
+               __entry->lblk   = lblk;
                 __entry->len    = len;
         ),
  
@@ -1902,25 +1902,25 @@ TRACE_EVENT(ext4_remove_blocks,
         TP_ARGS(inode, ex, from, to, partial_cluster),
  
         TP_STRUCT__entry(
-               __field(        ino_t,          ino     )
                 __field(        dev_t,          dev     )
-               __field(        ext4_lblk_t,    ee_lblk )
-               __field(        ext4_fsblk_t,   ee_pblk )
-               __field(        unsigned short, ee_len  )
+               __field(        ino_t,          ino     )
                 __field(        ext4_lblk_t,    from    )
                 __field(        ext4_lblk_t,    to      )
                 __field(        ext4_fsblk_t,   partial )
+               __field(        ext4_fsblk_t,   ee_pblk )
+               __field(        ext4_lblk_t,    ee_lblk )
+               __field(        unsigned short, ee_len  )
         ),
  
         TP_fast_assign(
-               __entry->ino            = inode->i_ino;
                 __entry->dev            = inode->i_sb->s_dev;
-               __entry->ee_lblk        = cpu_to_le32(ex->ee_block);
-               __entry->ee_pblk        = ext4_ext_pblock(ex);
-               __entry->ee_len         = ext4_ext_get_actual_len(ex);
+               __entry->ino            = inode->i_ino;
                 __entry->from           = from;
                 __entry->to             = to;
                 __entry->partial        = partial_cluster;
+               __entry->ee_pblk        = ext4_ext_pblock(ex);
+               __entry->ee_lblk        = cpu_to_le32(ex->ee_block);
+               __entry->ee_len         = ext4_ext_get_actual_len(ex);
         ),
  
         TP_printk("dev %d,%d ino %lu extent [%u(%llu), %u]"
@@ -1942,23 +1942,23 @@ TRACE_EVENT(ext4_ext_rm_leaf,
         TP_ARGS(inode, start, ex, partial_cluster),
  
         TP_STRUCT__entry(
-               __field(        ino_t,          ino     )
                 __field(        dev_t,          dev     )
+               __field(        ino_t,          ino     )
+               __field(        ext4_fsblk_t,   partial )
                 __field(        ext4_lblk_t,    start   )
                 __field(        ext4_lblk_t,    ee_lblk )
                 __field(        ext4_fsblk_t,   ee_pblk )
                 __field(        short,          ee_len  )
-               __field(        ext4_fsblk_t,   partial )
         ),
  
         TP_fast_assign(
-               __entry->ino            = inode->i_ino;
                 __entry->dev            = inode->i_sb->s_dev;
+               __entry->ino            = inode->i_ino;
+               __entry->partial        = partial_cluster;
                 __entry->start          = start;
                 __entry->ee_lblk        = le32_to_cpu(ex->ee_block);
                 __entry->ee_pblk        = ext4_ext_pblock(ex);
                 __entry->ee_len         = ext4_ext_get_actual_len(ex);
-               __entry->partial        = partial_cluster;
         ),
  
         TP_printk("dev %d,%d ino %lu start_lblk %u last_extent [%u(%llu), %u]"
@@ -1978,14 +1978,14 @@ TRACE_EVENT(ext4_ext_rm_idx,
         TP_ARGS(inode, pblk),
  
         TP_STRUCT__entry(
-               __field(        ino_t,          ino     )
                 __field(        dev_t,          dev     )
+               __field(        ino_t,          ino     )
                 __field(        ext4_fsblk_t,   pblk    )
         ),
  
         TP_fast_assign(
-               __entry->ino    = inode->i_ino;
                 __entry->dev    = inode->i_sb->s_dev;
+               __entry->ino    = inode->i_ino;
                 __entry->pblk   = pblk;
         ),
  
@@ -2001,15 +2001,15 @@ TRACE_EVENT(ext4_ext_remove_space,
         TP_ARGS(inode, start, depth),
  
         TP_STRUCT__entry(
-               __field(        ino_t,          ino     )
                 __field(        dev_t,          dev     )
+               __field(        ino_t,          ino     )
                 __field(        ext4_lblk_t,    start   )
                 __field(        int,            depth   )
         ),
  
         TP_fast_assign(
-               __entry->ino    = inode->i_ino;
                 __entry->dev    = inode->i_sb->s_dev;
+               __entry->ino    = inode->i_ino;
                 __entry->start  = start;
                 __entry->depth  = depth;
         ),
@@ -2028,8 +2028,8 @@ TRACE_EVENT(ext4_ext_remove_space_done,
         TP_ARGS(inode, start, depth, partial, eh_entries),
  
         TP_STRUCT__entry(
-               __field(        ino_t,          ino             )
                 __field(        dev_t,          dev             )
+               __field(        ino_t,          ino             )
                 __field(        ext4_lblk_t,    start           )
                 __field(        int,            depth           )
                 __field(        ext4_lblk_t,    partial         )
@@ -2037,8 +2037,8 @@ TRACE_EVENT(ext4_ext_remove_space_done,
         ),
  
         TP_fast_assign(
-               __entry->ino            = inode->i_ino;
                 __entry->dev            = inode->i_sb->s_dev;
+               __entry->ino            = inode->i_ino;
                 __entry->start          = start;
                 __entry->depth          = depth;
                 __entry->partial        = partial;
author	Linus Torvalds <torvalds@linux-foundation.org>
	Sun, 7 Oct 2012 21:36:39 +0000 (06:36 +0900)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Sun, 7 Oct 2012 21:36:39 +0000 (06:36 +0900)
Documentation/ABI/testing/sysfs-fs-ext4		patch \| blob \| history
Documentation/filesystems/ext4.txt		patch \| blob \| history
fs/buffer.c		patch \| blob \| history
fs/ext4/ext4.h		patch \| blob \| history
fs/ext4/extents.c		patch \| blob \| history
fs/ext4/file.c		patch \| blob \| history
fs/ext4/fsync.c		patch \| blob \| history
fs/ext4/ialloc.c		patch \| blob \| history
fs/ext4/indirect.c		patch \| blob \| history
fs/ext4/inode.c		patch \| blob \| history
fs/ext4/ioctl.c		patch \| blob \| history
fs/ext4/mballoc.c		patch \| blob \| history
fs/ext4/mballoc.h		patch \| blob \| history
fs/ext4/move_extent.c		patch \| blob \| history
fs/ext4/namei.c		patch \| blob \| history
fs/ext4/page-io.c		patch \| blob \| history
fs/ext4/resize.c		patch \| blob \| history
fs/ext4/super.c		patch \| blob \| history
fs/fs-writeback.c		patch \| blob \| history
fs/jbd2/commit.c		patch \| blob \| history
fs/jbd2/journal.c		patch \| blob \| history
fs/jbd2/recovery.c		patch \| blob \| history
fs/jbd2/transaction.c		patch \| blob \| history
fs/nilfs2/file.c		patch \| blob \| history
include/linux/falloc.h		patch \| blob \| history
include/trace/events/ext4.h		patch \| blob \| history