ext4: add delalloc support for inline data

author Tao Ma <boyu.mt@taobao.com>

Mon, 10 Dec 2012 19:05:57 +0000 (14:05 -0500)

committer Theodore Ts'o <tytso@mit.edu>

Mon, 10 Dec 2012 19:05:57 +0000 (14:05 -0500)
author Tao Ma <boyu.mt@taobao.com>
Mon, 10 Dec 2012 19:05:57 +0000 (14:05 -0500)
committer Theodore Ts'o <tytso@mit.edu>
Mon, 10 Dec 2012 19:05:57 +0000 (14:05 -0500)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h

index 9f4efc6..268636a 100644 (file)
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2022,6 +2022,8 @@ int ext4_get_block_write(struct inode *inode, sector_t iblock,
                          struct buffer_head *bh_result, int create);
  int ext4_get_block(struct inode *inode, sector_t iblock,
                                 struct buffer_head *bh_result, int create);
+int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
+                          struct buffer_head *bh, int create);
  int ext4_walk_page_buffers(handle_t *handle,
                            struct buffer_head *head,
                            unsigned from,
@@ -2031,6 +2033,8 @@ int ext4_walk_page_buffers(handle_t *handle,
                                      struct buffer_head *bh));
  int do_journal_get_write_access(handle_t *handle,
                                 struct buffer_head *bh);
+#define FALL_BACK_TO_NONDELALLOC 1
+#define CONVERT_INLINE_DATA     2
  
  extern struct inode *ext4_iget(struct super_block *, unsigned long);
  extern int  ext4_write_inode(struct inode *, struct writeback_control *);
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c

index 01274b1..65f7ffb 100644 (file)
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -771,6 +771,183 @@ ext4_journalled_write_inline_data(struct inode *inode,
         return iloc.bh;
  }
  
+/*
+ * Try to make the page cache and handle ready for the inline data case.
+ * We can call this function in 2 cases:
+ * 1. The inode is created and the first write exceeds inline size. We can
+ *    clear the inode state safely.
+ * 2. The inode has inline data, then we need to read the data, make it
+ *    update and dirty so that ext4_da_writepages can handle it. We don't
+ *    need to start the journal since the file's metatdata isn't changed now.
+ */
+static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
+                                                struct inode *inode,
+                                                unsigned flags,
+                                                void **fsdata)
+{
+       int ret = 0, inline_size;
+       struct page *page;
+
+       page = grab_cache_page_write_begin(mapping, 0, flags);
+       if (!page)
+               return -ENOMEM;
+
+       down_read(&EXT4_I(inode)->xattr_sem);
+       if (!ext4_has_inline_data(inode)) {
+               ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+               goto out;
+       }
+
+       inline_size = ext4_get_inline_size(inode);
+
+       if (!PageUptodate(page)) {
+               ret = ext4_read_inline_page(inode, page);
+               if (ret < 0)
+                       goto out;
+       }
+
+       ret = __block_write_begin(page, 0, inline_size,
+                                 ext4_da_get_block_prep);
+       if (ret) {
+               ext4_truncate_failed_write(inode);
+               goto out;
+       }
+
+       SetPageDirty(page);
+       SetPageUptodate(page);
+       ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+       *fsdata = (void *)CONVERT_INLINE_DATA;
+
+out:
+       up_read(&EXT4_I(inode)->xattr_sem);
+       if (page) {
+               unlock_page(page);
+               page_cache_release(page);
+       }
+       return ret;
+}
+
+/*
+ * Prepare the write for the inline data.
+ * If the the data can be written into the inode, we just read
+ * the page and make it uptodate, and start the journal.
+ * Otherwise read the page, makes it dirty so that it can be
+ * handle in writepages(the i_disksize update is left to the
+ * normal ext4_da_write_end).
+ */
+int ext4_da_write_inline_data_begin(struct address_space *mapping,
+                                   struct inode *inode,
+                                   loff_t pos, unsigned len,
+                                   unsigned flags,
+                                   struct page **pagep,
+                                   void **fsdata)
+{
+       int ret, inline_size;
+       handle_t *handle;
+       struct page *page;
+       struct ext4_iloc iloc;
+
+       ret = ext4_get_inode_loc(inode, &iloc);
+       if (ret)
+               return ret;
+
+       handle = ext4_journal_start(inode, 1);
+       if (IS_ERR(handle)) {
+               ret = PTR_ERR(handle);
+               handle = NULL;
+               goto out;
+       }
+
+       inline_size = ext4_get_max_inline_size(inode);
+
+       ret = -ENOSPC;
+       if (inline_size >= pos + len) {
+               ret = ext4_prepare_inline_data(handle, inode, pos + len);
+               if (ret && ret != -ENOSPC)
+                       goto out;
+       }
+
+       if (ret == -ENOSPC) {
+               ret = ext4_da_convert_inline_data_to_extent(mapping,
+                                                           inode,
+                                                           flags,
+                                                           fsdata);
+               goto out;
+       }
+
+       /*
+        * We cannot recurse into the filesystem as the transaction
+        * is already started.
+        */
+       flags |= AOP_FLAG_NOFS;
+
+       page = grab_cache_page_write_begin(mapping, 0, flags);
+       if (!page) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       down_read(&EXT4_I(inode)->xattr_sem);
+       if (!ext4_has_inline_data(inode)) {
+               ret = 0;
+               goto out_release_page;
+       }
+
+       if (!PageUptodate(page)) {
+               ret = ext4_read_inline_page(inode, page);
+               if (ret < 0)
+                       goto out_release_page;
+       }
+
+       up_read(&EXT4_I(inode)->xattr_sem);
+       *pagep = page;
+       handle = NULL;
+       brelse(iloc.bh);
+       return 1;
+out_release_page:
+       up_read(&EXT4_I(inode)->xattr_sem);
+       unlock_page(page);
+       page_cache_release(page);
+out:
+       if (handle)
+               ext4_journal_stop(handle);
+       brelse(iloc.bh);
+       return ret;
+}
+
+int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
+                                 unsigned len, unsigned copied,
+                                 struct page *page)
+{
+       int i_size_changed = 0;
+
+       copied = ext4_write_inline_data_end(inode, pos, len, copied, page);
+
+       /*
+        * No need to use i_size_read() here, the i_size
+        * cannot change under us because we hold i_mutex.
+        *
+        * But it's important to update i_size while still holding page lock:
+        * page writeout could otherwise come in and zero beyond i_size.
+        */
+       if (pos+copied > inode->i_size) {
+               i_size_write(inode, pos+copied);
+               i_size_changed = 1;
+       }
+       unlock_page(page);
+       page_cache_release(page);
+
+       /*
+        * Don't mark the inode dirty under page lock. First, it unnecessarily
+        * makes the holding time of page lock longer. Second, it forces lock
+        * ordering of page lock and transaction start for journaling
+        * filesystems.
+        */
+       if (i_size_changed)
+               mark_inode_dirty(inode);
+
+       return copied;
+}
  
  int ext4_destroy_inline_data(handle_t *handle, struct inode *inode)
  {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c

index 5c91622..f16ae02 100644 (file)
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1790,7 +1790,19 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
          * file system block.
          */
         down_read((&EXT4_I(inode)->i_data_sem));
-       if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+       if (ext4_has_inline_data(inode)) {
+               /*
+                * We will soon create blocks for this page, and let
+                * us pretend as if the blocks aren't allocated yet.
+                * In case of clusters, we have to handle the work
+                * of mapping from cluster so that the reserved space
+                * is calculated properly.
+                */
+               if ((EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) &&
+                   ext4_find_delalloc_cluster(inode, map->m_lblk))
+                       map->m_flags |= EXT4_MAP_FROM_CLUSTER;
+               retval = 0;
+       } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                 retval = ext4_ext_map_blocks(NULL, inode, map, 0);
         else
                 retval = ext4_ind_map_blocks(NULL, inode, map, 0);
@@ -1841,8 +1853,8 @@ out_unlock:
   * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev
   * initialized properly.
   */
-static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
-                                 struct buffer_head *bh, int create)
+int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
+                          struct buffer_head *bh, int create)
  {
         struct ext4_map_blocks map;
         int ret = 0;
@@ -2119,7 +2131,8 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
   * mpage_da_map_and_submit to map a single contiguous memory region
   * and then write them.
   */
-static int write_cache_pages_da(struct address_space *mapping,
+static int write_cache_pages_da(handle_t *handle,
+                               struct address_space *mapping,
                                 struct writeback_control *wbc,
                                 struct mpage_da_data *mpd,
                                 pgoff_t *done_index)
@@ -2198,6 +2211,17 @@ static int write_cache_pages_da(struct address_space *mapping,
                         wait_on_page_writeback(page);
                         BUG_ON(PageWriteback(page));
  
+                       /*
+                        * If we have inline data and arrive here, it means that
+                        * we will soon create the block for the 1st page, so
+                        * we'd better clear the inline data here.
+                        */
+                       if (ext4_has_inline_data(inode)) {
+                               BUG_ON(ext4_test_inode_state(inode,
+                                               EXT4_STATE_MAY_INLINE_DATA));
+                               ext4_destroy_inline_data(handle, inode);
+                       }
+
                         if (mpd->next_page != page->index)
                                 mpd->first_page = page->index;
                         mpd->next_page = page->index + 1;
@@ -2404,7 +2428,8 @@ retry:
                  * contiguous region of logical blocks that need
                  * blocks to be allocated by ext4 and submit them.
                  */
-               ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index);
+               ret = write_cache_pages_da(handle, mapping,
+                                          wbc, &mpd, &done_index);
                 /*
                  * If we have a contiguous extent of pages and we
                  * haven't done the I/O yet, map the blocks and submit
@@ -2468,7 +2493,6 @@ out_writepages:
         return ret;
  }
  
-#define FALL_BACK_TO_NONDELALLOC 1
  static int ext4_nonda_switch(struct super_block *sb)
  {
         s64 free_blocks, dirty_blocks;
@@ -2525,6 +2549,19 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
         }
         *fsdata = (void *)0;
         trace_ext4_da_write_begin(inode, pos, len, flags);
+
+       if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
+               ret = ext4_da_write_inline_data_begin(mapping, inode,
+                                                     pos, len, flags,
+                                                     pagep, fsdata);
+               if (ret < 0)
+                       goto out;
+               if (ret == 1) {
+                       ret = 0;
+                       goto out;
+               }
+       }
+
  retry:
         /*
          * With delayed allocation, we don't log the i_disksize update
@@ -2626,10 +2663,10 @@ static int ext4_da_write_end(struct file *file,
          * changes.  So let's piggyback the i_disksize mark_inode_dirty
          * into that.
          */
-
         new_i_size = pos + copied;
         if (copied && new_i_size > EXT4_I(inode)->i_disksize) {
-               if (ext4_da_should_update_i_disksize(page, end)) {
+               if (ext4_has_inline_data(inode) ||
+                   ext4_da_should_update_i_disksize(page, end)) {
                         down_write(&EXT4_I(inode)->i_data_sem);
                         if (new_i_size > EXT4_I(inode)->i_disksize)
                                 EXT4_I(inode)->i_disksize = new_i_size;
@@ -2641,8 +2678,16 @@ static int ext4_da_write_end(struct file *file,
                         ext4_mark_inode_dirty(handle, inode);
                 }
         }
-       ret2 = generic_write_end(file, mapping, pos, len, copied,
+
+       if (write_mode != CONVERT_INLINE_DATA &&
+           ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) &&
+           ext4_has_inline_data(inode))
+               ret2 = ext4_da_write_inline_data_end(inode, pos, len, copied,
+                                                    page);
+       else
+               ret2 = generic_write_end(file, mapping, pos, len, copied,
                                                         page, fsdata);
+
         copied = ret2;
         if (ret2 < 0)
                 ret = ret2;
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h

index 7095ac1..37e66f8 100644 (file)
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -154,6 +154,15 @@ extern struct buffer_head *
  ext4_journalled_write_inline_data(struct inode *inode,
                                   unsigned len,
                                   struct page *page);
+extern int ext4_da_write_inline_data_begin(struct address_space *mapping,
+                                          struct inode *inode,
+                                          loff_t pos, unsigned len,
+                                          unsigned flags,
+                                          struct page **pagep,
+                                          void **fsdata);
+extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
+                                        unsigned len, unsigned copied,
+                                        struct page *page);
  # else  /* CONFIG_EXT4_FS_XATTR */
  
  static inline int
@@ -300,6 +309,24 @@ ext4_journalled_write_inline_data(struct inode *inode,
  {
         return NULL;
  }
+
+static inline int
+ext4_da_write_inline_data_begin(struct address_space *mapping,
+                               struct inode *inode,
+                               loff_t pos, unsigned len,
+                               unsigned flags,
+                               struct page **pagep,
+                               void **fsdata)
+{
+       return 0;
+}
+
+static inline int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
+                                               unsigned len, unsigned copied,
+                                               struct page *page)
+{
+       return 0;
+}
  # endif  /* CONFIG_EXT4_FS_XATTR */
  
  #ifdef CONFIG_EXT4_FS_SECURITY
author	Tao Ma <boyu.mt@taobao.com>
	Mon, 10 Dec 2012 19:05:57 +0000 (14:05 -0500)
committer	Theodore Ts'o <tytso@mit.edu>
	Mon, 10 Dec 2012 19:05:57 +0000 (14:05 -0500)
fs/ext4/ext4.h		patch \| blob \| history
fs/ext4/inline.c		patch \| blob \| history
fs/ext4/inode.c		patch \| blob \| history
fs/ext4/xattr.h		patch \| blob \| history