Btrfs: add hole punching
author Josef Bacik <jbacik@fusionio.com>
Wed, 29 Aug 2012 18:27:18 +0000 (14:27 -0400)
committer Chris Mason <chris.mason@fusionio.com>
Mon, 1 Oct 2012 19:19:07 +0000 (15:19 -0400)
This patch adds hole punching via fallocate.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
fs/btrfs/ctree.h
fs/btrfs/extent-tree.c
fs/btrfs/file.c
fs/btrfs/inode.c
fs/btrfs/tree-log.c

index 71d6ff1..88adfe6 100644 (file)
@@ -3250,6 +3250,8 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
                        struct btrfs_root *root,
                        struct inode *dir, u64 objectid,
                        const char *name, int name_len);
+int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
+                       int front);
 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root,
                               struct inode *inode, u64 new_size,
@@ -3323,7 +3325,7 @@ extern const struct file_operations btrfs_file_operations;
 int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
                         struct btrfs_root *root, struct inode *inode,
                         struct btrfs_path *path, u64 start, u64 end,
-                        int drop_cache);
+                        u64 *drop_end, int drop_cache);
 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root, struct inode *inode, u64 start,
                       u64 end, int drop_cache);
index df2f17b..ee51b7a 100644 (file)
@@ -4132,6 +4132,8 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
 void btrfs_free_block_rsv(struct btrfs_root *root,
                          struct btrfs_block_rsv *rsv)
 {
+       if (!rsv)
+               return;
        btrfs_block_rsv_release(root, rsv, (u64)-1);
        kfree(rsv);
 }
index 58598c2..57026a6 100644 (file)
@@ -39,6 +39,7 @@
 #include "tree-log.h"
 #include "locking.h"
 #include "compat.h"
+#include "volumes.h"
 
 /*
  * when auto defrag is enabled we
@@ -584,7 +585,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
                         struct btrfs_root *root, struct inode *inode,
                         struct btrfs_path *path, u64 start, u64 end,
-                        int drop_cache)
+                        u64 *drop_end, int drop_cache)
 {
        struct extent_buffer *leaf;
        struct btrfs_file_extent_item *fi;
@@ -822,6 +823,8 @@ next_slot:
                        btrfs_abort_transaction(trans, root, ret);
        }
 
+       if (drop_end)
+               *drop_end = min(end, extent_end);
        btrfs_release_path(path);
        return ret;
 }
@@ -836,7 +839,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
-       ret = __btrfs_drop_extents(trans, root, inode, path, start, end,
+       ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL,
                                   drop_cache);
        btrfs_free_path(path);
        return ret;
@@ -1645,6 +1648,324 @@ static int btrfs_file_mmap(struct file  *filp, struct vm_area_struct *vma)
        return 0;
 }
 
+static int hole_mergeable(struct inode *inode, struct extent_buffer *leaf,
+                         int slot, u64 start, u64 end)
+{
+       struct btrfs_file_extent_item *fi;
+       struct btrfs_key key;
+
+       if (slot < 0 || slot >= btrfs_header_nritems(leaf))
+               return 0;
+
+       btrfs_item_key_to_cpu(leaf, &key, slot);
+       if (key.objectid != btrfs_ino(inode) ||
+           key.type != BTRFS_EXTENT_DATA_KEY)
+               return 0;
+
+       fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+
+       if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
+               return 0;
+
+       if (btrfs_file_extent_disk_bytenr(leaf, fi))
+               return 0;
+
+       if (key.offset == end)
+               return 1;
+       if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
+               return 1;
+       return 0;
+}
+
+static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
+                     struct btrfs_path *path, u64 offset, u64 end)
+{
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct extent_buffer *leaf;
+       struct btrfs_file_extent_item *fi;
+       struct extent_map *hole_em;
+       struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+       struct btrfs_key key;
+       int ret;
+
+       key.objectid = btrfs_ino(inode);
+       key.type = BTRFS_EXTENT_DATA_KEY;
+       key.offset = offset;
+
+
+       ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+       if (ret < 0)
+               return ret;
+       BUG_ON(!ret);
+
+       leaf = path->nodes[0];
+       if (hole_mergeable(inode, leaf, path->slots[0]-1, offset, end)) {
+               u64 num_bytes;
+
+               path->slots[0]--;
+               fi = btrfs_item_ptr(leaf, path->slots[0],
+                                   struct btrfs_file_extent_item);
+               num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
+                       end - offset;
+               btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
+               btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
+               btrfs_set_file_extent_offset(leaf, fi, 0);
+               btrfs_mark_buffer_dirty(leaf);
+               goto out;
+       }
+
+       if (hole_mergeable(inode, leaf, path->slots[0]+1, offset, end)) {
+               u64 num_bytes;
+
+               path->slots[0]++;
+               key.offset = offset;
+               btrfs_set_item_key_safe(trans, root, path, &key);
+               fi = btrfs_item_ptr(leaf, path->slots[0],
+                                   struct btrfs_file_extent_item);
+               num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
+                       offset;
+               btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
+               btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
+               btrfs_set_file_extent_offset(leaf, fi, 0);
+               btrfs_mark_buffer_dirty(leaf);
+               goto out;
+       }
+       btrfs_release_path(path);
+
+       ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
+                                      0, 0, end - offset, 0, end - offset,
+                                      0, 0, 0);
+       if (ret)
+               return ret;
+
+out:
+       btrfs_release_path(path);
+
+       hole_em = alloc_extent_map();
+       if (!hole_em) {
+               btrfs_drop_extent_cache(inode, offset, end - 1, 0);
+               set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+                       &BTRFS_I(inode)->runtime_flags);
+       } else {
+               hole_em->start = offset;
+               hole_em->len = end - offset;
+               hole_em->orig_start = offset;
+
+               hole_em->block_start = EXTENT_MAP_HOLE;
+               hole_em->block_len = 0;
+               hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
+               hole_em->compress_type = BTRFS_COMPRESS_NONE;
+               hole_em->generation = trans->transid;
+
+               do {
+                       btrfs_drop_extent_cache(inode, offset, end - 1, 0);
+                       write_lock(&em_tree->lock);
+                       ret = add_extent_mapping(em_tree, hole_em);
+                       if (!ret)
+                               list_move(&hole_em->list,
+                                         &em_tree->modified_extents);
+                       write_unlock(&em_tree->lock);
+               } while (ret == -EEXIST);
+               free_extent_map(hole_em);
+               if (ret)
+                       set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+                               &BTRFS_I(inode)->runtime_flags);
+       }
+
+       return 0;
+}
+
+static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
+{
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct extent_state *cached_state = NULL;
+       struct btrfs_path *path;
+       struct btrfs_block_rsv *rsv;
+       struct btrfs_trans_handle *trans;
+       u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
+       u64 lockstart = (offset + mask) & ~mask;
+       u64 lockend = ((offset + len) & ~mask) - 1;
+       u64 cur_offset = lockstart;
+       u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
+       u64 drop_end;
+       unsigned long nr;
+       int ret = 0;
+       int err = 0;
+       bool same_page = (offset >> PAGE_CACHE_SHIFT) ==
+               ((offset + len) >> PAGE_CACHE_SHIFT);
+
+       btrfs_wait_ordered_range(inode, offset, len);
+
+       mutex_lock(&inode->i_mutex);
+       if (offset >= inode->i_size) {
+               mutex_unlock(&inode->i_mutex);
+               return 0;
+       }
+
+       /*
+        * Only do this if we are in the same page and we aren't doing the
+        * entire page.
+        */
+       if (same_page && len < PAGE_CACHE_SIZE) {
+               ret = btrfs_truncate_page(inode, offset, len, 0);
+               mutex_unlock(&inode->i_mutex);
+               return ret;
+       }
+
+       /* zero back part of the first page */
+       ret = btrfs_truncate_page(inode, offset, 0, 0);
+       if (ret) {
+               mutex_unlock(&inode->i_mutex);
+               return ret;
+       }
+
+       /* zero the front end of the last page */
+       ret = btrfs_truncate_page(inode, offset + len, 0, 1);
+       if (ret) {
+               mutex_unlock(&inode->i_mutex);
+               return ret;
+       }
+
+       if (lockend < lockstart) {
+               mutex_unlock(&inode->i_mutex);
+               return 0;
+       }
+
+       while (1) {
+               struct btrfs_ordered_extent *ordered;
+
+               truncate_pagecache_range(inode, lockstart, lockend);
+
+               lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+                                0, &cached_state);
+               ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
+
+               /*
+                * We need to make sure we have no ordered extents in this range
+                * and nobody raced in and read a page in this range, if we did
+                * we need to try again.
+                */
+               if ((!ordered ||
+                   (ordered->file_offset + ordered->len < lockstart ||
+                    ordered->file_offset > lockend)) &&
+                    !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart,
+                                    lockend, EXTENT_UPTODATE, 0,
+                                    cached_state)) {
+                       if (ordered)
+                               btrfs_put_ordered_extent(ordered);
+                       break;
+               }
+               if (ordered)
+                       btrfs_put_ordered_extent(ordered);
+               unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
+                                    lockend, &cached_state, GFP_NOFS);
+               btrfs_wait_ordered_range(inode, lockstart,
+                                        lockend - lockstart + 1);
+       }
+
+       path = btrfs_alloc_path();
+       if (!path) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       rsv = btrfs_alloc_block_rsv(root);
+       if (!rsv) {
+               ret = -ENOMEM;
+               goto out_free;
+       }
+       rsv->size = btrfs_calc_trunc_metadata_size(root, 1);
+       rsv->failfast = 1;
+
+       /*
+        * 1 - update the inode
+        * 1 - removing the extents in the range
+        * 1 - adding the hole extent
+        */
+       trans = btrfs_start_transaction(root, 3);
+       if (IS_ERR(trans)) {
+               err = PTR_ERR(trans);
+               goto out_free;
+       }
+
+       ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
+                                     min_size);
+       BUG_ON(ret);
+       trans->block_rsv = rsv;
+
+       while (cur_offset < lockend) {
+               ret = __btrfs_drop_extents(trans, root, inode, path,
+                                          cur_offset, lockend + 1,
+                                          &drop_end, 1);
+               if (ret != -ENOSPC)
+                       break;
+
+               trans->block_rsv = &root->fs_info->trans_block_rsv;
+
+               ret = fill_holes(trans, inode, path, cur_offset, drop_end);
+               if (ret) {
+                       err = ret;
+                       break;
+               }
+
+               cur_offset = drop_end;
+
+               ret = btrfs_update_inode(trans, root, inode);
+               if (ret) {
+                       err = ret;
+                       break;
+               }
+
+               nr = trans->blocks_used;
+               btrfs_end_transaction(trans, root);
+               btrfs_btree_balance_dirty(root, nr);
+
+               trans = btrfs_start_transaction(root, 3);
+               if (IS_ERR(trans)) {
+                       ret = PTR_ERR(trans);
+                       trans = NULL;
+                       break;
+               }
+
+               ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
+                                             rsv, min_size);
+               BUG_ON(ret);    /* shouldn't happen */
+               trans->block_rsv = rsv;
+       }
+
+       if (ret) {
+               err = ret;
+               goto out_trans;
+       }
+
+       trans->block_rsv = &root->fs_info->trans_block_rsv;
+       ret = fill_holes(trans, inode, path, cur_offset, drop_end);
+       if (ret) {
+               err = ret;
+               goto out_trans;
+       }
+
+out_trans:
+       if (!trans)
+               goto out_free;
+
+       trans->block_rsv = &root->fs_info->trans_block_rsv;
+       ret = btrfs_update_inode(trans, root, inode);
+       nr = trans->blocks_used;
+       btrfs_end_transaction(trans, root);
+       btrfs_btree_balance_dirty(root, nr);
+out_free:
+       btrfs_free_path(path);
+       btrfs_free_block_rsv(root, rsv);
+out:
+       unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+                            &cached_state, GFP_NOFS);
+       mutex_unlock(&inode->i_mutex);
+       if (ret && !err)
+               err = ret;
+       return err;
+}
+
 static long btrfs_fallocate(struct file *file, int mode,
                            loff_t offset, loff_t len)
 {
@@ -1663,10 +1984,13 @@ static long btrfs_fallocate(struct file *file, int mode,
        alloc_start = offset & ~mask;
        alloc_end =  (offset + len + mask) & ~mask;
 
-       /* We only support the FALLOC_FL_KEEP_SIZE mode */
-       if (mode & ~FALLOC_FL_KEEP_SIZE)
+       /* Make sure we aren't being given some crap mode */
+       if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
                return -EOPNOTSUPP;
 
+       if (mode & FALLOC_FL_PUNCH_HOLE)
+               return btrfs_punch_hole(inode, offset, len);
+
        /*
         * Make sure we have enough space before we do the
         * allocation.
index f0a4792..8cb272c 100644 (file)
@@ -3475,12 +3475,20 @@ error:
 }
 
 /*
- * taken from block_truncate_page, but does cow as it zeros out
- * any bytes left in the last page in the file.
+ * btrfs_truncate_page - read, zero a chunk and write a page
+ * @inode - inode that we're zeroing
+ * @from - the offset to start zeroing
+ * @len - the length to zero, 0 to zero the entire range starting from the
+ *     offset
+ * @front - zero up to the offset instead of from the offset on
+ *
+ * This will find the page for the "from" offset and cow the page and zero the
+ * part we want to zero.  This is used with truncate and hole punching.
  */
-static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
+int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
+                       int front)
 {
-       struct inode *inode = mapping->host;
+       struct address_space *mapping = inode->i_mapping;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        struct btrfs_ordered_extent *ordered;
@@ -3495,7 +3503,8 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
        u64 page_start;
        u64 page_end;
 
-       if ((offset & (blocksize - 1)) == 0)
+       if ((offset & (blocksize - 1)) == 0 &&
+           (!len || ((len & (blocksize - 1)) == 0)))
                goto out;
        ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
        if (ret)
@@ -3555,8 +3564,13 @@ again:
 
        ret = 0;
        if (offset != PAGE_CACHE_SIZE) {
+               if (!len)
+                       len = PAGE_CACHE_SIZE - offset;
                kaddr = kmap(page);
-               memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
+               if (front)
+                       memset(kaddr, 0, offset);
+               else
+                       memset(kaddr + offset, 0, len);
                flush_dcache_page(page);
                kunmap(page);
        }
@@ -6796,7 +6810,7 @@ static int btrfs_truncate(struct inode *inode)
        u64 mask = root->sectorsize - 1;
        u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
 
-       ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
+       ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
        if (ret)
                return ret;
 
index 0c39f58..f9b0fc9 100644 (file)
@@ -2842,7 +2842,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
 
        if (BTRFS_I(inode)->logged_trans == trans->transid) {
                ret = __btrfs_drop_extents(trans, log, inode, dst_path, start,
-                                          start + len, 0);
+                                          start + len, NULL, 0);
                if (ret)
                        return ret;
        }