Btrfs: Do metadata checksums for reads via a workqueue
author Chris Mason <chris.mason@oracle.com>
Wed, 9 Apr 2008 20:28:12 +0000 (16:28 -0400)
committer Chris Mason <chris.mason@oracle.com>
Thu, 25 Sep 2008 15:04:01 +0000 (11:04 -0400)
Before, metadata checksumming was done by the callers of read_tree_block,
which would set EXTENT_CSUM bits in the extent tree to show that a given
range of pages was already checksummed and didn't need to be verified
again.

But, those bits could go away via try_to_releasepage, and the end
result was bogus checksum failures on pages that never left the cache.

The new code validates checksums when the page is read.  It is a little
tricky because metadata blocks can span pages and a single read may
end up going via multiple bios.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
fs/btrfs/ctree.c
fs/btrfs/ctree.h
fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/extent_io.c
fs/btrfs/extent_io.h
fs/btrfs/transaction.c

index c92c6b0..efce173 100644 (file)
@@ -158,9 +158,6 @@ int __btrfs_cow_block(struct btrfs_trans_handle *trans,
        } else {
                root_gen = 0;
        }
-       if (!(buf->flags & EXTENT_CSUM))
-               WARN_ON(1);
-
        WARN_ON(root->ref_cows && trans->transid !=
                root->fs_info->running_transaction->transid);
        WARN_ON(root->ref_cows && trans->transid != root->last_trans);
@@ -247,8 +244,6 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans,
                       root->fs_info->generation);
                WARN_ON(1);
        }
-       if (!(buf->flags & EXTENT_CSUM))
-               WARN_ON(1);
 
        header_trans = btrfs_header_generation(buf);
        spin_lock(&root->fs_info->hash_lock);
index 4b3b204..e803c4d 100644 (file)
@@ -485,6 +485,10 @@ struct btrfs_fs_info {
        struct list_head trans_list;
        struct list_head hashers;
        struct list_head dead_roots;
+       struct list_head end_io_work_list;
+       struct work_struct end_io_work;
+       spinlock_t end_io_work_lock;
+
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
        struct work_struct trans_work;
 #else
index e444b99..8210920 100644 (file)
@@ -24,6 +24,7 @@
 #include <linux/radix-tree.h>
 #include <linux/writeback.h>
 #include <linux/buffer_head.h> // for block_sync_page
+#include <linux/workqueue.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -45,6 +46,16 @@ static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf)
 #endif
 
 static struct extent_io_ops btree_extent_io_ops;
+static struct workqueue_struct *end_io_workqueue;
+
+/*
+ * Bookkeeping for one read bio whose completion is deferred to
+ * end_io_workqueue.  btree_submit_bio_hook() allocates it and hangs it on
+ * bio->bi_private; btrfs_end_io_csum() restores the saved fields and frees it.
+ */
+struct end_io_wq {
+       struct bio *bio;                /* the bio being deferred */
+       bio_end_io_t *end_io;           /* original bio->bi_end_io */
+       void *private;                  /* original bio->bi_private */
+       struct btrfs_fs_info *info;     /* owner of the work list and lock */
+       int error;                      /* err recorded by end_workqueue_bio() */
+       struct list_head list;          /* link on info->end_io_work_list */
+};
 
 struct extent_map *btree_get_extent(struct inode *inode, struct page *page,
                                    size_t page_offset, u64 start, u64 len,
@@ -219,11 +230,108 @@ static int btree_writepage_io_hook(struct page *page, u64 start, u64 end)
        return 0;
 }
 
+/*
+ * Read-completion hook for btree pages: checksum the tree block that
+ * starts on @page.
+ *
+ * Pages whose page->private is zero or EXTENT_PAGE_PRIVATE are skipped.
+ * Otherwise page->private >> 2 is taken as the length of the block that
+ * begins at @start, an extent buffer is built around @page, the remaining
+ * pages are read in, and csum_tree_block() is run on the result.
+ *
+ * @page:  page that just finished reading
+ * @start: byte offset of the range that was read
+ * @end:   unused here
+ * @state: unused here
+ *
+ * Always returns 0.
+ */
+int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
+                              struct extent_state *state)
+{
+       struct extent_io_tree *tree;
+       u64 found_start;
+       unsigned long len;
+       struct extent_buffer *eb;
+       struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
+       int ret;
+
+       tree = &BTRFS_I(page->mapping->host)->io_tree;
+       if (page->private == EXTENT_PAGE_PRIVATE)
+               goto out;
+       if (!page->private)
+               goto out;
+       len = page->private >> 2;
+       if (len == 0) {
+               WARN_ON(1);
+       }
+       eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+       if (!eb) {
+               /* allocation failed: nothing to verify with, don't oops */
+               WARN_ON(1);
+               goto out;
+       }
+       /* pull in the pages after the first one, waiting for the I/O */
+       read_extent_buffer_pages(tree, eb, start + PAGE_CACHE_SIZE, 1,
+                                btree_get_extent);
+       btrfs_clear_buffer_defrag(eb);
+       found_start = btrfs_header_bytenr(eb);
+       if (found_start != start) {
+               printk("warning: eb start incorrect %Lu buffer %Lu len %lu\n",
+                      start, found_start, len);
+               WARN_ON(1);
+               goto err;
+       }
+       if (eb->first_page != page) {
+               printk("bad first page %lu %lu\n", eb->first_page->index,
+                      page->index);
+               WARN_ON(1);
+               goto err;
+       }
+
+       /*
+        * NOTE(review): the checksum result is not propagated to the caller;
+        * confirm whether a mismatch should fail the read.
+        */
+       ret = csum_tree_block(root, eb, 1);
+
+       release_extent_buffer_tail_pages(eb);
+err:
+       free_extent_buffer(eb);
+out:
+       return 0;
+}
+
+/*
+ * bi_end_io replacement installed by btree_submit_bio_hook() for reads.
+ * Instead of completing the bio here, record the error, put the bio's
+ * end_io_wq on fs_info->end_io_work_list and queue fs_info->end_io_work
+ * on end_io_workqueue.
+ *
+ * The signature and return value depend on the kernel version, as selected
+ * by the #if blocks below.
+ */
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
+static void end_workqueue_bio(struct bio *bio, int err)
+#else
+static int end_workqueue_bio(struct bio *bio,
+                                  unsigned int bytes_done, int err)
+#endif
+{
+       struct end_io_wq *end_io_wq = bio->bi_private;
+       struct btrfs_fs_info *fs_info;
+       unsigned long flags;
+
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
+       /* presumably a partial completion: wait until bi_size reaches zero */
+       if (bio->bi_size)
+               return 1;
+#endif
+
+       fs_info = end_io_wq->info;
+       /* irqsave variant: the list is shared with btrfs_end_io_csum() */
+       spin_lock_irqsave(&fs_info->end_io_work_lock, flags);
+       end_io_wq->error = err;
+       list_add_tail(&end_io_wq->list, &fs_info->end_io_work_list);
+       spin_unlock_irqrestore(&fs_info->end_io_work_lock, flags);
+       queue_work(end_io_workqueue, &fs_info->end_io_work);
+
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
+       return 0;
+#endif
+}
+
 static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct end_io_wq *end_io_wq;
        u64 offset;
        offset = bio->bi_sector << 9;
+
+       if (rw & (1 << BIO_RW)) {
+               return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio);
+       }
+
+       end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS);
+       if (!end_io_wq)
+               return -ENOMEM;
+
+       end_io_wq->private = bio->bi_private;
+       end_io_wq->end_io = bio->bi_end_io;
+       end_io_wq->info = root->fs_info;
+       end_io_wq->error = 0;
+       end_io_wq->bio = bio;
+
+       bio->bi_private = end_io_wq;
+       bio->bi_end_io = end_workqueue_bio;
+
        if (offset == BTRFS_SUPER_INFO_OFFSET) {
                bio->bi_bdev = root->fs_info->sb->s_bdev;
                submit_bio(rw, bio);
@@ -363,36 +471,7 @@ static int close_all_devices(struct btrfs_fs_info *fs_info)
+/*
+ * Checksumming no longer happens here; this now only reports
+ * btrfs_buffer_uptodate() for @buf.  @root is unused.
+ *
+ * NOTE(review): the removed implementation returned 0 when the buffer was
+ * already checksummed; confirm remaining callers expect the return
+ * convention of btrfs_buffer_uptodate() instead.
+ */
 int btrfs_verify_block_csum(struct btrfs_root *root,
                             struct extent_buffer *buf)
 {
-       struct extent_io_tree *io_tree;
-       u64 end;
-       int ret;
-
-       io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
-       if (buf->flags & EXTENT_CSUM)
-               return 0;
-
-       end = min_t(u64, buf->len, PAGE_CACHE_SIZE);
-       end = buf->start + end - 1;
-       if (test_range_bit(io_tree, buf->start, end, EXTENT_CSUM, 1)) {
-               buf->flags |= EXTENT_CSUM;
-               return 0;
-       }
-       lock_extent(io_tree, buf->start, end, GFP_NOFS);
-
-       if (test_range_bit(io_tree, buf->start, end, EXTENT_CSUM, 1)) {
-               buf->flags |= EXTENT_CSUM;
-               ret = 0;
-               goto out_unlock;
-       }
-WARN_ON(buf->flags & EXTENT_CSUM);
-
-       ret = csum_tree_block(root, buf, 1);
-       set_extent_bits(io_tree, buf->start, end, EXTENT_CSUM, GFP_NOFS);
-       buf->flags |= EXTENT_CSUM;
-
-out_unlock:
-       unlock_extent(io_tree, buf->start, end, GFP_NOFS);
-       return ret;
+       return btrfs_buffer_uptodate(buf);
 }
 
 struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
@@ -430,11 +509,15 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
        buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
        if (!buf)
                return NULL;
-       read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, buf, 0, 1,
-                                btree_get_extent);
 
-       ret = btrfs_verify_block_csum(root, buf);
+       ret = read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, buf, 0,
+                                      1, btree_get_extent);
+
+       if (ret == 0) {
+               buf->flags |= EXTENT_UPTODATE;
+       }
        return buf;
+
 }
 
 int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -724,6 +807,99 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
        return 0;
 }
 
+/*
+ * Decide whether the tree block touched by @bio can be checksummed yet.
+ *
+ * Walks the bio's segments.  A page whose page->private is neither zero
+ * nor EXTENT_PAGE_PRIVATE restarts the accounting: its private >> 2 is
+ * taken as the block length and its offset as the block start.  Every
+ * other segment just adds its length to the running total.
+ *
+ * Returns 1 when the bio itself covers the whole block (or contains no
+ * such page).  Otherwise returns whatever extent_range_uptodate() reports
+ * for the part of the block that lies beyond this bio.
+ */
+static int bio_ready_for_csum(struct bio *bio)
+{
+       u64 length = 0;
+       u64 buf_len = 0;
+       u64 start = 0;
+       struct page *page;
+       struct extent_io_tree *io_tree = NULL;
+       struct bio_vec *bvec;
+       int i;
+
+       bio_for_each_segment(bvec, bio, i) {
+               page = bvec->bv_page;
+               if (page->private == EXTENT_PAGE_PRIVATE) {
+                       length += bvec->bv_len;
+                       continue;
+               }
+               if (!page->private) {
+                       length += bvec->bv_len;
+                       continue;
+               }
+               length = bvec->bv_len;
+               buf_len = page->private >> 2;
+               start = page_offset(page) + bvec->bv_offset;
+               io_tree = &BTRFS_I(page->mapping->host)->io_tree;
+       }
+       /* are we fully contained in this bio? */
+       if (buf_len <= length)
+               return 1;
+
+       /* the tail of the block arrives via other bios; ask if it is in yet */
+       return extent_range_uptodate(io_tree, start + length,
+                                    start + buf_len - 1);
+}
+
+/*
+ * Work handler for fs_info->end_io_work.  Drains
+ * fs_info->end_io_work_list: for every queued bio that
+ * bio_ready_for_csum() accepts, restore the bi_private/bi_end_io saved in
+ * its end_io_wq, free the end_io_wq and call bio_endio() with the recorded
+ * error.  The argument type depends on the kernel version.
+ */
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
+void btrfs_end_io_csum(void *p)
+#else
+void btrfs_end_io_csum(struct work_struct *work)
+#endif
+{
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
+       struct btrfs_fs_info *fs_info = p;
+#else
+       struct btrfs_fs_info *fs_info = container_of(work,
+                                                    struct btrfs_fs_info,
+                                                    end_io_work);
+#endif
+       unsigned long flags;
+       struct end_io_wq *end_io_wq;
+       struct bio *bio;
+       struct list_head *next;
+       int error;
+       int was_empty;
+
+       while(1) {
+               /* pop the head of the list under the lock */
+               spin_lock_irqsave(&fs_info->end_io_work_lock, flags);
+               if (list_empty(&fs_info->end_io_work_list)) {
+                       spin_unlock_irqrestore(&fs_info->end_io_work_lock,
+                                              flags);
+                       return;
+               }
+               next = fs_info->end_io_work_list.next;
+               list_del(next);
+               spin_unlock_irqrestore(&fs_info->end_io_work_lock, flags);
+
+               end_io_wq = list_entry(next, struct end_io_wq, list);
+
+               bio = end_io_wq->bio;
+               if (!bio_ready_for_csum(bio)) {
+                       /*
+                        * Not ready: put it back at the tail.  If it was the
+                        * only entry, stop; otherwise try the next one.
+                        * NOTE(review): nothing here requeues the work, and
+                        * the loop keeps cycling while every entry is
+                        * not ready -- confirm a later end_workqueue_bio()
+                        * call always arrives to make progress.
+                        */
+                       spin_lock_irqsave(&fs_info->end_io_work_lock, flags);
+                       was_empty = list_empty(&fs_info->end_io_work_list);
+                       list_add_tail(&end_io_wq->list,
+                                     &fs_info->end_io_work_list);
+                       spin_unlock_irqrestore(&fs_info->end_io_work_lock,
+                                              flags);
+                       if (was_empty)
+                               return;
+                       continue;
+               }
+               /* copy out what we need before freeing the wrapper */
+               error = end_io_wq->error;
+               bio->bi_private = end_io_wq->private;
+               bio->bi_end_io = end_io_wq->end_io;
+               kfree(end_io_wq);
+               bio_endio(bio, error);
+       }
+}
+
+
 struct btrfs_root *open_ctree(struct super_block *sb,
                              struct btrfs_fs_devices *fs_devices)
 {
@@ -750,11 +926,16 @@ struct btrfs_root *open_ctree(struct super_block *sb,
                err = -ENOMEM;
                goto fail;
        }
+       end_io_workqueue = create_workqueue("btrfs-end-io");
+       BUG_ON(!end_io_workqueue);
+
        INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS);
        INIT_LIST_HEAD(&fs_info->trans_list);
        INIT_LIST_HEAD(&fs_info->dead_roots);
        INIT_LIST_HEAD(&fs_info->hashers);
+       INIT_LIST_HEAD(&fs_info->end_io_work_list);
        spin_lock_init(&fs_info->hash_lock);
+       spin_lock_init(&fs_info->end_io_work_lock);
        spin_lock_init(&fs_info->delalloc_lock);
        spin_lock_init(&fs_info->new_trans_lock);
 
@@ -799,6 +980,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
                             fs_info->btree_inode->i_mapping, GFP_NOFS);
        fs_info->do_barriers = 1;
 
+       INIT_WORK(&fs_info->end_io_work, btrfs_end_io_csum);
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
        INIT_WORK(&fs_info->trans_work, btrfs_transaction_cleaner, fs_info);
 #else
@@ -1044,6 +1226,8 @@ int close_ctree(struct btrfs_root *root)
        extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree);
 
        truncate_inode_pages(fs_info->btree_inode->i_mapping, 0);
+       flush_workqueue(end_io_workqueue);
+       destroy_workqueue(end_io_workqueue);
 
        iput(fs_info->btree_inode);
 #if 0
@@ -1171,12 +1355,18 @@ int btrfs_read_buffer(struct extent_buffer *buf)
 {
        struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
        struct inode *btree_inode = root->fs_info->btree_inode;
-       return read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
+       int ret;
+       ret = read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
                                        buf, 0, 1, btree_get_extent);
+       if (ret == 0) {
+               buf->flags |= EXTENT_UPTODATE;
+       }
+       return ret;
 }
 
 static struct extent_io_ops btree_extent_io_ops = {
        .writepage_io_hook = btree_writepage_io_hook,
+       .readpage_end_io_hook = btree_readpage_end_io_hook,
        .submit_bio_hook = btree_submit_bio_hook,
        /* note we're sharing with inode.c for the merge bio hook */
        .merge_bio_hook = btrfs_merge_bio_hook,
index 0cb7428..283b08a 100644 (file)
@@ -1898,10 +1898,6 @@ struct extent_buffer *__btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 
        set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
                         buf->start + buf->len - 1, GFP_NOFS);
-       set_extent_bits(&BTRFS_I(root->fs_info->btree_inode)->io_tree,
-                       buf->start, buf->start + buf->len - 1,
-                       EXTENT_CSUM, GFP_NOFS);
-       buf->flags |= EXTENT_CSUM;
        if (!btrfs_test_opt(root, SSD))
                btrfs_set_buffer_defrag(buf);
        trans->blocks_used++;
index 13cc236..cfc383c 100644 (file)
@@ -2592,6 +2592,22 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb,
        return p;
 }
 
+/*
+ * Drop one page cache reference on every page of @eb except the first.
+ * A single-page buffer is left untouched.  Always returns 0.
+ */
+int release_extent_buffer_tail_pages(struct extent_buffer *eb)
+{
+       unsigned long nr = num_extent_pages(eb->start, eb->len);
+       unsigned long idx = 1;
+       struct page *tail;
+
+       /* starting at index 1 skips the head page */
+       while (idx < nr) {
+               tail = extent_buffer_page(eb, idx);
+               page_cache_release(tail);
+               idx++;
+       }
+       return 0;
+}
+
+
 int invalidate_extent_lru(struct extent_io_tree *tree, u64 start,
                          unsigned long len)
 {
@@ -2609,9 +2625,6 @@ int invalidate_extent_lru(struct extent_io_tree *tree, u64 start,
                if (eb->start <= start && eb->start + eb->len > start) {
                        eb->flags &= ~EXTENT_UPTODATE;
                }
-               if (eb->start == start) {
-                       eb->flags &= ~EXTENT_CSUM;
-               }
                cur = cur->next;
        } while (cur != lru);
 out:
@@ -2682,7 +2695,6 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
                page_cache_get(page0);
                mark_page_accessed(page0);
                set_page_extent_mapped(page0);
-               WARN_ON(!PageUptodate(page0));
                set_page_extent_head(page0, len);
        } else {
                i = 0;
@@ -2933,13 +2945,39 @@ int set_extent_buffer_uptodate(struct extent_io_tree *tree,
 }
 EXPORT_SYMBOL(set_extent_buffer_uptodate);
 
+/*
+ * Report whether the byte range [@start, @end] of @tree is up to date.
+ *
+ * Returns 1 when the whole range has EXTENT_UPTODATE set in the tree, or
+ * when no cached page in the range fails PageUptodate().  Returns 0 as
+ * soon as a cached page that is not up to date is found.
+ */
+int extent_range_uptodate(struct extent_io_tree *tree,
+                         u64 start, u64 end)
+{
+       struct page *page;
+       int ret;
+       int pg_uptodate = 1;
+       int uptodate;
+       unsigned long index;
+
+       ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1);
+       if (ret)
+               return 1;
+       while(start <= end) {
+               index = start >> PAGE_CACHE_SHIFT;
+               page = find_get_page(tree->mapping, index);
+               /*
+                * find_get_page() returns NULL when nothing is cached at
+                * this index; do not dereference it.  Answering 1 keeps
+                * bio_ready_for_csum() from waiting on a page that is not
+                * in the cache.
+                * NOTE(review): confirm 1 is the wanted answer here.
+                */
+               if (!page)
+                       return 1;
+               uptodate = PageUptodate(page);
+               page_cache_release(page);
+               if (!uptodate) {
+                       pg_uptodate = 0;
+                       break;
+               }
+               start += PAGE_CACHE_SIZE;
+       }
+       return pg_uptodate;
+}
+
 int extent_buffer_uptodate(struct extent_io_tree *tree,
-                            struct extent_buffer *eb)
+                          struct extent_buffer *eb)
 {
        int ret = 0;
        int ret2;
-       int num_pages;
-       int i;
+       unsigned long num_pages;
+       unsigned long i;
        struct page *page;
        int pg_uptodate = 1;
 
@@ -2975,13 +3013,16 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
        struct page *page;
        int err;
        int ret = 0;
+       int locked_pages = 0;
+       int all_uptodate = 1;
+       int inc_all_pages = 0;
        unsigned long num_pages;
        struct bio *bio = NULL;
 
        if (eb->flags & EXTENT_UPTODATE)
                return 0;
 
-       if (0 && test_range_bit(tree, eb->start, eb->start + eb->len - 1,
+       if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
                           EXTENT_UPTODATE, 1)) {
                return 0;
        }
@@ -2997,17 +3038,30 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
        num_pages = num_extent_pages(eb->start, eb->len);
        for (i = start_i; i < num_pages; i++) {
                page = extent_buffer_page(eb, i);
-               if (PageUptodate(page)) {
-                       continue;
-               }
                if (!wait) {
-                       if (TestSetPageLocked(page)) {
-                               continue;
-                       }
+                       if (TestSetPageLocked(page))
+                               goto unlock_exit;
                } else {
                        lock_page(page);
                }
+               locked_pages++;
+               if (!PageUptodate(page)) {
+                       all_uptodate = 0;
+               }
+       }
+       if (all_uptodate) {
+               if (start_i == 0)
+                       eb->flags |= EXTENT_UPTODATE;
+               goto unlock_exit;
+       }
+
+       for (i = start_i; i < num_pages; i++) {
+               page = extent_buffer_page(eb, i);
+               if (inc_all_pages)
+                       page_cache_get(page);
                if (!PageUptodate(page)) {
+                       if (start_i == 0)
+                               inc_all_pages = 1;
                        err = __extent_read_full_page(tree, page,
                                                      get_extent, &bio);
                        if (err) {
@@ -3034,6 +3088,16 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
        if (!ret)
                eb->flags |= EXTENT_UPTODATE;
        return ret;
+
+unlock_exit:
+       i = start_i;
+       while(locked_pages > 0) {
+               page = extent_buffer_page(eb, i);
+               i++;
+               unlock_page(page);
+               locked_pages--;
+       }
+       return ret;
 }
 EXPORT_SYMBOL(read_extent_buffer_pages);
 
@@ -3048,7 +3112,6 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
        char *dst = (char *)dstv;
        size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
        unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
-       unsigned long num_pages = num_extent_pages(eb->start, eb->len);
 
        WARN_ON(start > eb->len);
        WARN_ON(start + len > eb->start + eb->len);
@@ -3057,11 +3120,6 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
 
        while(len > 0) {
                page = extent_buffer_page(eb, i);
-               if (!PageUptodate(page)) {
-                       printk("page %lu not up to date i %lu, total %lu, len %lu\n", page->index, i, num_pages, eb->len);
-                       WARN_ON(1);
-               }
-               WARN_ON(!PageUptodate(page));
 
                cur = min(len, (PAGE_CACHE_SIZE - offset));
                kaddr = kmap_atomic(page, KM_USER1);
@@ -3105,7 +3163,6 @@ printk("bad mapping eb start %Lu len %lu, wanted %lu %lu\n", eb->start, eb->len,
        }
 
        p = extent_buffer_page(eb, i);
-       WARN_ON(!PageUptodate(p));
        kaddr = kmap_atomic(p, km);
        *token = kaddr;
        *map = kaddr + offset;
@@ -3165,7 +3222,6 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
 
        while(len > 0) {
                page = extent_buffer_page(eb, i);
-               WARN_ON(!PageUptodate(page));
 
                cur = min(len, (PAGE_CACHE_SIZE - offset));
 
index d9f5bc4..9d2991d 100644 (file)
@@ -13,7 +13,6 @@
 #define EXTENT_DEFRAG (1 << 6)
 #define EXTENT_DEFRAG_DONE (1 << 7)
 #define EXTENT_BUFFER_FILLED (1 << 8)
-#define EXTENT_CSUM (1 << 9)
 #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
 
 /*
@@ -218,4 +217,7 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
 void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km);
 int invalidate_extent_lru(struct extent_io_tree *tree, u64 start,
                          unsigned long len);
+int release_extent_buffer_tail_pages(struct extent_buffer *eb);
+int extent_range_uptodate(struct extent_io_tree *tree,
+                         u64 start, u64 end);
 #endif
index 5e9f692..1ed179c 100644 (file)
@@ -843,7 +843,7 @@ void btrfs_transaction_flush_work(struct btrfs_root *root)
 
+/*
+ * Create the workqueue stored in trans_wq (now named "btrfs-transaction").
+ * NOTE(review): the create_workqueue() result is not checked here, unlike
+ * end_io_workqueue in open_ctree(), which is guarded by BUG_ON.
+ */
 void __init btrfs_init_transaction_sys(void)
 {
-       trans_wq = create_workqueue("btrfs");
+       trans_wq = create_workqueue("btrfs-transaction");
 }
 
 void btrfs_exit_transaction_sys(void)