Btrfs: implement repair function when direct read fails
authorMiao Xie <miaox@cn.fujitsu.com>
Fri, 12 Sep 2014 10:44:03 +0000 (18:44 +0800)
committerChris Mason <clm@fb.com>
Wed, 17 Sep 2014 20:39:01 +0000 (13:39 -0700)
This patch implement data repair function when direct read fails.

The detail of the implementation is:
- When we find the data is not right, we try to read the data from the other
  mirror.
- When the io on the mirror ends, we will insert the endio work into the
  dedicated btrfs workqueue, not common read endio workqueue, because the
  original endio work is still blocked in the btrfs endio workqueue, if we
  insert the endio work of the io on the mirror into that workqueue, deadlock
  would happen.
- After we get right data, we write it back to the corrupted mirror.
- And if the data on the new mirror is still corrupted, we will try next
  mirror until we read right data or all the mirrors are traversed.
- After the above work, we set the uptodate flag according to the result.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
fs/btrfs/async-thread.c
fs/btrfs/async-thread.h
fs/btrfs/btrfs_inode.h
fs/btrfs/ctree.h
fs/btrfs/disk-io.c
fs/btrfs/disk-io.h
fs/btrfs/extent_io.c
fs/btrfs/extent_io.h
fs/btrfs/inode.c

index fbd76ded9a34b3260a5e794115fff0312b3c6e08..2da0a66790ba841bae9ed0d9d24ef2359cb1657c 100644 (file)
@@ -74,6 +74,7 @@ BTRFS_WORK_HELPER(endio_helper);
 BTRFS_WORK_HELPER(endio_meta_helper);
 BTRFS_WORK_HELPER(endio_meta_write_helper);
 BTRFS_WORK_HELPER(endio_raid56_helper);
+BTRFS_WORK_HELPER(endio_repair_helper);
 BTRFS_WORK_HELPER(rmw_helper);
 BTRFS_WORK_HELPER(endio_write_helper);
 BTRFS_WORK_HELPER(freespace_write_helper);
index e9e31c94758fd6ddea5cbd458d5aca04a27e9a73..e386c29ef1f62c559a184e4ae31884c2bcc425ba 100644 (file)
@@ -53,6 +53,7 @@ BTRFS_WORK_HELPER_PROTO(endio_helper);
 BTRFS_WORK_HELPER_PROTO(endio_meta_helper);
 BTRFS_WORK_HELPER_PROTO(endio_meta_write_helper);
 BTRFS_WORK_HELPER_PROTO(endio_raid56_helper);
+BTRFS_WORK_HELPER_PROTO(endio_repair_helper);
 BTRFS_WORK_HELPER_PROTO(rmw_helper);
 BTRFS_WORK_HELPER_PROTO(endio_write_helper);
 BTRFS_WORK_HELPER_PROTO(freespace_write_helper);
index 4d309471294e4ca4c28026291097febbe14ffeff..7a7521c87c88949f82e712013947e09ec9961397 100644 (file)
@@ -271,7 +271,7 @@ struct btrfs_dio_private {
         * The original bio may be splited to several sub-bios, this is
         * done during endio of sub-bios
         */
-       int (*subio_endio)(struct inode *, struct btrfs_io_bio *);
+       int (*subio_endio)(struct inode *, struct btrfs_io_bio *, int);
 };
 
 /*
index 0f3e4f7e454a442196703c26b3cf7806c80bdc35..51ff3f8dbab9fcc8ea5b1ec39e03485b795145b9 100644 (file)
@@ -1538,6 +1538,7 @@ struct btrfs_fs_info {
        struct btrfs_workqueue *endio_workers;
        struct btrfs_workqueue *endio_meta_workers;
        struct btrfs_workqueue *endio_raid56_workers;
+       struct btrfs_workqueue *endio_repair_workers;
        struct btrfs_workqueue *rmw_workers;
        struct btrfs_workqueue *endio_meta_write_workers;
        struct btrfs_workqueue *endio_write_workers;
index a224fb9b34a331b9a87fb896710f2ee66474ba81..48794f951427488b32325f54d501f5e30b8b797f 100644 (file)
@@ -713,7 +713,11 @@ static void end_workqueue_bio(struct bio *bio, int err)
                        func = btrfs_endio_write_helper;
                }
        } else {
-               if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
+               if (unlikely(end_io_wq->metadata ==
+                            BTRFS_WQ_ENDIO_DIO_REPAIR)) {
+                       wq = fs_info->endio_repair_workers;
+                       func = btrfs_endio_repair_helper;
+               } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
                        wq = fs_info->endio_raid56_workers;
                        func = btrfs_endio_raid56_helper;
                } else if (end_io_wq->metadata) {
@@ -741,6 +745,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
                        int metadata)
 {
        struct end_io_wq *end_io_wq;
+
        end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS);
        if (!end_io_wq)
                return -ENOMEM;
@@ -2055,6 +2060,7 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
        btrfs_destroy_workqueue(fs_info->endio_workers);
        btrfs_destroy_workqueue(fs_info->endio_meta_workers);
        btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
+       btrfs_destroy_workqueue(fs_info->endio_repair_workers);
        btrfs_destroy_workqueue(fs_info->rmw_workers);
        btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
        btrfs_destroy_workqueue(fs_info->endio_write_workers);
@@ -2572,6 +2578,8 @@ int open_ctree(struct super_block *sb,
                btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2);
        fs_info->endio_raid56_workers =
                btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4);
+       fs_info->endio_repair_workers =
+               btrfs_alloc_workqueue("endio-repair", flags, 1, 0);
        fs_info->rmw_workers =
                btrfs_alloc_workqueue("rmw", flags, max_active, 2);
        fs_info->endio_write_workers =
@@ -2593,6 +2601,7 @@ int open_ctree(struct super_block *sb,
              fs_info->submit_workers && fs_info->flush_workers &&
              fs_info->endio_workers && fs_info->endio_meta_workers &&
              fs_info->endio_meta_write_workers &&
+             fs_info->endio_repair_workers &&
              fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
              fs_info->endio_freespace_worker && fs_info->rmw_workers &&
              fs_info->caching_workers && fs_info->readahead_workers &&
index 52a17db700fc93963810c7fa2b15d2f4c01e1fd6..14d06ee1e143aa9b0337fbda2b2df398780bdcb2 100644 (file)
@@ -30,6 +30,7 @@ enum {
        BTRFS_WQ_ENDIO_METADATA = 1,
        BTRFS_WQ_ENDIO_FREE_SPACE = 2,
        BTRFS_WQ_ENDIO_RAID56 = 3,
+       BTRFS_WQ_ENDIO_DIO_REPAIR = 4,
 };
 
 static inline u64 btrfs_sb_offset(int mirror)
index 05533c99f89d57c1707f72e82c244237cb1ebd95..9e2ef27672e55e572c0ea099d200494d8be2274c 100644 (file)
@@ -1962,7 +1962,7 @@ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
                SetPageUptodate(page);
 }
 
-static int free_io_failure(struct inode *inode, struct io_failure_record *rec)
+int free_io_failure(struct inode *inode, struct io_failure_record *rec)
 {
        int ret;
        int err = 0;
@@ -2081,8 +2081,8 @@ int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
  * each time an IO finishes, we do a fast check in the IO failure tree
  * to see if we need to process or clean up an io_failure_record
  */
-static int clean_io_failure(struct inode *inode, u64 start,
-                           struct page *page, unsigned int pg_offset)
+int clean_io_failure(struct inode *inode, u64 start, struct page *page,
+                    unsigned int pg_offset)
 {
        u64 private;
        u64 private_failure;
@@ -2291,7 +2291,7 @@ int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
 struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
                                    struct io_failure_record *failrec,
                                    struct page *page, int pg_offset, int icsum,
-                                   bio_end_io_t *endio_func)
+                                   bio_end_io_t *endio_func, void *data)
 {
        struct bio *bio;
        struct btrfs_io_bio *btrfs_failed_bio;
@@ -2305,6 +2305,7 @@ struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
        bio->bi_iter.bi_sector = failrec->logical >> 9;
        bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
        bio->bi_iter.bi_size = 0;
+       bio->bi_private = data;
 
        btrfs_failed_bio = btrfs_io_bio(failed_bio);
        if (btrfs_failed_bio->csum) {
@@ -2362,7 +2363,8 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
        phy_offset >>= inode->i_sb->s_blocksize_bits;
        bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
                                      start - page_offset(page),
-                                     (int)phy_offset, failed_bio->bi_end_io);
+                                     (int)phy_offset, failed_bio->bi_end_io,
+                                     NULL);
        if (!bio) {
                free_io_failure(inode, failrec);
                return -EIO;
index bf0597f3a9e70bb165ff6f09de6209a259897d7f..176a4b1ed52080e015956ec9e22ae62448fe26a7 100644 (file)
@@ -341,6 +341,8 @@ struct btrfs_fs_info;
 int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
                      struct page *page, unsigned int pg_offset,
                      int mirror_num);
+int clean_io_failure(struct inode *inode, u64 start, struct page *page,
+                    unsigned int pg_offset);
 int end_extent_writepage(struct page *page, int err, u64 start, u64 end);
 int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
                         int mirror_num);
@@ -371,7 +373,8 @@ int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
 struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
                                    struct io_failure_record *failrec,
                                    struct page *page, int pg_offset, int icsum,
-                                   bio_end_io_t *endio_func);
+                                   bio_end_io_t *endio_func, void *data);
+int free_io_failure(struct inode *inode, struct io_failure_record *rec);
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 noinline u64 find_lock_delalloc_range(struct inode *inode,
                                      struct extent_io_tree *tree,
index 09d8c5ee88695ce9e91de079bf0fa078b06b590e..c3c3269a9e080b152ef723ef56ba4fe3735ef614 100644 (file)
@@ -7242,30 +7242,267 @@ unlock_err:
        return ret;
 }
 
-static int btrfs_subio_endio_read(struct inode *inode,
-                                 struct btrfs_io_bio *io_bio)
+static inline int submit_dio_repair_bio(struct inode *inode, struct bio *bio,
+                                       int rw, int mirror_num)
+{
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       int ret;
+
+       BUG_ON(rw & REQ_WRITE);
+
+       bio_get(bio);
+
+       ret = btrfs_bio_wq_end_io(root->fs_info, bio,
+                                 BTRFS_WQ_ENDIO_DIO_REPAIR);
+       if (ret)
+               goto err;
+
+       ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
+err:
+       bio_put(bio);
+       return ret;
+}
+
+static int btrfs_check_dio_repairable(struct inode *inode,
+                                     struct bio *failed_bio,
+                                     struct io_failure_record *failrec,
+                                     int failed_mirror)
+{
+       int num_copies;
+
+       num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
+                                     failrec->logical, failrec->len);
+       if (num_copies == 1) {
+               /*
+                * we only have a single copy of the data, so don't bother with
+                * all the retry and error correction code that follows. no
+                * matter what the error is, it is very likely to persist.
+                */
+               pr_debug("Check DIO Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n",
+                        num_copies, failrec->this_mirror, failed_mirror);
+               return 0;
+       }
+
+       failrec->failed_mirror = failed_mirror;
+       failrec->this_mirror++;
+       if (failrec->this_mirror == failed_mirror)
+               failrec->this_mirror++;
+
+       if (failrec->this_mirror > num_copies) {
+               pr_debug("Check DIO Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n",
+                        num_copies, failrec->this_mirror, failed_mirror);
+               return 0;
+       }
+
+       return 1;
+}
+
+static int dio_read_error(struct inode *inode, struct bio *failed_bio,
+                         struct page *page, u64 start, u64 end,
+                         int failed_mirror, bio_end_io_t *repair_endio,
+                         void *repair_arg)
+{
+       struct io_failure_record *failrec;
+       struct bio *bio;
+       int isector;
+       int read_mode;
+       int ret;
+
+       BUG_ON(failed_bio->bi_rw & REQ_WRITE);
+
+       ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
+       if (ret)
+               return ret;
+
+       ret = btrfs_check_dio_repairable(inode, failed_bio, failrec,
+                                        failed_mirror);
+       if (!ret) {
+               free_io_failure(inode, failrec);
+               return -EIO;
+       }
+
+       if (failed_bio->bi_vcnt > 1)
+               read_mode = READ_SYNC | REQ_FAILFAST_DEV;
+       else
+               read_mode = READ_SYNC;
+
+       isector = start - btrfs_io_bio(failed_bio)->logical;
+       isector >>= inode->i_sb->s_blocksize_bits;
+       bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
+                                     0, isector, repair_endio, repair_arg);
+       if (!bio) {
+               free_io_failure(inode, failrec);
+               return -EIO;
+       }
+
+       btrfs_debug(BTRFS_I(inode)->root->fs_info,
+                   "Repair DIO Read Error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d\n",
+                   read_mode, failrec->this_mirror, failrec->in_validation);
+
+       ret = submit_dio_repair_bio(inode, bio, read_mode,
+                                   failrec->this_mirror);
+       if (ret) {
+               free_io_failure(inode, failrec);
+               bio_put(bio);
+       }
+
+       return ret;
+}
+
+struct btrfs_retry_complete {
+       struct completion done;
+       struct inode *inode;
+       u64 start;
+       int uptodate;
+};
+
+static void btrfs_retry_endio_nocsum(struct bio *bio, int err)
+{
+       struct btrfs_retry_complete *done = bio->bi_private;
+       struct bio_vec *bvec;
+       int i;
+
+       if (err)
+               goto end;
+
+       done->uptodate = 1;
+       bio_for_each_segment_all(bvec, bio, i)
+               clean_io_failure(done->inode, done->start, bvec->bv_page, 0);
+end:
+       complete(&done->done);
+       bio_put(bio);
+}
+
+static int __btrfs_correct_data_nocsum(struct inode *inode,
+                                      struct btrfs_io_bio *io_bio)
 {
        struct bio_vec *bvec;
+       struct btrfs_retry_complete done;
        u64 start;
        int i;
        int ret;
-       int err = 0;
 
-       if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
-               return 0;
+       start = io_bio->logical;
+       done.inode = inode;
+
+       bio_for_each_segment_all(bvec, &io_bio->bio, i) {
+try_again:
+               done.uptodate = 0;
+               done.start = start;
+               init_completion(&done.done);
+
+               ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start,
+                                    start + bvec->bv_len - 1,
+                                    io_bio->mirror_num,
+                                    btrfs_retry_endio_nocsum, &done);
+               if (ret)
+                       return ret;
+
+               wait_for_completion(&done.done);
+
+               if (!done.uptodate) {
+                       /* We might have another mirror, so try again */
+                       goto try_again;
+               }
+
+               start += bvec->bv_len;
+       }
+
+       return 0;
+}
+
+static void btrfs_retry_endio(struct bio *bio, int err)
+{
+       struct btrfs_retry_complete *done = bio->bi_private;
+       struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+       struct bio_vec *bvec;
+       int uptodate;
+       int ret;
+       int i;
+
+       if (err)
+               goto end;
+
+       uptodate = 1;
+       bio_for_each_segment_all(bvec, bio, i) {
+               ret = __readpage_endio_check(done->inode, io_bio, i,
+                                            bvec->bv_page, 0,
+                                            done->start, bvec->bv_len);
+               if (!ret)
+                       clean_io_failure(done->inode, done->start,
+                                        bvec->bv_page, 0);
+               else
+                       uptodate = 0;
+       }
+
+       done->uptodate = uptodate;
+end:
+       complete(&done->done);
+       bio_put(bio);
+}
 
+static int __btrfs_subio_endio_read(struct inode *inode,
+                                   struct btrfs_io_bio *io_bio, int err)
+{
+       struct bio_vec *bvec;
+       struct btrfs_retry_complete done;
+       u64 start;
+       u64 offset = 0;
+       int i;
+       int ret;
+
+       err = 0;
        start = io_bio->logical;
+       done.inode = inode;
+
        bio_for_each_segment_all(bvec, &io_bio->bio, i) {
                ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
                                             0, start, bvec->bv_len);
-               if (ret)
-                       err = -EIO;
+               if (likely(!ret))
+                       goto next;
+try_again:
+               done.uptodate = 0;
+               done.start = start;
+               init_completion(&done.done);
+
+               ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start,
+                                    start + bvec->bv_len - 1,
+                                    io_bio->mirror_num,
+                                    btrfs_retry_endio, &done);
+               if (ret) {
+                       err = ret;
+                       goto next;
+               }
+
+               wait_for_completion(&done.done);
+
+               if (!done.uptodate) {
+                       /* We might have another mirror, so try again */
+                       goto try_again;
+               }
+next:
+               offset += bvec->bv_len;
                start += bvec->bv_len;
        }
 
        return err;
 }
 
+static int btrfs_subio_endio_read(struct inode *inode,
+                                 struct btrfs_io_bio *io_bio, int err)
+{
+       bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+
+       if (skip_csum) {
+               if (unlikely(err))
+                       return __btrfs_correct_data_nocsum(inode, io_bio);
+               else
+                       return 0;
+       } else {
+               return __btrfs_subio_endio_read(inode, io_bio, err);
+       }
+}
+
 static void btrfs_endio_direct_read(struct bio *bio, int err)
 {
        struct btrfs_dio_private *dip = bio->bi_private;
@@ -7273,8 +7510,8 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
        struct bio *dio_bio;
        struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
 
-       if (!err && (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED))
-               err = btrfs_subio_endio_read(inode, io_bio);
+       if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED)
+               err = btrfs_subio_endio_read(inode, io_bio, err);
 
        unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
                      dip->logical_offset + dip->bytes - 1);
@@ -7353,19 +7590,16 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
 static void btrfs_end_dio_bio(struct bio *bio, int err)
 {
        struct btrfs_dio_private *dip = bio->bi_private;
-       int ret;
 
-       if (err) {
-               btrfs_err(BTRFS_I(dip->inode)->root->fs_info,
-                         "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d",
-                     btrfs_ino(dip->inode), bio->bi_rw,
-                     (unsigned long long)bio->bi_iter.bi_sector,
-                     bio->bi_iter.bi_size, err);
-       } else if (dip->subio_endio) {
-               ret = dip->subio_endio(dip->inode, btrfs_io_bio(bio));
-               if (ret)
-                       err = ret;
-       }
+       if (err)
+               btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
+                          "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d",
+                          btrfs_ino(dip->inode), bio->bi_rw,
+                          (unsigned long long)bio->bi_iter.bi_sector,
+                          bio->bi_iter.bi_size, err);
+
+       if (dip->subio_endio)
+               err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err);
 
        if (err) {
                dip->errors = 1;