fs/btrfs/scrub.c
/*
 * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include "ctree.h"
#include "volumes.h"
#include "disk-io.h"
#include "ordered-data.h"
#include "transaction.h"
#include "backref.h"
#include "extent_io.h"
#include "dev-replace.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "raid56.h"

/*
 * This is only the first step towards a full-featured scrub. It reads all
 * extents and super blocks and verifies the checksums. In case a bad checksum
 * is found or the extent cannot be read, good data will be written back if
 * any can be found.
 *
 * Future enhancements:
 *  - In case an unrepairable extent is encountered, track which files are
 *    affected and report them
 *  - track and record media errors, throw out bad devices
 *  - add a mode to also read unallocated space
 */

struct scrub_block;
struct scrub_ctx;

/*
 * the following three values only influence the performance.
 * The last one configures the number of parallel and outstanding I/O
 * operations. The first two values configure an upper limit for the number
 * of (dynamically allocated) pages that are added to a bio.
 */
#define SCRUB_PAGES_PER_RD_BIO	32	/* 128k per bio */
#define SCRUB_PAGES_PER_WR_BIO	32	/* 128k per bio */
#define SCRUB_BIOS_PER_SCTX	64	/* 8MB per device in flight */

/*
 * the following value times PAGE_SIZE needs to be large enough to match the
 * largest node/leaf/sector size that shall be supported.
 * Values larger than BTRFS_STRIPE_LEN are not supported.
 */
#define SCRUB_MAX_PAGES_PER_BLOCK	16	/* 64k per node/leaf/sector */

struct scrub_page {
	struct scrub_block	*sblock;
	struct page		*page;
	struct btrfs_device	*dev;
	u64			flags;  /* extent flags */
	u64			generation;
	u64			logical;
	u64			physical;
	u64			physical_for_dev_replace;
	atomic_t		ref_count;
	struct {
		unsigned int	mirror_num:8;
		unsigned int	have_csum:1;
		unsigned int	io_error:1;
	};
	u8			csum[BTRFS_CSUM_SIZE];
};

struct scrub_bio {
	int			index;
	struct scrub_ctx	*sctx;
	struct btrfs_device	*dev;
	struct bio		*bio;
	int			err;
	u64			logical;
	u64			physical;
#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
	struct scrub_page	*pagev[SCRUB_PAGES_PER_WR_BIO];
#else
	struct scrub_page	*pagev[SCRUB_PAGES_PER_RD_BIO];
#endif
	int			page_count;
	int			next_free;
	struct btrfs_work	work;
};

struct scrub_block {
	struct scrub_page	*pagev[SCRUB_MAX_PAGES_PER_BLOCK];
	int			page_count;
	atomic_t		outstanding_pages;
	atomic_t		ref_count; /* free mem on transition to zero */
	struct scrub_ctx	*sctx;
	struct {
		unsigned int	header_error:1;
		unsigned int	checksum_error:1;
		unsigned int	no_io_error_seen:1;
		unsigned int	generation_error:1; /* also sets header_error */
	};
};

struct scrub_wr_ctx {
	struct scrub_bio *wr_curr_bio;
	struct btrfs_device *tgtdev;
	int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
	atomic_t flush_all_writes;
	struct mutex wr_lock;
};

struct scrub_ctx {
	struct scrub_bio	*bios[SCRUB_BIOS_PER_SCTX];
	struct btrfs_root	*dev_root;
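	/*
	 * free list of bios[]: first_free is the head, chained via
	 * scrub_bio::next_free; curr is the index of the bio currently
	 * being filled (-1 if none).
	 */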
	int			first_free;
	int			curr;
	atomic_t		bios_in_flight;
	atomic_t		workers_pending;
	spinlock_t		list_lock;
	wait_queue_head_t	list_wait;
	u16			csum_size;
	struct list_head	csum_list;
	atomic_t		cancel_req;
	int			readonly;
	int			pages_per_rd_bio;
	u32			sectorsize;
	u32			nodesize;
	u32			leafsize;

	int			is_dev_replace;
	struct scrub_wr_ctx	wr_ctx;

	/*
	 * statistics
	 */
	struct btrfs_scrub_progress stat;
	spinlock_t		stat_lock;
};

struct scrub_fixup_nodatasum {
	struct scrub_ctx	*sctx;
	struct btrfs_device	*dev;
	u64			logical;
	struct btrfs_root	*root;
	struct btrfs_work	work;
	int			mirror_num;
};

struct scrub_nocow_inode {
	u64			inum;
	u64			offset;
	u64			root;
	struct list_head	list;
};

struct scrub_copy_nocow_ctx {
	struct scrub_ctx	*sctx;
	u64			logical;
	u64			len;
	int			mirror_num;
	u64			physical_for_dev_replace;
	struct list_head	inodes;
	struct btrfs_work	work;
};

struct scrub_warning {
	struct btrfs_path	*path;
	u64			extent_item_size;
	char			*scratch_buf;
	char			*msg_buf;
	const char		*errstr;
	sector_t		sector;
	u64			logical;
	struct btrfs_device	*dev;
	int			msg_bufsize;
	int			scratch_bufsize;
};


static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
				     struct btrfs_fs_info *fs_info,
				     struct scrub_block *original_sblock,
				     u64 length, u64 logical,
				     struct scrub_block *sblocks_for_recheck);
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
				struct scrub_block *sblock, int is_metadata,
				int have_csum, u8 *csum, u64 generation,
				u16 csum_size);
static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
					 struct scrub_block *sblock,
					 int is_metadata, int have_csum,
					 const u8 *csum, u64 generation,
					 u16 csum_size);
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
					     struct scrub_block *sblock_good,
					     int force_write);
static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
					    struct scrub_block *sblock_good,
					    int page_num, int force_write);
static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
					   int page_num);
static int scrub_checksum_data(struct scrub_block *sblock);
static int scrub_checksum_tree_block(struct scrub_block *sblock);
static int scrub_checksum_super(struct scrub_block *sblock);
static void scrub_block_get(struct scrub_block *sblock);
static void scrub_block_put(struct scrub_block *sblock);
static void scrub_page_get(struct scrub_page *spage);
static void scrub_page_put(struct scrub_page *spage);
static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
				    struct scrub_page *spage);
static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
		       u64 physical, struct btrfs_device *dev, u64 flags,
		       u64 gen, int mirror_num, u8 *csum, int force,
		       u64 physical_for_dev_replace);
static void scrub_bio_end_io(struct bio *bio, int err);
static void scrub_bio_end_io_worker(struct btrfs_work *work);
static void scrub_block_complete(struct scrub_block *sblock);
static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
			       u64 extent_logical, u64 extent_len,
			       u64 *extent_physical,
			       struct btrfs_device **extent_dev,
			       int *extent_mirror_num);
static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
			      struct scrub_wr_ctx *wr_ctx,
			      struct btrfs_fs_info *fs_info,
			      struct btrfs_device *dev,
			      int is_dev_replace);
static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
				    struct scrub_page *spage);
static void scrub_wr_submit(struct scrub_ctx *sctx);
static void scrub_wr_bio_end_io(struct bio *bio, int err);
static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
static int write_page_nocow(struct scrub_ctx *sctx,
			    u64 physical_for_dev_replace, struct page *page);
static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
				      struct scrub_copy_nocow_ctx *ctx);
static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
			    int mirror_num, u64 physical_for_dev_replace);
static void copy_nocow_pages_worker(struct btrfs_work *work);
static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);


static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
{
	atomic_inc(&sctx->bios_in_flight);
}

static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
{
	atomic_dec(&sctx->bios_in_flight);
	wake_up(&sctx->list_wait);
}

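/*
 * Wait until any pending scrub pause request has been lifted.  Called with
 * fs_info->scrub_lock held; the lock is dropped while waiting and re-taken
 * before returning.
 */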
static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
{
	while (atomic_read(&fs_info->scrub_pause_req)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
		   atomic_read(&fs_info->scrub_pause_req) == 0);
		mutex_lock(&fs_info->scrub_lock);
	}
}

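/*
 * Mark this scrub as paused (so a pause requester does not have to wait for
 * us) and block until the pause request has been withdrawn.
 */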
static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
{
	atomic_inc(&fs_info->scrubs_paused);
	wake_up(&fs_info->scrub_pause_wait);

	mutex_lock(&fs_info->scrub_lock);
	__scrub_blocked_if_needed(fs_info);
	atomic_dec(&fs_info->scrubs_paused);
	mutex_unlock(&fs_info->scrub_lock);

	wake_up(&fs_info->scrub_pause_wait);
}

/*
 * used for workers that require transaction commits (i.e., for the
 * NOCOW case)
 */
static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
{
	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;

	/*
	 * increment scrubs_running to prevent cancel requests from
	 * completing as long as a worker is running. we must also
	 * increment scrubs_paused to prevent deadlocking on pause
	 * requests used for transaction commits (as the worker uses a
	 * transaction context). it is safe to regard the worker as
	 * paused for all practical matters. effectively, all we do is
	 * prevent cancellation requests from completing.
	 */
	mutex_lock(&fs_info->scrub_lock);
	atomic_inc(&fs_info->scrubs_running);
	atomic_inc(&fs_info->scrubs_paused);
	mutex_unlock(&fs_info->scrub_lock);
	atomic_inc(&sctx->workers_pending);
}

/* used for workers that require transaction commits */
static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
{
	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;

	/*
	 * see scrub_pending_trans_workers_inc() for why we're pretending
	 * to be paused in the scrub counters
	 */
	mutex_lock(&fs_info->scrub_lock);
	atomic_dec(&fs_info->scrubs_running);
	atomic_dec(&fs_info->scrubs_paused);
	mutex_unlock(&fs_info->scrub_lock);
	atomic_dec(&sctx->workers_pending);
	wake_up(&fs_info->scrub_pause_wait);
	wake_up(&sctx->list_wait);
}

static void scrub_free_csums(struct scrub_ctx *sctx)
{
	while (!list_empty(&sctx->csum_list)) {
		struct btrfs_ordered_sum *sum;
		sum = list_first_entry(&sctx->csum_list,
				       struct btrfs_ordered_sum, list);
		list_del(&sum->list);
		kfree(sum);
	}
}

static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
{
	int i;

	if (!sctx)
		return;

	scrub_free_wr_ctx(&sctx->wr_ctx);

	/* this can happen when scrub is cancelled */
	if (sctx->curr != -1) {
		struct scrub_bio *sbio = sctx->bios[sctx->curr];

		for (i = 0; i < sbio->page_count; i++) {
			WARN_ON(!sbio->pagev[i]->page);
			scrub_block_put(sbio->pagev[i]->sblock);
		}
		bio_put(sbio->bio);
	}

	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
		struct scrub_bio *sbio = sctx->bios[i];

		if (!sbio)
			break;
		kfree(sbio);
	}

	scrub_free_csums(sctx);
	kfree(sctx);
}

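/*
 * Allocate and initialize a scrub context for @dev, including its pool of
 * SCRUB_BIOS_PER_SCTX scrub_bio structures linked into a free list.
 */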
static noinline_for_stack
struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
{
	struct scrub_ctx *sctx;
	int		i;
	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
	int pages_per_rd_bio;
	int ret;

	/*
	 * the setting of pages_per_rd_bio is correct for scrub but might
	 * be wrong for the dev_replace code where we might read from
	 * different devices in the initial huge bios. However, that
	 * code is able to correctly handle the case when adding a page
	 * to a bio fails.
	 */
	if (dev->bdev)
		pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO,
					 bio_get_nr_vecs(dev->bdev));
	else
		pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
	sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
	if (!sctx)
		goto nomem;
	sctx->is_dev_replace = is_dev_replace;
	sctx->pages_per_rd_bio = pages_per_rd_bio;
	sctx->curr = -1;
	sctx->dev_root = dev->dev_root;
	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
		struct scrub_bio *sbio;

		sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
		if (!sbio)
			goto nomem;
		sctx->bios[i] = sbio;

		sbio->index = i;
		sbio->sctx = sctx;
		sbio->page_count = 0;
		sbio->work.func = scrub_bio_end_io_worker;

		if (i != SCRUB_BIOS_PER_SCTX - 1)
			sctx->bios[i]->next_free = i + 1;
		else
			sctx->bios[i]->next_free = -1;
	}
	sctx->first_free = 0;
	sctx->nodesize = dev->dev_root->nodesize;
	sctx->leafsize = dev->dev_root->leafsize;
	sctx->sectorsize = dev->dev_root->sectorsize;
	atomic_set(&sctx->bios_in_flight, 0);
	atomic_set(&sctx->workers_pending, 0);
	atomic_set(&sctx->cancel_req, 0);
	sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
	INIT_LIST_HEAD(&sctx->csum_list);

	spin_lock_init(&sctx->list_lock);
	spin_lock_init(&sctx->stat_lock);
	init_waitqueue_head(&sctx->list_wait);

	ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info,
				 fs_info->dev_replace.tgtdev, is_dev_replace);
	if (ret) {
		scrub_free_ctx(sctx);
		return ERR_PTR(ret);
	}
	return sctx;

nomem:
	scrub_free_ctx(sctx);
	return ERR_PTR(-ENOMEM);
}

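/*
 * Callback for iterate_extent_inodes(): resolve one inode that references
 * the corrupted extent to its file path(s) and print one warning line per
 * path.
 */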
static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
				     void *warn_ctx)
{
	u64 isize;
	u32 nlink;
	int ret;
	int i;
	struct extent_buffer *eb;
	struct btrfs_inode_item *inode_item;
	struct scrub_warning *swarn = warn_ctx;
	struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
	struct inode_fs_paths *ipath = NULL;
	struct btrfs_root *local_root;
	struct btrfs_key root_key;

	root_key.objectid = root;
	root_key.type = BTRFS_ROOT_ITEM_KEY;
	root_key.offset = (u64)-1;
	local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
	if (IS_ERR(local_root)) {
		ret = PTR_ERR(local_root);
		goto err;
	}

	ret = inode_item_info(inum, 0, local_root, swarn->path);
	if (ret) {
		btrfs_release_path(swarn->path);
		goto err;
	}

	eb = swarn->path->nodes[0];
	inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
					struct btrfs_inode_item);
	isize = btrfs_inode_size(eb, inode_item);
	nlink = btrfs_inode_nlink(eb, inode_item);
	btrfs_release_path(swarn->path);

	ipath = init_ipath(4096, local_root, swarn->path);
	if (IS_ERR(ipath)) {
		ret = PTR_ERR(ipath);
		ipath = NULL;
		goto err;
	}
	ret = paths_from_inode(inum, ipath);

	if (ret < 0)
		goto err;

	/*
	 * we deliberately ignore the fact that ipath might have been too
	 * small to hold all of the paths here
	 */
	for (i = 0; i < ipath->fspath->elem_cnt; ++i)
		printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
			"%s, sector %llu, root %llu, inode %llu, offset %llu, "
			"length %llu, links %u (path: %s)\n", swarn->errstr,
			swarn->logical, rcu_str_deref(swarn->dev->name),
			(unsigned long long)swarn->sector, root, inum, offset,
			min(isize - offset, (u64)PAGE_SIZE), nlink,
			(char *)(unsigned long)ipath->fspath->val[i]);

	free_ipath(ipath);
	return 0;

err:
	printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
		"%s, sector %llu, root %llu, inode %llu, offset %llu: path "
		"resolving failed with ret=%d\n", swarn->errstr,
		swarn->logical, rcu_str_deref(swarn->dev->name),
		(unsigned long long)swarn->sector, root, inum, offset, ret);

	free_ipath(ipath);
	return 0;
}

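/*
 * Report a corruption: look up the extent at the bad logical address and
 * print a warning for every tree block or file path that references it.
 */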
static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
{
	struct btrfs_device *dev;
	struct btrfs_fs_info *fs_info;
	struct btrfs_path *path;
	struct btrfs_key found_key;
	struct extent_buffer *eb;
	struct btrfs_extent_item *ei;
	struct scrub_warning swarn;
	unsigned long ptr = 0;
	u64 extent_item_pos;
	u64 flags = 0;
	u64 ref_root;
	u32 item_size;
	u8 ref_level;
	const int bufsize = 4096;
	int ret;

	WARN_ON(sblock->page_count < 1);
	dev = sblock->pagev[0]->dev;
	fs_info = sblock->sctx->dev_root->fs_info;

	path = btrfs_alloc_path();

	swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
	swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
	swarn.sector = (sblock->pagev[0]->physical) >> 9;
	swarn.logical = sblock->pagev[0]->logical;
	swarn.errstr = errstr;
	swarn.dev = NULL;
	swarn.msg_bufsize = bufsize;
	swarn.scratch_bufsize = bufsize;

	if (!path || !swarn.scratch_buf || !swarn.msg_buf)
		goto out;

	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
				  &flags);
	if (ret < 0)
		goto out;

	extent_item_pos = swarn.logical - found_key.objectid;
	swarn.extent_item_size = found_key.offset;

	eb = path->nodes[0];
	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
	item_size = btrfs_item_size_nr(eb, path->slots[0]);

	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
		do {
			ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
						      item_size, &ref_root,
						      &ref_level);
			printk_in_rcu(KERN_WARNING
				"BTRFS: %s at logical %llu on dev %s, "
				"sector %llu: metadata %s (level %d) in tree "
				"%llu\n", errstr, swarn.logical,
				rcu_str_deref(dev->name),
				(unsigned long long)swarn.sector,
				ref_level ? "node" : "leaf",
				ret < 0 ? -1 : ref_level,
				ret < 0 ? -1 : ref_root);
		} while (ret != 1);
		btrfs_release_path(path);
	} else {
		btrfs_release_path(path);
		swarn.path = path;
		swarn.dev = dev;
		iterate_extent_inodes(fs_info, found_key.objectid,
					extent_item_pos, 1,
					scrub_print_warning_inode, &swarn);
	}

out:
	btrfs_free_path(path);
	kfree(swarn.scratch_buf);
	kfree(swarn.msg_buf);
}

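/*
 * Callback for iterate_inodes_from_logical(): re-read the affected page
 * through the regular read path while forcing the bad mirror, so that the
 * read path's on-the-fly repair rewrites the bad copy.  If the page is
 * already up to date in memory, write the good data back directly via
 * repair_io_failure().
 */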
static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
{
	struct page *page = NULL;
	unsigned long index;
	struct scrub_fixup_nodatasum *fixup = fixup_ctx;
	int ret;
	int corrected = 0;
	struct btrfs_key key;
	struct inode *inode = NULL;
	struct btrfs_fs_info *fs_info;
	u64 end = offset + PAGE_SIZE - 1;
	struct btrfs_root *local_root;
	int srcu_index;

	key.objectid = root;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;

	fs_info = fixup->root->fs_info;
	srcu_index = srcu_read_lock(&fs_info->subvol_srcu);

	local_root = btrfs_read_fs_root_no_name(fs_info, &key);
	if (IS_ERR(local_root)) {
		srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
		return PTR_ERR(local_root);
	}

	key.type = BTRFS_INODE_ITEM_KEY;
	key.objectid = inum;
	key.offset = 0;
	inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
	srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
	if (IS_ERR(inode))
		return PTR_ERR(inode);

	index = offset >> PAGE_CACHE_SHIFT;

	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
	if (!page) {
		ret = -ENOMEM;
		goto out;
	}

	if (PageUptodate(page)) {
		if (PageDirty(page)) {
			/*
			 * we need to write the data to the defective sector.
			 * the data that was in that sector is not in memory,
			 * because the page was modified. we must not write the
			 * modified page to that sector.
			 *
			 * TODO: what could be done here: wait for the delalloc
			 *       runner to write out that page (might involve
			 *       COW) and see whether the sector is still
			 *       referenced afterwards.
			 *
			 * For the time being, we'll treat this error as
			 * uncorrectable, although there is a chance that a
			 * later scrub will find the bad sector again and that
			 * there's no dirty page in memory by then.
			 */
			ret = -EIO;
			goto out;
		}
		fs_info = BTRFS_I(inode)->root->fs_info;
		ret = repair_io_failure(fs_info, offset, PAGE_SIZE,
					fixup->logical, page,
					fixup->mirror_num);
		unlock_page(page);
		corrected = !ret;
	} else {
		/*
		 * we need to get good data first. the general readpage path
		 * will call repair_io_failure for us, we just have to make
		 * sure we read the bad mirror.
		 */
		ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
					EXTENT_DAMAGED, GFP_NOFS);
		if (ret) {
			/* set_extent_bits should give proper error */
			WARN_ON(ret > 0);
			if (ret > 0)
				ret = -EFAULT;
			goto out;
		}

		ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
						btrfs_get_extent,
						fixup->mirror_num);
		wait_on_page_locked(page);

		corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
						end, EXTENT_DAMAGED, 0, NULL);
		if (!corrected)
			clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
						EXTENT_DAMAGED, GFP_NOFS);
	}

out:
	if (page)
		put_page(page);
	if (inode)
		iput(inode);

	if (ret < 0)
		return ret;

	if (ret == 0 && corrected) {
		/*
		 * we only need to call readpage for one of the inodes belonging
		 * to this extent. so make iterate_extent_inodes stop
		 */
		return 1;
	}

	return -EIO;
}

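/*
 * Worker for the nodatasum repair case: within a joined transaction, walk
 * all inodes that reference the bad logical address and let
 * scrub_fixup_readpage() trigger the repair through the regular read path.
 */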
static void scrub_fixup_nodatasum(struct btrfs_work *work)
{
	int ret;
	struct scrub_fixup_nodatasum *fixup;
	struct scrub_ctx *sctx;
	struct btrfs_trans_handle *trans = NULL;
	struct btrfs_path *path;
	int uncorrectable = 0;

	fixup = container_of(work, struct scrub_fixup_nodatasum, work);
	sctx = fixup->sctx;

	path = btrfs_alloc_path();
	if (!path) {
		spin_lock(&sctx->stat_lock);
		++sctx->stat.malloc_errors;
		spin_unlock(&sctx->stat_lock);
		uncorrectable = 1;
		goto out;
	}

	trans = btrfs_join_transaction(fixup->root);
	if (IS_ERR(trans)) {
		uncorrectable = 1;
		goto out;
	}

	/*
	 * the idea is to trigger a regular read through the standard path. we
	 * read a page from the (failed) logical address by specifying the
	 * corresponding copy number of the failed sector. thus, that readpage is
	 * expected to fail.
	 * that is the point where on-the-fly error correction will kick in
	 * (once it's finished) and rewrite the failed sector if a good copy
	 * can be found.
	 */
	ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
						path, scrub_fixup_readpage,
						fixup);
	if (ret < 0) {
		uncorrectable = 1;
		goto out;
	}
	WARN_ON(ret != 1);

	spin_lock(&sctx->stat_lock);
	++sctx->stat.corrected_errors;
	spin_unlock(&sctx->stat_lock);

out:
	if (trans && !IS_ERR(trans))
		btrfs_end_transaction(trans, fixup->root);
	if (uncorrectable) {
		spin_lock(&sctx->stat_lock);
		++sctx->stat.uncorrectable_errors;
		spin_unlock(&sctx->stat_lock);
		btrfs_dev_replace_stats_inc(
			&sctx->dev_root->fs_info->dev_replace.
			num_uncorrectable_read_errors);
		printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
		    "unable to fixup (nodatasum) error at logical %llu on dev %s\n",
			fixup->logical, rcu_str_deref(fixup->dev->name));
	}

	btrfs_free_path(path);
	kfree(fixup);

	scrub_pending_trans_workers_dec(sctx);
}

/*
 * scrub_handle_errored_block gets called when either verification of the
 * pages failed or the bio failed to read, e.g. with EIO. In the latter
 * case, this function handles all pages in the bio, even though only one
 * may be bad.
 * The goal of this function is to repair the errored block by using the
 * contents of one of the mirrors.
 */
static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
{
	struct scrub_ctx *sctx = sblock_to_check->sctx;
	struct btrfs_device *dev;
	struct btrfs_fs_info *fs_info;
	u64 length;
	u64 logical;
	u64 generation;
	unsigned int failed_mirror_index;
	unsigned int is_metadata;
	unsigned int have_csum;
	u8 *csum;
	struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
	struct scrub_block *sblock_bad;
	int ret;
	int mirror_index;
	int page_num;
	int success;
	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);

	BUG_ON(sblock_to_check->page_count < 1);
	fs_info = sctx->dev_root->fs_info;
	if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
		/*
		 * if we find an error in a super block, we just report it.
		 * They will get written with the next transaction commit
		 * anyway
		 */
		spin_lock(&sctx->stat_lock);
		++sctx->stat.super_errors;
		spin_unlock(&sctx->stat_lock);
		return 0;
	}
	length = sblock_to_check->page_count * PAGE_SIZE;
	logical = sblock_to_check->pagev[0]->logical;
	generation = sblock_to_check->pagev[0]->generation;
	BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
	failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
	is_metadata = !(sblock_to_check->pagev[0]->flags &
			BTRFS_EXTENT_FLAG_DATA);
	have_csum = sblock_to_check->pagev[0]->have_csum;
	csum = sblock_to_check->pagev[0]->csum;
	dev = sblock_to_check->pagev[0]->dev;

	if (sctx->is_dev_replace && !is_metadata && !have_csum) {
		sblocks_for_recheck = NULL;
		goto nodatasum_case;
	}

	/*
	 * read all mirrors one after the other. This includes re-reading
	 * the extent or metadata block that failed (the reason this fixup
	 * code is called in the first place), this time page by page, in
	 * order to know which pages caused I/O errors and which ones are
	 * good (for all mirrors).
	 * The goal is to handle the situation where more than one
	 * mirror contains I/O errors, but the errors do not
	 * overlap, i.e. the data can be repaired by selecting the
	 * pages from those mirrors without I/O error on the
	 * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
	 * would be that mirror #1 has an I/O error on the first page,
	 * the second page is good, and mirror #2 has an I/O error on
	 * the second page, but the first page is good.
	 * Then the first page of the first mirror can be repaired by
	 * taking the first page of the second mirror, and the
	 * second page of the second mirror can be repaired by
	 * copying the contents of the 2nd page of the 1st mirror.
	 * One more note: if the pages of one mirror contain I/O
	 * errors, the checksum cannot be verified. In order to get
	 * the best data for repairing, the first attempt is to find
	 * a mirror without I/O errors and with a validated checksum.
	 * Only if this is not possible, the pages are picked from
	 * mirrors with I/O errors without considering the checksum.
	 * If the latter is the case, at the end, the checksum of the
	 * repaired area is verified in order to correctly maintain
	 * the statistics.
	 */

	sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS *
				     sizeof(*sblocks_for_recheck),
				     GFP_NOFS);
	if (!sblocks_for_recheck) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.malloc_errors++;
		sctx->stat.read_errors++;
		sctx->stat.uncorrectable_errors++;
		spin_unlock(&sctx->stat_lock);
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
		goto out;
	}

	/* setup the context, map the logical blocks and alloc the pages */
	ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length,
					logical, sblocks_for_recheck);
	if (ret) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.read_errors++;
		sctx->stat.uncorrectable_errors++;
		spin_unlock(&sctx->stat_lock);
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
		goto out;
	}
	BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
	sblock_bad = sblocks_for_recheck + failed_mirror_index;

	/* build and submit the bios for the failed mirror, check checksums */
	scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
			    csum, generation, sctx->csum_size);

	if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
	    sblock_bad->no_io_error_seen) {
		/*
		 * the error disappeared after reading page by page, or
		 * the area was part of a huge bio and other parts of the
		 * bio caused I/O errors, or the block layer merged several
		 * read requests into one and the error is caused by a
		 * different bio (usually one of the two latter cases is
		 * the cause)
		 */
		spin_lock(&sctx->stat_lock);
		sctx->stat.unverified_errors++;
		spin_unlock(&sctx->stat_lock);

		if (sctx->is_dev_replace)
			scrub_write_block_to_dev_replace(sblock_bad);
		goto out;
	}

	if (!sblock_bad->no_io_error_seen) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.read_errors++;
		spin_unlock(&sctx->stat_lock);
		if (__ratelimit(&_rs))
			scrub_print_warning("i/o error", sblock_to_check);
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
	} else if (sblock_bad->checksum_error) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.csum_errors++;
		spin_unlock(&sctx->stat_lock);
		if (__ratelimit(&_rs))
			scrub_print_warning("checksum error", sblock_to_check);
		btrfs_dev_stat_inc_and_print(dev,
					     BTRFS_DEV_STAT_CORRUPTION_ERRS);
	} else if (sblock_bad->header_error) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.verify_errors++;
		spin_unlock(&sctx->stat_lock);
		if (__ratelimit(&_rs))
			scrub_print_warning("checksum/header error",
					    sblock_to_check);
		if (sblock_bad->generation_error)
			btrfs_dev_stat_inc_and_print(dev,
				BTRFS_DEV_STAT_GENERATION_ERRS);
		else
			btrfs_dev_stat_inc_and_print(dev,
				BTRFS_DEV_STAT_CORRUPTION_ERRS);
	}

	if (sctx->readonly) {
		ASSERT(!sctx->is_dev_replace);
		goto out;
	}

	if (!is_metadata && !have_csum) {
		struct scrub_fixup_nodatasum *fixup_nodatasum;

nodatasum_case:
		WARN_ON(sctx->is_dev_replace);

		/*
		 * !is_metadata and !have_csum, this means that the data
		 * might not be COW'ed, that it might be modified
		 * concurrently. The general strategy to work on the
		 * commit root does not help in the case when COW is not
		 * used.
		 */
		fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
		if (!fixup_nodatasum)
			goto did_not_correct_error;
		fixup_nodatasum->sctx = sctx;
		fixup_nodatasum->dev = dev;
		fixup_nodatasum->logical = logical;
		fixup_nodatasum->root = fs_info->extent_root;
		fixup_nodatasum->mirror_num = failed_mirror_index + 1;
		scrub_pending_trans_workers_inc(sctx);
		fixup_nodatasum->work.func = scrub_fixup_nodatasum;
		btrfs_queue_worker(&fs_info->scrub_workers,
				   &fixup_nodatasum->work);
		goto out;
	}

	/*
	 * now build and submit the bios for the other mirrors, check
	 * checksums.
	 * First try to pick the mirror which is completely without I/O
	 * errors and also does not have a checksum error.
	 * If one is found, and if a checksum is present, the full block
	 * that is known to contain an error is rewritten. Afterwards
	 * the block is known to be corrected.
	 * If a mirror is found which is completely correct, and no
	 * checksum is present, only those pages are rewritten that had
	 * an I/O error in the block to be repaired, since it cannot be
	 * determined, which copy of the other pages is better (and it
	 * could happen otherwise that a correct page would be
	 * overwritten by a bad one).
	 */
	for (mirror_index = 0;
	     mirror_index < BTRFS_MAX_MIRRORS &&
	     sblocks_for_recheck[mirror_index].page_count > 0;
	     mirror_index++) {
		struct scrub_block *sblock_other;

		if (mirror_index == failed_mirror_index)
			continue;
		sblock_other = sblocks_for_recheck + mirror_index;

		/* build and submit the bios, check checksums */
		scrub_recheck_block(fs_info, sblock_other, is_metadata,
				    have_csum, csum, generation,
				    sctx->csum_size);

		if (!sblock_other->header_error &&
		    !sblock_other->checksum_error &&
		    sblock_other->no_io_error_seen) {
			if (sctx->is_dev_replace) {
				scrub_write_block_to_dev_replace(sblock_other);
			} else {
				int force_write = is_metadata || have_csum;

				ret = scrub_repair_block_from_good_copy(
						sblock_bad, sblock_other,
						force_write);
			}
			if (0 == ret)
				goto corrected_error;
		}
	}

	/*
	 * for dev_replace, pick good pages and write to the target device.
	 */
	if (sctx->is_dev_replace) {
		success = 1;
		for (page_num = 0; page_num < sblock_bad->page_count;
		     page_num++) {
			int sub_success;

			sub_success = 0;
			for (mirror_index = 0;
			     mirror_index < BTRFS_MAX_MIRRORS &&
			     sblocks_for_recheck[mirror_index].page_count > 0;
			     mirror_index++) {
				struct scrub_block *sblock_other =
					sblocks_for_recheck + mirror_index;
				struct scrub_page *page_other =
					sblock_other->pagev[page_num];

				if (!page_other->io_error) {
					ret = scrub_write_page_to_dev_replace(
							sblock_other, page_num);
					if (ret == 0) {
						/* succeeded for this page */
						sub_success = 1;
						break;
					} else {
						btrfs_dev_replace_stats_inc(
							&sctx->dev_root->
							fs_info->dev_replace.
							num_write_errors);
					}
				}
			}

			if (!sub_success) {
				/*
				 * did not find a mirror to fetch the page
				 * from. scrub_write_page_to_dev_replace()
				 * handles this case (page->io_error), by
				 * filling the block with zeros before
				 * submitting the write request
				 */
				success = 0;
				ret = scrub_write_page_to_dev_replace(
						sblock_bad, page_num);
				if (ret)
					btrfs_dev_replace_stats_inc(
						&sctx->dev_root->fs_info->
						dev_replace.num_write_errors);
			}
		}

		goto out;
	}

	/*
	 * for regular scrub, repair those pages that are errored.
	 * In case of I/O errors in the area that is supposed to be
	 * repaired, continue by picking good copies of those pages.
	 * Select the good pages from mirrors to rewrite bad pages from
	 * the area to fix. Afterwards verify the checksum of the block
	 * that is supposed to be repaired. This verification step is
	 * only done for the purpose of statistic counting and for the
	 * final scrub report, whether errors remain.
	 * A perfect algorithm could make use of the checksum and try
	 * all possible combinations of pages from the different mirrors
	 * until the checksum verification succeeds. For example, when
	 * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
	 * of mirror #2 is readable but the final checksum test fails,
	 * then the 2nd page of mirror #3 could be tried, whether now
	 * the final checksum succeeds. But this would be a rare
	 * exception and is therefore not implemented. At least it is
	 * avoided that the good copy is overwritten.
	 * A more useful improvement would be to pick the sectors
	 * without I/O error based on sector sizes (512 bytes on legacy
	 * disks) instead of on PAGE_SIZE. Then maybe 512 byte of one
	 * mirror could be repaired by taking 512 byte of a different
	 * mirror, even if other 512 byte sectors in the same PAGE_SIZE
	 * area are unreadable.
	 */

	/* can only fix I/O errors from here on */
	if (sblock_bad->no_io_error_seen)
		goto did_not_correct_error;

	success = 1;
	for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
		struct scrub_page *page_bad = sblock_bad->pagev[page_num];

		if (!page_bad->io_error)
			continue;

		for (mirror_index = 0;
		     mirror_index < BTRFS_MAX_MIRRORS &&
		     sblocks_for_recheck[mirror_index].page_count > 0;
		     mirror_index++) {
			struct scrub_block *sblock_other = sblocks_for_recheck +
							   mirror_index;
			struct scrub_page *page_other = sblock_other->pagev[
							page_num];

			if (!page_other->io_error) {
				ret = scrub_repair_page_from_good_copy(
					sblock_bad, sblock_other, page_num, 0);
				if (0 == ret) {
					page_bad->io_error = 0;
					break; /* succeeded for this page */
				}
			}
		}

		if (page_bad->io_error) {
			/* did not find a mirror to copy the page from */
			success = 0;
		}
	}

	if (success) {
		if (is_metadata || have_csum) {
			/*
			 * need to verify the checksum now that all
			 * sectors on disk are repaired (the write
			 * request for data to be repaired is on its way).
			 * Just be lazy and use scrub_recheck_block()
			 * which re-reads the data before the checksum
			 * is verified, but most likely the data comes out
			 * of the page cache.
			 */
			scrub_recheck_block(fs_info, sblock_bad,
					    is_metadata, have_csum, csum,
					    generation, sctx->csum_size);
			if (!sblock_bad->header_error &&
			    !sblock_bad->checksum_error &&
			    sblock_bad->no_io_error_seen)
				goto corrected_error;
			else
				goto did_not_correct_error;
		} else {
corrected_error:
			spin_lock(&sctx->stat_lock);
			sctx->stat.corrected_errors++;
			spin_unlock(&sctx->stat_lock);
			printk_ratelimited_in_rcu(KERN_ERR
				"BTRFS: fixed up error at logical %llu on dev %s\n",
				logical, rcu_str_deref(dev->name));
		}
	} else {
did_not_correct_error:
		spin_lock(&sctx->stat_lock);
		sctx->stat.uncorrectable_errors++;
		spin_unlock(&sctx->stat_lock);
		printk_ratelimited_in_rcu(KERN_ERR
			"BTRFS: unable to fixup (regular) error at logical %llu on dev %s\n",
			logical, rcu_str_deref(dev->name));
	}

out:
	if (sblocks_for_recheck) {
		for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
		     mirror_index++) {
			struct scrub_block *sblock = sblocks_for_recheck +
						     mirror_index;
			int page_index;

			for (page_index = 0; page_index < sblock->page_count;
			     page_index++) {
				sblock->pagev[page_index]->sblock = NULL;
				scrub_page_put(sblock->pagev[page_index]);
			}
		}
		kfree(sblocks_for_recheck);
	}

	return 0;
}

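/*
 * Map the logical range to all of its mirrors and set up one scrub_block
 * (with freshly allocated pages) per mirror so that each copy can be
 * re-read and verified independently.
 */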
1221 static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
1222                                      struct btrfs_fs_info *fs_info,
1223                                      struct scrub_block *original_sblock,
1224                                      u64 length, u64 logical,
1225                                      struct scrub_block *sblocks_for_recheck)
1226 {
1227         int page_index;
1228         int mirror_index;
1229         int ret;
1230
1231         /*
1232          * note: the two members ref_count and outstanding_pages
1233          * are not used (and not set) in the blocks that are used for
1234          * the recheck procedure
1235          */
1236
1237         page_index = 0;
1238         while (length > 0) {
1239                 u64 sublen = min_t(u64, length, PAGE_SIZE);
1240                 u64 mapped_length = sublen;
1241                 struct btrfs_bio *bbio = NULL;
1242
1243                 /*
1244                  * with a length of PAGE_SIZE, each returned stripe
1245                  * represents one mirror
1246                  */
1247                 ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical,
1248                                       &mapped_length, &bbio, 0);
1249                 if (ret || !bbio || mapped_length < sublen) {
1250                         kfree(bbio);
1251                         return -EIO;
1252                 }
1253
1254                 BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
1255                 for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
1256                      mirror_index++) {
1257                         struct scrub_block *sblock;
1258                         struct scrub_page *page;
1259
1260                         if (mirror_index >= BTRFS_MAX_MIRRORS)
1261                                 continue;
1262
1263                         sblock = sblocks_for_recheck + mirror_index;
1264                         sblock->sctx = sctx;
1265                         page = kzalloc(sizeof(*page), GFP_NOFS);
1266                         if (!page) {
1267 leave_nomem:
1268                                 spin_lock(&sctx->stat_lock);
1269                                 sctx->stat.malloc_errors++;
1270                                 spin_unlock(&sctx->stat_lock);
1271                                 kfree(bbio);
1272                                 return -ENOMEM;
1273                         }
1274                         scrub_page_get(page);
1275                         sblock->pagev[page_index] = page;
1276                         page->logical = logical;
1277                         page->physical = bbio->stripes[mirror_index].physical;
1278                         BUG_ON(page_index >= original_sblock->page_count);
1279                         page->physical_for_dev_replace =
1280                                 original_sblock->pagev[page_index]->
1281                                 physical_for_dev_replace;
1282                         /* for missing devices, dev->bdev is NULL */
1283                         page->dev = bbio->stripes[mirror_index].dev;
1284                         page->mirror_num = mirror_index + 1;
1285                         sblock->page_count++;
1286                         page->page = alloc_page(GFP_NOFS);
1287                         if (!page->page)
1288                                 goto leave_nomem;
1289                 }
1290                 kfree(bbio);
1291                 length -= sublen;
1292                 logical += sublen;
1293                 page_index++;
1294         }
1295
1296         return 0;
1297 }
1298
1299 /*
1300  * this function will check the on disk data for checksum errors, header
1301  * errors and read I/O errors. If any I/O errors happen, the exact pages
1302  * on which the errors occurred are marked as bad. The goal is to enable
1303  * scrub to take those pages that are not errored from all the mirrors so
1304  * that the pages errored in the just handled mirror can be repaired.
1305  */
1306 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1307                                 struct scrub_block *sblock, int is_metadata,
1308                                 int have_csum, u8 *csum, u64 generation,
1309                                 u16 csum_size)
1310 {
1311         int page_num;
1312
1313         sblock->no_io_error_seen = 1;
1314         sblock->header_error = 0;
1315         sblock->checksum_error = 0;
1316
1317         for (page_num = 0; page_num < sblock->page_count; page_num++) {
1318                 struct bio *bio;
1319                 struct scrub_page *page = sblock->pagev[page_num];
1320
1321                 if (page->dev->bdev == NULL) {
1322                         page->io_error = 1;
1323                         sblock->no_io_error_seen = 0;
1324                         continue;
1325                 }
1326
1327                 WARN_ON(!page->page);
1328                 bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
1329                 if (!bio) {
1330                         page->io_error = 1;
1331                         sblock->no_io_error_seen = 0;
1332                         continue;
1333                 }
1334                 bio->bi_bdev = page->dev->bdev;
1335                 bio->bi_iter.bi_sector = page->physical >> 9;
1336
1337                 bio_add_page(bio, page->page, PAGE_SIZE, 0);
1338                 if (btrfsic_submit_bio_wait(READ, bio))
1339                         sblock->no_io_error_seen = 0;
1340
1341                 bio_put(bio);
1342         }
1343
1344         if (sblock->no_io_error_seen)
1345                 scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
1346                                              have_csum, csum, generation,
1347                                              csum_size);
1348
1349         return;
1350 }
1351
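/*
 * recompute the checksum of the whole block: for metadata the header
 * fields (bytenr, fsid, chunk tree uuid, generation) are validated as
 * well and the csum embedded in the header is used for comparison;
 * data blocks without a known csum are left unchecked. The result is
 * recorded in header_error/generation_error/checksum_error.
 */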
1352 static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1353                                          struct scrub_block *sblock,
1354                                          int is_metadata, int have_csum,
1355                                          const u8 *csum, u64 generation,
1356                                          u16 csum_size)
1357 {
1358         int page_num;
1359         u8 calculated_csum[BTRFS_CSUM_SIZE];
1360         u32 crc = ~(u32)0;
1361         void *mapped_buffer;
1362
1363         WARN_ON(!sblock->pagev[0]->page);
1364         if (is_metadata) {
1365                 struct btrfs_header *h;
1366
1367                 mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
1368                 h = (struct btrfs_header *)mapped_buffer;
1369
1370                 if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) ||
1371                     memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
1372                     memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1373                            BTRFS_UUID_SIZE)) {
1374                         sblock->header_error = 1;
1375                 } else if (generation != btrfs_stack_header_generation(h)) {
1376                         sblock->header_error = 1;
1377                         sblock->generation_error = 1;
1378                 }
1379                 csum = h->csum;
1380         } else {
1381                 if (!have_csum)
1382                         return;
1383
1384                 mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
1385         }
1386
1387         for (page_num = 0;;) {
1388                 if (page_num == 0 && is_metadata)
1389                         crc = btrfs_csum_data(
1390                                 ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,
1391                                 crc, PAGE_SIZE - BTRFS_CSUM_SIZE);
1392                 else
1393                         crc = btrfs_csum_data(mapped_buffer, crc, PAGE_SIZE);
1394
1395                 kunmap_atomic(mapped_buffer);
1396                 page_num++;
1397                 if (page_num >= sblock->page_count)
1398                         break;
1399                 WARN_ON(!sblock->pagev[page_num]->page);
1400
1401                 mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
1402         }
1403
1404         btrfs_csum_final(crc, calculated_csum);
1405         if (memcmp(calculated_csum, csum, csum_size))
1406                 sblock->checksum_error = 1;
1407 }
1408
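/*
 * rewrite the bad mirror in place from a good one, page by page. Unless
 * force_write is set, a page is only written when its block has a header
 * or checksum error or the page itself had an I/O error.
 */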
1409 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1410                                              struct scrub_block *sblock_good,
1411                                              int force_write)
1412 {
1413         int page_num;
1414         int ret = 0;
1415
1416         for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1417                 int ret_sub;
1418
1419                 ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
1420                                                            sblock_good,
1421                                                            page_num,
1422                                                            force_write);
1423                 if (ret_sub)
1424                         ret = ret_sub;
1425         }
1426
1427         return ret;
1428 }
1429
1430 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1431                                             struct scrub_block *sblock_good,
1432                                             int page_num, int force_write)
1433 {
1434         struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1435         struct scrub_page *page_good = sblock_good->pagev[page_num];
1436
1437         BUG_ON(page_bad->page == NULL);
1438         BUG_ON(page_good->page == NULL);
1439         if (force_write || sblock_bad->header_error ||
1440             sblock_bad->checksum_error || page_bad->io_error) {
1441                 struct bio *bio;
1442                 int ret;
1443
1444                 if (!page_bad->dev->bdev) {
1445                         printk_ratelimited(KERN_WARNING "BTRFS: "
1446                                 "scrub_repair_page_from_good_copy(bdev == NULL) "
1447                                 "is unexpected!\n");
1448                         return -EIO;
1449                 }
1450
1451                 bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
1452                 if (!bio)
1453                         return -EIO;
1454                 bio->bi_bdev = page_bad->dev->bdev;
1455                 bio->bi_iter.bi_sector = page_bad->physical >> 9;
1456
1457                 ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
1458                 if (PAGE_SIZE != ret) {
1459                         bio_put(bio);
1460                         return -EIO;
1461                 }
1462
1463                 if (btrfsic_submit_bio_wait(WRITE, bio)) {
1464                         btrfs_dev_stat_inc_and_print(page_bad->dev,
1465                                 BTRFS_DEV_STAT_WRITE_ERRS);
1466                         btrfs_dev_replace_stats_inc(
1467                                 &sblock_bad->sctx->dev_root->fs_info->
1468                                 dev_replace.num_write_errors);
1469                         bio_put(bio);
1470                         return -EIO;
1471                 }
1472                 bio_put(bio);
1473         }
1474
1475         return 0;
1476 }
1477
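/*
 * queue all pages of a block for writing to the dev-replace target
 * device; pages that could not be read are zero-filled before being
 * queued, so uninitialized page content is never written out.
 */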
1478 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1479 {
1480         int page_num;
1481
1482         for (page_num = 0; page_num < sblock->page_count; page_num++) {
1483                 int ret;
1484
1485                 ret = scrub_write_page_to_dev_replace(sblock, page_num);
1486                 if (ret)
1487                         btrfs_dev_replace_stats_inc(
1488                                 &sblock->sctx->dev_root->fs_info->dev_replace.
1489                                 num_write_errors);
1490         }
1491 }
1492
1493 static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
1494                                            int page_num)
1495 {
1496         struct scrub_page *spage = sblock->pagev[page_num];
1497
1498         BUG_ON(spage->page == NULL);
1499         if (spage->io_error) {
1500                 void *mapped_buffer = kmap_atomic(spage->page);
1501
1502                 memset(mapped_buffer, 0, PAGE_CACHE_SIZE);
1503                 flush_dcache_page(spage->page);
1504                 kunmap_atomic(mapped_buffer);
1505         }
1506         return scrub_add_page_to_wr_bio(sblock->sctx, spage);
1507 }
1508
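/*
 * append one page to the current write bio for the dev-replace target.
 * A new bio is set up when there is no current one, and the current bio
 * is submitted early when the page is not physically and logically
 * contiguous with it or when it becomes full.
 */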
1509 static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1510                                     struct scrub_page *spage)
1511 {
1512         struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1513         struct scrub_bio *sbio;
1514         int ret;
1515
1516         mutex_lock(&wr_ctx->wr_lock);
1517 again:
1518         if (!wr_ctx->wr_curr_bio) {
1519                 wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
1520                                               GFP_NOFS);
1521                 if (!wr_ctx->wr_curr_bio) {
1522                         mutex_unlock(&wr_ctx->wr_lock);
1523                         return -ENOMEM;
1524                 }
1525                 wr_ctx->wr_curr_bio->sctx = sctx;
1526                 wr_ctx->wr_curr_bio->page_count = 0;
1527         }
1528         sbio = wr_ctx->wr_curr_bio;
1529         if (sbio->page_count == 0) {
1530                 struct bio *bio;
1531
1532                 sbio->physical = spage->physical_for_dev_replace;
1533                 sbio->logical = spage->logical;
1534                 sbio->dev = wr_ctx->tgtdev;
1535                 bio = sbio->bio;
1536                 if (!bio) {
1537                         bio = btrfs_io_bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
1538                         if (!bio) {
1539                                 mutex_unlock(&wr_ctx->wr_lock);
1540                                 return -ENOMEM;
1541                         }
1542                         sbio->bio = bio;
1543                 }
1544
1545                 bio->bi_private = sbio;
1546                 bio->bi_end_io = scrub_wr_bio_end_io;
1547                 bio->bi_bdev = sbio->dev->bdev;
1548                 bio->bi_iter.bi_sector = sbio->physical >> 9;
1549                 sbio->err = 0;
1550         } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1551                    spage->physical_for_dev_replace ||
1552                    sbio->logical + sbio->page_count * PAGE_SIZE !=
1553                    spage->logical) {
1554                 scrub_wr_submit(sctx);
1555                 goto again;
1556         }
1557
1558         ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1559         if (ret != PAGE_SIZE) {
1560                 if (sbio->page_count < 1) {
1561                         bio_put(sbio->bio);
1562                         sbio->bio = NULL;
1563                         mutex_unlock(&wr_ctx->wr_lock);
1564                         return -EIO;
1565                 }
1566                 scrub_wr_submit(sctx);
1567                 goto again;
1568         }
1569
1570         sbio->pagev[sbio->page_count] = spage;
1571         scrub_page_get(spage);
1572         sbio->page_count++;
1573         if (sbio->page_count == wr_ctx->pages_per_wr_bio)
1574                 scrub_wr_submit(sctx);
1575         mutex_unlock(&wr_ctx->wr_lock);
1576
1577         return 0;
1578 }
1579
1580 static void scrub_wr_submit(struct scrub_ctx *sctx)
1581 {
1582         struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1583         struct scrub_bio *sbio;
1584
1585         if (!wr_ctx->wr_curr_bio)
1586                 return;
1587
1588         sbio = wr_ctx->wr_curr_bio;
1589         wr_ctx->wr_curr_bio = NULL;
1590         WARN_ON(!sbio->bio->bi_bdev);
1591         scrub_pending_bio_inc(sctx);
1592         /* process all writes in a single worker thread. The block layer then
1593          * orders the requests before sending them to the driver, which
1594          * doubled the write performance on spinning disks when measured
1595          * with Linux 3.5 */
1596         btrfsic_submit_bio(WRITE, sbio->bio);
1597 }
1598
1599 static void scrub_wr_bio_end_io(struct bio *bio, int err)
1600 {
1601         struct scrub_bio *sbio = bio->bi_private;
1602         struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
1603
1604         sbio->err = err;
1605         sbio->bio = bio;
1606
1607         sbio->work.func = scrub_wr_bio_end_io_worker;
1608         btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work);
1609 }
1610
1611 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
1612 {
1613         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1614         struct scrub_ctx *sctx = sbio->sctx;
1615         int i;
1616
1617         WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
1618         if (sbio->err) {
1619                 struct btrfs_dev_replace *dev_replace =
1620                         &sbio->sctx->dev_root->fs_info->dev_replace;
1621
1622                 for (i = 0; i < sbio->page_count; i++) {
1623                         struct scrub_page *spage = sbio->pagev[i];
1624
1625                         spage->io_error = 1;
1626                         btrfs_dev_replace_stats_inc(&dev_replace->
1627                                                     num_write_errors);
1628                 }
1629         }
1630
1631         for (i = 0; i < sbio->page_count; i++)
1632                 scrub_page_put(sbio->pagev[i]);
1633
1634         bio_put(sbio->bio);
1635         kfree(sbio);
1636         scrub_pending_bio_dec(sctx);
1637 }
1638
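/*
 * verify the checksum of a completely read block. The extent flags of
 * the first page select the data, tree block or super block routine; a
 * nonzero result from the data or tree block check triggers
 * scrub_handle_errored_block(). Super block errors are only counted,
 * they never trigger a repair here.
 */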
1639 static int scrub_checksum(struct scrub_block *sblock)
1640 {
1641         u64 flags;
1642         int ret;
1643
1644         WARN_ON(sblock->page_count < 1);
1645         flags = sblock->pagev[0]->flags;
1646         ret = 0;
1647         if (flags & BTRFS_EXTENT_FLAG_DATA)
1648                 ret = scrub_checksum_data(sblock);
1649         else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1650                 ret = scrub_checksum_tree_block(sblock);
1651         else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1652                 (void)scrub_checksum_super(sblock);
1653         else
1654                 WARN_ON(1);
1655         if (ret)
1656                 scrub_handle_errored_block(sblock);
1657
1658         return ret;
1659 }
1660
1661 static int scrub_checksum_data(struct scrub_block *sblock)
1662 {
1663         struct scrub_ctx *sctx = sblock->sctx;
1664         u8 csum[BTRFS_CSUM_SIZE];
1665         u8 *on_disk_csum;
1666         struct page *page;
1667         void *buffer;
1668         u32 crc = ~(u32)0;
1669         int fail = 0;
1670         u64 len;
1671         int index;
1672
1673         BUG_ON(sblock->page_count < 1);
1674         if (!sblock->pagev[0]->have_csum)
1675                 return 0;
1676
1677         on_disk_csum = sblock->pagev[0]->csum;
1678         page = sblock->pagev[0]->page;
1679         buffer = kmap_atomic(page);
1680
1681         len = sctx->sectorsize;
1682         index = 0;
1683         for (;;) {
1684                 u64 l = min_t(u64, len, PAGE_SIZE);
1685
1686                 crc = btrfs_csum_data(buffer, crc, l);
1687                 kunmap_atomic(buffer);
1688                 len -= l;
1689                 if (len == 0)
1690                         break;
1691                 index++;
1692                 BUG_ON(index >= sblock->page_count);
1693                 BUG_ON(!sblock->pagev[index]->page);
1694                 page = sblock->pagev[index]->page;
1695                 buffer = kmap_atomic(page);
1696         }
1697
1698         btrfs_csum_final(crc, csum);
1699         if (memcmp(csum, on_disk_csum, sctx->csum_size))
1700                 fail = 1;
1701
1702         return fail;
1703 }
1704
1705 static int scrub_checksum_tree_block(struct scrub_block *sblock)
1706 {
1707         struct scrub_ctx *sctx = sblock->sctx;
1708         struct btrfs_header *h;
1709         struct btrfs_root *root = sctx->dev_root;
1710         struct btrfs_fs_info *fs_info = root->fs_info;
1711         u8 calculated_csum[BTRFS_CSUM_SIZE];
1712         u8 on_disk_csum[BTRFS_CSUM_SIZE];
1713         struct page *page;
1714         void *mapped_buffer;
1715         u64 mapped_size;
1716         void *p;
1717         u32 crc = ~(u32)0;
1718         int fail = 0;
1719         int crc_fail = 0;
1720         u64 len;
1721         int index;
1722
1723         BUG_ON(sblock->page_count < 1);
1724         page = sblock->pagev[0]->page;
1725         mapped_buffer = kmap_atomic(page);
1726         h = (struct btrfs_header *)mapped_buffer;
1727         memcpy(on_disk_csum, h->csum, sctx->csum_size);
1728
1729         /*
1730          * we don't use the getter functions here, as we
1731          * a) don't have an extent buffer and
1732          * b) the page is already kmapped
1733          */
1734
1735         if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
1736                 ++fail;
1737
1738         if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h))
1739                 ++fail;
1740
1741         if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
1742                 ++fail;
1743
1744         if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1745                    BTRFS_UUID_SIZE))
1746                 ++fail;
1747
1748         WARN_ON(sctx->nodesize != sctx->leafsize);
1749         len = sctx->nodesize - BTRFS_CSUM_SIZE;
1750         mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1751         p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1752         index = 0;
1753         for (;;) {
1754                 u64 l = min_t(u64, len, mapped_size);
1755
1756                 crc = btrfs_csum_data(p, crc, l);
1757                 kunmap_atomic(mapped_buffer);
1758                 len -= l;
1759                 if (len == 0)
1760                         break;
1761                 index++;
1762                 BUG_ON(index >= sblock->page_count);
1763                 BUG_ON(!sblock->pagev[index]->page);
1764                 page = sblock->pagev[index]->page;
1765                 mapped_buffer = kmap_atomic(page);
1766                 mapped_size = PAGE_SIZE;
1767                 p = mapped_buffer;
1768         }
1769
1770         btrfs_csum_final(crc, calculated_csum);
1771         if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1772                 ++crc_fail;
1773
1774         return fail || crc_fail;
1775 }
1776
1777 static int scrub_checksum_super(struct scrub_block *sblock)
1778 {
1779         struct btrfs_super_block *s;
1780         struct scrub_ctx *sctx = sblock->sctx;
1781         struct btrfs_root *root = sctx->dev_root;
1782         struct btrfs_fs_info *fs_info = root->fs_info;
1783         u8 calculated_csum[BTRFS_CSUM_SIZE];
1784         u8 on_disk_csum[BTRFS_CSUM_SIZE];
1785         struct page *page;
1786         void *mapped_buffer;
1787         u64 mapped_size;
1788         void *p;
1789         u32 crc = ~(u32)0;
1790         int fail_gen = 0;
1791         int fail_cor = 0;
1792         u64 len;
1793         int index;
1794
1795         BUG_ON(sblock->page_count < 1);
1796         page = sblock->pagev[0]->page;
1797         mapped_buffer = kmap_atomic(page);
1798         s = (struct btrfs_super_block *)mapped_buffer;
1799         memcpy(on_disk_csum, s->csum, sctx->csum_size);
1800
1801         if (sblock->pagev[0]->logical != btrfs_super_bytenr(s))
1802                 ++fail_cor;
1803
1804         if (sblock->pagev[0]->generation != btrfs_super_generation(s))
1805                 ++fail_gen;
1806
1807         if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
1808                 ++fail_cor;
1809
1810         len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
1811         mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1812         p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1813         index = 0;
1814         for (;;) {
1815                 u64 l = min_t(u64, len, mapped_size);
1816
1817                 crc = btrfs_csum_data(p, crc, l);
1818                 kunmap_atomic(mapped_buffer);
1819                 len -= l;
1820                 if (len == 0)
1821                         break;
1822                 index++;
1823                 BUG_ON(index >= sblock->page_count);
1824                 BUG_ON(!sblock->pagev[index]->page);
1825                 page = sblock->pagev[index]->page;
1826                 mapped_buffer = kmap_atomic(page);
1827                 mapped_size = PAGE_SIZE;
1828                 p = mapped_buffer;
1829         }
1830
1831         btrfs_csum_final(crc, calculated_csum);
1832         if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1833                 ++fail_cor;
1834
1835         if (fail_cor + fail_gen) {
1836                 /*
1837                  * if we find an error in a super block, we just report it.
1838                  * The super blocks will get rewritten with the next
1839                  * transaction commit anyway
1840                  */
1841                 spin_lock(&sctx->stat_lock);
1842                 ++sctx->stat.super_errors;
1843                 spin_unlock(&sctx->stat_lock);
1844                 if (fail_cor)
1845                         btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
1846                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1847                 else
1848                         btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
1849                                 BTRFS_DEV_STAT_GENERATION_ERRS);
1850         }
1851
1852         return fail_cor + fail_gen;
1853 }
1854
1855 static void scrub_block_get(struct scrub_block *sblock)
1856 {
1857         atomic_inc(&sblock->ref_count);
1858 }
1859
1860 static void scrub_block_put(struct scrub_block *sblock)
1861 {
1862         if (atomic_dec_and_test(&sblock->ref_count)) {
1863                 int i;
1864
1865                 for (i = 0; i < sblock->page_count; i++)
1866                         scrub_page_put(sblock->pagev[i]);
1867                 kfree(sblock);
1868         }
1869 }
1870
1871 static void scrub_page_get(struct scrub_page *spage)
1872 {
1873         atomic_inc(&spage->ref_count);
1874 }
1875
1876 static void scrub_page_put(struct scrub_page *spage)
1877 {
1878         if (atomic_dec_and_test(&spage->ref_count)) {
1879                 if (spage->page)
1880                         __free_page(spage->page);
1881                 kfree(spage);
1882         }
1883 }
1884
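/*
 * send off the read bio that is currently being filled (sctx->curr) and
 * account for it as being in flight.
 */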
1885 static void scrub_submit(struct scrub_ctx *sctx)
1886 {
1887         struct scrub_bio *sbio;
1888
1889         if (sctx->curr == -1)
1890                 return;
1891
1892         sbio = sctx->bios[sctx->curr];
1893         sctx->curr = -1;
1894         scrub_pending_bio_inc(sctx);
1895
1896         if (!sbio->bio->bi_bdev) {
1897                 /*
1898                  * this case should not happen. If btrfs_map_block() is
1899                  * wrong, it could happen for dev-replace operations on
1900                  * missing devices when no mirrors are available, but in
1901                  * that case the mount should already have failed.
1902                  * This case is handled correctly (but _very_ slowly).
1903                  */
1904                 printk_ratelimited(KERN_WARNING
1905                         "BTRFS: scrub_submit(bio bdev == NULL) is unexpected!\n");
1906                 bio_endio(sbio->bio, -EIO);
1907         } else {
1908                 btrfsic_submit_bio(READ, sbio->bio);
1909         }
1910 }
1911
1912 static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
1913                                     struct scrub_page *spage)
1914 {
1915         struct scrub_block *sblock = spage->sblock;
1916         struct scrub_bio *sbio;
1917         int ret;
1918
1919 again:
1920         /*
1921          * grab a fresh bio or wait for one to become available
1922          */
1923         while (sctx->curr == -1) {
1924                 spin_lock(&sctx->list_lock);
1925                 sctx->curr = sctx->first_free;
1926                 if (sctx->curr != -1) {
1927                         sctx->first_free = sctx->bios[sctx->curr]->next_free;
1928                         sctx->bios[sctx->curr]->next_free = -1;
1929                         sctx->bios[sctx->curr]->page_count = 0;
1930                         spin_unlock(&sctx->list_lock);
1931                 } else {
1932                         spin_unlock(&sctx->list_lock);
1933                         wait_event(sctx->list_wait, sctx->first_free != -1);
1934                 }
1935         }
1936         sbio = sctx->bios[sctx->curr];
1937         if (sbio->page_count == 0) {
1938                 struct bio *bio;
1939
1940                 sbio->physical = spage->physical;
1941                 sbio->logical = spage->logical;
1942                 sbio->dev = spage->dev;
1943                 bio = sbio->bio;
1944                 if (!bio) {
1945                         bio = btrfs_io_bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
1946                         if (!bio)
1947                                 return -ENOMEM;
1948                         sbio->bio = bio;
1949                 }
1950
1951                 bio->bi_private = sbio;
1952                 bio->bi_end_io = scrub_bio_end_io;
1953                 bio->bi_bdev = sbio->dev->bdev;
1954                 bio->bi_iter.bi_sector = sbio->physical >> 9;
1955                 sbio->err = 0;
1956         } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1957                    spage->physical ||
1958                    sbio->logical + sbio->page_count * PAGE_SIZE !=
1959                    spage->logical ||
1960                    sbio->dev != spage->dev) {
1961                 scrub_submit(sctx);
1962                 goto again;
1963         }
1964
1965         sbio->pagev[sbio->page_count] = spage;
1966         ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1967         if (ret != PAGE_SIZE) {
1968                 if (sbio->page_count < 1) {
1969                         bio_put(sbio->bio);
1970                         sbio->bio = NULL;
1971                         return -EIO;
1972                 }
1973                 scrub_submit(sctx);
1974                 goto again;
1975         }
1976
1977         scrub_block_get(sblock); /* one for the page added to the bio */
1978         atomic_inc(&sblock->outstanding_pages);
1979         sbio->page_count++;
1980         if (sbio->page_count == sctx->pages_per_rd_bio)
1981                 scrub_submit(sctx);
1982
1983         return 0;
1984 }
1985
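/*
 * split the range [logical, logical + len) into PAGE_SIZE pieces,
 * allocate a backing page for each, collect them in a new scrub_block
 * and queue every page on a read bio. With 'force' set the bio is
 * submitted immediately instead of waiting until it is full.
 */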
1986 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
1987                        u64 physical, struct btrfs_device *dev, u64 flags,
1988                        u64 gen, int mirror_num, u8 *csum, int force,
1989                        u64 physical_for_dev_replace)
1990 {
1991         struct scrub_block *sblock;
1992         int index;
1993
1994         sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
1995         if (!sblock) {
1996                 spin_lock(&sctx->stat_lock);
1997                 sctx->stat.malloc_errors++;
1998                 spin_unlock(&sctx->stat_lock);
1999                 return -ENOMEM;
2000         }
2001
2002         /* one ref inside this function, plus one for each page added to
2003          * a bio later on */
2004         atomic_set(&sblock->ref_count, 1);
2005         sblock->sctx = sctx;
2006         sblock->no_io_error_seen = 1;
2007
2008         for (index = 0; len > 0; index++) {
2009                 struct scrub_page *spage;
2010                 u64 l = min_t(u64, len, PAGE_SIZE);
2011
2012                 spage = kzalloc(sizeof(*spage), GFP_NOFS);
2013                 if (!spage) {
2014 leave_nomem:
2015                         spin_lock(&sctx->stat_lock);
2016                         sctx->stat.malloc_errors++;
2017                         spin_unlock(&sctx->stat_lock);
2018                         scrub_block_put(sblock);
2019                         return -ENOMEM;
2020                 }
2021                 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2022                 scrub_page_get(spage);
2023                 sblock->pagev[index] = spage;
2024                 spage->sblock = sblock;
2025                 spage->dev = dev;
2026                 spage->flags = flags;
2027                 spage->generation = gen;
2028                 spage->logical = logical;
2029                 spage->physical = physical;
2030                 spage->physical_for_dev_replace = physical_for_dev_replace;
2031                 spage->mirror_num = mirror_num;
2032                 if (csum) {
2033                         spage->have_csum = 1;
2034                         memcpy(spage->csum, csum, sctx->csum_size);
2035                 } else {
2036                         spage->have_csum = 0;
2037                 }
2038                 sblock->page_count++;
2039                 spage->page = alloc_page(GFP_NOFS);
2040                 if (!spage->page)
2041                         goto leave_nomem;
2042                 len -= l;
2043                 logical += l;
2044                 physical += l;
2045                 physical_for_dev_replace += l;
2046         }
2047
2048         WARN_ON(sblock->page_count == 0);
2049         for (index = 0; index < sblock->page_count; index++) {
2050                 struct scrub_page *spage = sblock->pagev[index];
2051                 int ret;
2052
2053                 ret = scrub_add_page_to_rd_bio(sctx, spage);
2054                 if (ret) {
2055                         scrub_block_put(sblock);
2056                         return ret;
2057                 }
2058         }
2059
2060         if (force)
2061                 scrub_submit(sctx);
2062
2063         /* last one frees, either here or in bio completion for last page */
2064         scrub_block_put(sblock);
2065         return 0;
2066 }
2067
2068 static void scrub_bio_end_io(struct bio *bio, int err)
2069 {
2070         struct scrub_bio *sbio = bio->bi_private;
2071         struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
2072
2073         sbio->err = err;
2074         sbio->bio = bio;
2075
2076         btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
2077 }
2078
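/*
 * read completion handler (runs in the scrub worker): on error every
 * page of the bio is flagged, blocks whose last outstanding page just
 * completed are verified/repaired via scrub_block_complete(), the sbio
 * is put back on the free list and, in the dev-replace case, pending
 * writes are submitted when flush_all_writes is set.
 */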
2079 static void scrub_bio_end_io_worker(struct btrfs_work *work)
2080 {
2081         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2082         struct scrub_ctx *sctx = sbio->sctx;
2083         int i;
2084
2085         BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
2086         if (sbio->err) {
2087                 for (i = 0; i < sbio->page_count; i++) {
2088                         struct scrub_page *spage = sbio->pagev[i];
2089
2090                         spage->io_error = 1;
2091                         spage->sblock->no_io_error_seen = 0;
2092                 }
2093         }
2094
2095         /* now complete the scrub_block items that have all pages completed */
2096         for (i = 0; i < sbio->page_count; i++) {
2097                 struct scrub_page *spage = sbio->pagev[i];
2098                 struct scrub_block *sblock = spage->sblock;
2099
2100                 if (atomic_dec_and_test(&sblock->outstanding_pages))
2101                         scrub_block_complete(sblock);
2102                 scrub_block_put(sblock);
2103         }
2104
2105         bio_put(sbio->bio);
2106         sbio->bio = NULL;
2107         spin_lock(&sctx->list_lock);
2108         sbio->next_free = sctx->first_free;
2109         sctx->first_free = sbio->index;
2110         spin_unlock(&sctx->list_lock);
2111
2112         if (sctx->is_dev_replace &&
2113             atomic_read(&sctx->wr_ctx.flush_all_writes)) {
2114                 mutex_lock(&sctx->wr_ctx.wr_lock);
2115                 scrub_wr_submit(sctx);
2116                 mutex_unlock(&sctx->wr_ctx.wr_lock);
2117         }
2118
2119         scrub_pending_bio_dec(sctx);
2120 }
2121
2122 static void scrub_block_complete(struct scrub_block *sblock)
2123 {
2124         if (!sblock->no_io_error_seen) {
2125                 scrub_handle_errored_block(sblock);
2126         } else {
2127                 /*
2128                  * if the block has a checksum error, it is written via the
2129                  * repair mechanism in the dev replace case; otherwise it is
2130                  * written here (also only in the dev replace case).
2131                  */
2132                 if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace)
2133                         scrub_write_block_to_dev_replace(sblock);
2134         }
2135 }
2136
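/*
 * look up the data checksum for 'logical' in the csum_list that was
 * collected up front. Sums that end before 'logical' are discarded on
 * the way; returns 1 and copies the csum on a hit, 0 otherwise.
 */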
2137 static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
2138                            u8 *csum)
2139 {
2140         struct btrfs_ordered_sum *sum = NULL;
2141         unsigned long index;
2142         unsigned long num_sectors;
2143
2144         while (!list_empty(&sctx->csum_list)) {
2145                 sum = list_first_entry(&sctx->csum_list,
2146                                        struct btrfs_ordered_sum, list);
2147                 if (sum->bytenr > logical)
2148                         return 0;
2149                 if (sum->bytenr + sum->len > logical)
2150                         break;
2151
2152                 ++sctx->stat.csum_discards;
2153                 list_del(&sum->list);
2154                 kfree(sum);
2155                 sum = NULL;
2156         }
2157         if (!sum)
2158                 return 0;
2159
2160         index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize;
2161         num_sectors = sum->len / sctx->sectorsize;
2162         memcpy(csum, sum->sums + index, sctx->csum_size);
2163         if (index == num_sectors - 1) {
2164                 list_del(&sum->list);
2165                 kfree(sum);
2166         }
2167         return 1;
2168 }
2169
2170 /* scrub extent tries to collect up to 64 kB for each bio */
2171 static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
2172                         u64 physical, struct btrfs_device *dev, u64 flags,
2173                         u64 gen, int mirror_num, u64 physical_for_dev_replace)
2174 {
2175         int ret;
2176         u8 csum[BTRFS_CSUM_SIZE];
2177         u32 blocksize;
2178
2179         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2180                 blocksize = sctx->sectorsize;
2181                 spin_lock(&sctx->stat_lock);
2182                 sctx->stat.data_extents_scrubbed++;
2183                 sctx->stat.data_bytes_scrubbed += len;
2184                 spin_unlock(&sctx->stat_lock);
2185         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2186                 WARN_ON(sctx->nodesize != sctx->leafsize);
2187                 blocksize = sctx->nodesize;
2188                 spin_lock(&sctx->stat_lock);
2189                 sctx->stat.tree_extents_scrubbed++;
2190                 sctx->stat.tree_bytes_scrubbed += len;
2191                 spin_unlock(&sctx->stat_lock);
2192         } else {
2193                 blocksize = sctx->sectorsize;
2194                 WARN_ON(1);
2195         }
2196
2197         while (len) {
2198                 u64 l = min_t(u64, len, blocksize);
2199                 int have_csum = 0;
2200
2201                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2202                         /* push csums to sbio */
2203                         have_csum = scrub_find_csum(sctx, logical, l, csum);
2204                         if (have_csum == 0)
2205                                 ++sctx->stat.no_csum;
2206                         if (sctx->is_dev_replace && !have_csum) {
2207                                 ret = copy_nocow_pages(sctx, logical, l,
2208                                                        mirror_num,
2209                                                       physical_for_dev_replace);
2210                                 goto behind_scrub_pages;
2211                         }
2212                 }
2213                 ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
2214                                   mirror_num, have_csum ? csum : NULL, 0,
2215                                   physical_for_dev_replace);
2216 behind_scrub_pages:
2217                 if (ret)
2218                         return ret;
2219                 len -= l;
2220                 logical += l;
2221                 physical += l;
2222                 physical_for_dev_replace += l;
2223         }
2224         return 0;
2225 }
2226
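/*
 * scrub the portion of the chunk that lives in stripe 'num' of
 * 'scrub_dev': derive the per-RAID-profile offset, increment and mirror
 * number, walk the extent tree on the commit root stripe by stripe,
 * fetch the matching data csums and feed every extent that intersects
 * the stripe to scrub_extent(). For RAID5/6, stripes beyond the data
 * stripes (the parity stripes) are skipped here.
 */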
2227 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2228                                            struct map_lookup *map,
2229                                            struct btrfs_device *scrub_dev,
2230                                            int num, u64 base, u64 length,
2231                                            int is_dev_replace)
2232 {
2233         struct btrfs_path *path;
2234         struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
2235         struct btrfs_root *root = fs_info->extent_root;
2236         struct btrfs_root *csum_root = fs_info->csum_root;
2237         struct btrfs_extent_item *extent;
2238         struct blk_plug plug;
2239         u64 flags;
2240         int ret;
2241         int slot;
2242         u64 nstripes;
2243         struct extent_buffer *l;
2244         struct btrfs_key key;
2245         u64 physical;
2246         u64 logical;
2247         u64 logic_end;
2248         u64 generation;
2249         int mirror_num;
2250         struct reada_control *reada1;
2251         struct reada_control *reada2;
2252         struct btrfs_key key_start;
2253         struct btrfs_key key_end;
2254         u64 increment = map->stripe_len;
2255         u64 offset;
2256         u64 extent_logical;
2257         u64 extent_physical;
2258         u64 extent_len;
2259         struct btrfs_device *extent_dev;
2260         int extent_mirror_num;
2261         int stop_loop;
2262
2263         if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2264                          BTRFS_BLOCK_GROUP_RAID6)) {
2265                 if (num >= nr_data_stripes(map)) {
2266                         return 0;
2267                 }
2268         }
2269
2270         nstripes = length;
2271         offset = 0;
2272         do_div(nstripes, map->stripe_len);
2273         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
2274                 offset = map->stripe_len * num;
2275                 increment = map->stripe_len * map->num_stripes;
2276                 mirror_num = 1;
2277         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2278                 int factor = map->num_stripes / map->sub_stripes;
2279                 offset = map->stripe_len * (num / map->sub_stripes);
2280                 increment = map->stripe_len * factor;
2281                 mirror_num = num % map->sub_stripes + 1;
2282         } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
2283                 increment = map->stripe_len;
2284                 mirror_num = num % map->num_stripes + 1;
2285         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
2286                 increment = map->stripe_len;
2287                 mirror_num = num % map->num_stripes + 1;
2288         } else {
2289                 increment = map->stripe_len;
2290                 mirror_num = 1;
2291         }
2292
2293         path = btrfs_alloc_path();
2294         if (!path)
2295                 return -ENOMEM;
2296
2297         /*
2298          * work on commit root. The related disk blocks are static as
2299          * long as COW is applied. This means it is safe to rewrite
2300          * them to repair disk errors without any race conditions
2301          */
2302         path->search_commit_root = 1;
2303         path->skip_locking = 1;
2304
2305         /*
2306          * trigger the readahead for the extent tree and the csum tree and
2307          * wait for completion. During readahead, the scrub is officially
2308          * paused to not hold off transaction commits
2309          */
2310         logical = base + offset;
2311
2312         wait_event(sctx->list_wait,
2313                    atomic_read(&sctx->bios_in_flight) == 0);
2314         scrub_blocked_if_needed(fs_info);
2315
2316         /* FIXME it might be better to start readahead at commit root */
2317         key_start.objectid = logical;
2318         key_start.type = BTRFS_EXTENT_ITEM_KEY;
2319         key_start.offset = (u64)0;
2320         key_end.objectid = base + offset + nstripes * increment;
2321         key_end.type = BTRFS_METADATA_ITEM_KEY;
2322         key_end.offset = (u64)-1;
2323         reada1 = btrfs_reada_add(root, &key_start, &key_end);
2324
2325         key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
2326         key_start.type = BTRFS_EXTENT_CSUM_KEY;
2327         key_start.offset = logical;
2328         key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
2329         key_end.type = BTRFS_EXTENT_CSUM_KEY;
2330         key_end.offset = base + offset + nstripes * increment;
2331         reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
2332
2333         if (!IS_ERR(reada1))
2334                 btrfs_reada_wait(reada1);
2335         if (!IS_ERR(reada2))
2336                 btrfs_reada_wait(reada2);
2337
2338
2339         /*
2340          * collect all data csums for the stripe to avoid seeking during
2341          * the scrub. This might currently (crc32) end up being about 1MB
2342          */
2343         blk_start_plug(&plug);
2344
2345         /*
2346          * now find all extents for each stripe and scrub them
2347          */
2348         logical = base + offset;
2349         physical = map->stripes[num].physical;
2350         logic_end = logical + increment * nstripes;
2351         ret = 0;
2352         while (logical < logic_end) {
2353                 /*
2354                  * canceled?
2355                  */
2356                 if (atomic_read(&fs_info->scrub_cancel_req) ||
2357                     atomic_read(&sctx->cancel_req)) {
2358                         ret = -ECANCELED;
2359                         goto out;
2360                 }
2361                 /*
2362                  * check to see if we have to pause
2363                  */
2364                 if (atomic_read(&fs_info->scrub_pause_req)) {
2365                         /* push queued extents */
2366                         atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
2367                         scrub_submit(sctx);
2368                         mutex_lock(&sctx->wr_ctx.wr_lock);
2369                         scrub_wr_submit(sctx);
2370                         mutex_unlock(&sctx->wr_ctx.wr_lock);
2371                         wait_event(sctx->list_wait,
2372                                    atomic_read(&sctx->bios_in_flight) == 0);
2373                         atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
2374                         scrub_blocked_if_needed(fs_info);
2375                 }
2376
2377                 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2378                         key.type = BTRFS_METADATA_ITEM_KEY;
2379                 else
2380                         key.type = BTRFS_EXTENT_ITEM_KEY;
2381                 key.objectid = logical;
2382                 key.offset = (u64)-1;
2383
2384                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2385                 if (ret < 0)
2386                         goto out;
2387
2388                 if (ret > 0) {
2389                         ret = btrfs_previous_extent_item(root, path, 0);
2390                         if (ret < 0)
2391                                 goto out;
2392                         if (ret > 0) {
2393                                 /* there's no smaller item, so stick with the
2394                                  * larger one */
2395                                 btrfs_release_path(path);
2396                                 ret = btrfs_search_slot(NULL, root, &key,
2397                                                         path, 0, 0);
2398                                 if (ret < 0)
2399                                         goto out;
2400                         }
2401                 }
2402
2403                 stop_loop = 0;
2404                 while (1) {
2405                         u64 bytes;
2406
2407                         l = path->nodes[0];
2408                         slot = path->slots[0];
2409                         if (slot >= btrfs_header_nritems(l)) {
2410                                 ret = btrfs_next_leaf(root, path);
2411                                 if (ret == 0)
2412                                         continue;
2413                                 if (ret < 0)
2414                                         goto out;
2415
2416                                 stop_loop = 1;
2417                                 break;
2418                         }
2419                         btrfs_item_key_to_cpu(l, &key, slot);
2420
2421                         if (key.type == BTRFS_METADATA_ITEM_KEY)
2422                                 bytes = root->leafsize;
2423                         else
2424                                 bytes = key.offset;
2425
2426                         if (key.objectid + bytes <= logical)
2427                                 goto next;
2428
2429                         if (key.type != BTRFS_EXTENT_ITEM_KEY &&
2430                             key.type != BTRFS_METADATA_ITEM_KEY)
2431                                 goto next;
2432
2433                         if (key.objectid >= logical + map->stripe_len) {
2434                                 /* out of this device extent */
2435                                 if (key.objectid >= logic_end)
2436                                         stop_loop = 1;
2437                                 break;
2438                         }
2439
2440                         extent = btrfs_item_ptr(l, slot,
2441                                                 struct btrfs_extent_item);
2442                         flags = btrfs_extent_flags(l, extent);
2443                         generation = btrfs_extent_generation(l, extent);
2444
2445                         if (key.objectid < logical &&
2446                             (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
2447                                 btrfs_err(fs_info,
2448                                            "scrub: tree block %llu spanning "
2449                                            "stripes, ignored. logical=%llu",
2450                                        key.objectid, logical);
2451                                 goto next;
2452                         }
2453
2454 again:
2455                         extent_logical = key.objectid;
2456                         extent_len = bytes;
2457
2458                         /*
2459                          * trim extent to this stripe
2460                          */
2461                         if (extent_logical < logical) {
2462                                 extent_len -= logical - extent_logical;
2463                                 extent_logical = logical;
2464                         }
2465                         if (extent_logical + extent_len >
2466                             logical + map->stripe_len) {
2467                                 extent_len = logical + map->stripe_len -
2468                                              extent_logical;
2469                         }
2470
2471                         extent_physical = extent_logical - logical + physical;
2472                         extent_dev = scrub_dev;
2473                         extent_mirror_num = mirror_num;
2474                         if (is_dev_replace)
2475                                 scrub_remap_extent(fs_info, extent_logical,
2476                                                    extent_len, &extent_physical,
2477                                                    &extent_dev,
2478                                                    &extent_mirror_num);
2479
2480                         ret = btrfs_lookup_csums_range(csum_root, logical,
2481                                                 logical + map->stripe_len - 1,
2482                                                 &sctx->csum_list, 1);
2483                         if (ret)
2484                                 goto out;
2485
2486                         ret = scrub_extent(sctx, extent_logical, extent_len,
2487                                            extent_physical, extent_dev, flags,
2488                                            generation, extent_mirror_num,
2489                                            extent_logical - logical + physical);
2490                         if (ret)
2491                                 goto out;
2492
2493                         scrub_free_csums(sctx);
2494                         if (extent_logical + extent_len <
2495                             key.objectid + bytes) {
2496                                 logical += increment;
2497                                 physical += map->stripe_len;
2498
2499                                 if (logical < key.objectid + bytes) {
2500                                         cond_resched();
2501                                         goto again;
2502                                 }
2503
2504                                 if (logical >= logic_end) {
2505                                         stop_loop = 1;
2506                                         break;
2507                                 }
2508                         }
2509 next:
2510                         path->slots[0]++;
2511                 }
2512                 btrfs_release_path(path);
2513                 logical += increment;
2514                 physical += map->stripe_len;
2515                 spin_lock(&sctx->stat_lock);
2516                 if (stop_loop)
2517                         sctx->stat.last_physical = map->stripes[num].physical +
2518                                                    length;
2519                 else
2520                         sctx->stat.last_physical = physical;
2521                 spin_unlock(&sctx->stat_lock);
2522                 if (stop_loop)
2523                         break;
2524         }
2525 out:
2526         /* push queued extents */
2527         scrub_submit(sctx);
2528         mutex_lock(&sctx->wr_ctx.wr_lock);
2529         scrub_wr_submit(sctx);
2530         mutex_unlock(&sctx->wr_ctx.wr_lock);
2531
2532         blk_finish_plug(&plug);
2533         btrfs_free_path(path);
2534         return ret < 0 ? ret : 0;
2535 }
2536
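/*
 * look up the chunk mapping for 'chunk_offset' and scrub every stripe
 * of it that sits on 'scrub_dev' at 'dev_offset'.
 */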
2537 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
2538                                           struct btrfs_device *scrub_dev,
2539                                           u64 chunk_tree, u64 chunk_objectid,
2540                                           u64 chunk_offset, u64 length,
2541                                           u64 dev_offset, int is_dev_replace)
2542 {
2543         struct btrfs_mapping_tree *map_tree =
2544                 &sctx->dev_root->fs_info->mapping_tree;
2545         struct map_lookup *map;
2546         struct extent_map *em;
2547         int i;
2548         int ret = 0;
2549
2550         read_lock(&map_tree->map_tree.lock);
2551         em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
2552         read_unlock(&map_tree->map_tree.lock);
2553
2554         if (!em)
2555                 return -EINVAL;
2556
2557         map = (struct map_lookup *)em->bdev;
2558         if (em->start != chunk_offset)
2559                 goto out;
2560
2561         if (em->len < length)
2562                 goto out;
2563
2564         for (i = 0; i < map->num_stripes; ++i) {
2565                 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
2566                     map->stripes[i].physical == dev_offset) {
2567                         ret = scrub_stripe(sctx, map, scrub_dev, i,
2568                                            chunk_offset, length,
2569                                            is_dev_replace);
2570                         if (ret)
2571                                 goto out;
2572                 }
2573         }
2574 out:
2575         free_extent_map(em);
2576
2577         return ret;
2578 }
2579
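/*
 * walk the dev extent tree of 'scrub_dev' between 'start' and 'end' and
 * scrub the chunk behind each dev extent. Between chunks all pending
 * read and write bios are flushed and waited for before the dev-replace
 * cursor is advanced.
 */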
2580 static noinline_for_stack
2581 int scrub_enumerate_chunks(struct scrub_ctx *sctx,
2582                            struct btrfs_device *scrub_dev, u64 start, u64 end,
2583                            int is_dev_replace)
2584 {
2585         struct btrfs_dev_extent *dev_extent = NULL;
2586         struct btrfs_path *path;
2587         struct btrfs_root *root = sctx->dev_root;
2588         struct btrfs_fs_info *fs_info = root->fs_info;
2589         u64 length;
2590         u64 chunk_tree;
2591         u64 chunk_objectid;
2592         u64 chunk_offset;
2593         int ret;
2594         int slot;
2595         struct extent_buffer *l;
2596         struct btrfs_key key;
2597         struct btrfs_key found_key;
2598         struct btrfs_block_group_cache *cache;
2599         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
2600
2601         path = btrfs_alloc_path();
2602         if (!path)
2603                 return -ENOMEM;
2604
2605         path->reada = 2;
2606         path->search_commit_root = 1;
2607         path->skip_locking = 1;
2608
2609         key.objectid = scrub_dev->devid;
2610         key.offset = 0ull;
2611         key.type = BTRFS_DEV_EXTENT_KEY;
2612
2613         while (1) {
2614                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2615                 if (ret < 0)
2616                         break;
2617                 if (ret > 0) {
2618                         if (path->slots[0] >=
2619                             btrfs_header_nritems(path->nodes[0])) {
2620                                 ret = btrfs_next_leaf(root, path);
2621                                 if (ret)
2622                                         break;
2623                         }
2624                 }
2625
2626                 l = path->nodes[0];
2627                 slot = path->slots[0];
2628
2629                 btrfs_item_key_to_cpu(l, &found_key, slot);
2630
2631                 if (found_key.objectid != scrub_dev->devid)
2632                         break;
2633
2634                 if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
2635                         break;
2636
2637                 if (found_key.offset >= end)
2638                         break;
2639
2640                 if (found_key.offset < key.offset)
2641                         break;
2642
2643                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
2644                 length = btrfs_dev_extent_length(l, dev_extent);
2645
2646                 if (found_key.offset + length <= start) {
2647                         key.offset = found_key.offset + length;
2648                         btrfs_release_path(path);
2649                         continue;
2650                 }
2651
2652                 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
2653                 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
2654                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
2655
2656                 /*
2657                  * get a reference on the corresponding block group to prevent
2658                  * the chunk from going away while we scrub it
2659                  */
2660                 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
2661                 if (!cache) {
2662                         ret = -ENOENT;
2663                         break;
2664                 }
2665                 dev_replace->cursor_right = found_key.offset + length;
2666                 dev_replace->cursor_left = found_key.offset;
2667                 dev_replace->item_needs_writeback = 1;
2668                 ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
2669                                   chunk_offset, length, found_key.offset,
2670                                   is_dev_replace);
2671
2672                 /*
2673                  * flush and submit all pending read and write bios,
2674                  * then wait for them to complete.
2675                  * Note that in the dev replace case, a read request causes
2676                  * write requests that are submitted in the read completion
2677                  * worker. Therefore all write requests must be flushed
2678                  * here as well, so that all read and write requests have
2679                  * really completed by the time bios_in_flight drops
2680                  * to 0.
2681                  */
2682                 atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
2683                 scrub_submit(sctx);
2684                 mutex_lock(&sctx->wr_ctx.wr_lock);
2685                 scrub_wr_submit(sctx);
2686                 mutex_unlock(&sctx->wr_ctx.wr_lock);
2687
2688                 wait_event(sctx->list_wait,
2689                            atomic_read(&sctx->bios_in_flight) == 0);
2690                 atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
2691                 wait_event(sctx->list_wait,
2692                            atomic_read(&sctx->workers_pending) == 0);
2693                 scrub_blocked_if_needed(fs_info);
2694
2695                 btrfs_put_block_group(cache);
2696                 if (ret)
2697                         break;
2698                 if (is_dev_replace &&
2699                     atomic64_read(&dev_replace->num_write_errors) > 0) {
2700                         ret = -EIO;
2701                         break;
2702                 }
2703                 if (sctx->stat.malloc_errors > 0) {
2704                         ret = -ENOMEM;
2705                         break;
2706                 }
2707
2708                 dev_replace->cursor_left = dev_replace->cursor_right;
2709                 dev_replace->item_needs_writeback = 1;
2710
2711                 key.offset = found_key.offset + length;
2712                 btrfs_release_path(path);
2713         }
2714
2715         btrfs_free_path(path);
2716
2717         /*
2718          * ret can still be 1 from btrfs_search_slot() or
2719          * btrfs_next_leaf(); that's not an error
2720          */
2721         return ret < 0 ? ret : 0;
2722 }
2723
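/*
 * Read and verify all super block copies of @scrub_dev that fit inside the
 * device, then wait until the submitted bios have completed.
 */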
2724 static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
2725                                            struct btrfs_device *scrub_dev)
2726 {
2727         int     i;
2728         u64     bytenr;
2729         u64     gen;
2730         int     ret;
2731         struct btrfs_root *root = sctx->dev_root;
2732
2733         if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
2734                 return -EIO;
2735
2736         gen = root->fs_info->last_trans_committed;
2737
2738         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
2739                 bytenr = btrfs_sb_offset(i);
2740                 if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes)
2741                         break;
2742
2743                 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
2744                                   scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
2745                                   NULL, 1, bytenr);
2746                 if (ret)
2747                         return ret;
2748         }
2749         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
2750
2751         return 0;
2752 }
2753
2754 /*
2755  * get a reference count on fs_info->scrub_workers. Start the workers if necessary.
2756  */
2757 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
2758                                                 int is_dev_replace)
2759 {
2760         int ret = 0;
2761
2762         if (fs_info->scrub_workers_refcnt == 0) {
2763                 if (is_dev_replace)
2764                         btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1,
2765                                         &fs_info->generic_worker);
2766                 else
2767                         btrfs_init_workers(&fs_info->scrub_workers, "scrub",
2768                                         fs_info->thread_pool_size,
2769                                         &fs_info->generic_worker);
2770                 fs_info->scrub_workers.idle_thresh = 4;
2771                 ret = btrfs_start_workers(&fs_info->scrub_workers);
2772                 if (ret)
2773                         goto out;
2774                 btrfs_init_workers(&fs_info->scrub_wr_completion_workers,
2775                                    "scrubwrc",
2776                                    fs_info->thread_pool_size,
2777                                    &fs_info->generic_worker);
2778                 fs_info->scrub_wr_completion_workers.idle_thresh = 2;
2779                 ret = btrfs_start_workers(
2780                                 &fs_info->scrub_wr_completion_workers);
2781                 if (ret)
2782                         goto out;
2783                 btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1,
2784                                    &fs_info->generic_worker);
2785                 ret = btrfs_start_workers(&fs_info->scrub_nocow_workers);
2786                 if (ret)
2787                         goto out;
2788         }
2789         ++fs_info->scrub_workers_refcnt;
2790 out:
2791         return ret;
2792 }
2793
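/* drop a reference on fs_info->scrub_workers and stop the workers at zero */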
2794 static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
2795 {
2796         if (--fs_info->scrub_workers_refcnt == 0) {
2797                 btrfs_stop_workers(&fs_info->scrub_workers);
2798                 btrfs_stop_workers(&fs_info->scrub_wr_completion_workers);
2799                 btrfs_stop_workers(&fs_info->scrub_nocow_workers);
2800         }
2801         WARN_ON(fs_info->scrub_workers_refcnt < 0);
2802 }
2803
2804 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
2805                     u64 end, struct btrfs_scrub_progress *progress,
2806                     int readonly, int is_dev_replace)
2807 {
2808         struct scrub_ctx *sctx;
2809         int ret;
2810         struct btrfs_device *dev;
2811
2812         if (btrfs_fs_closing(fs_info))
2813                 return -EINVAL;
2814
2815         /*
2816          * check some assumptions
2817          */
2818         if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) {
2819                 btrfs_err(fs_info,
2820                            "scrub: size assumption nodesize == leafsize (%d == %d) fails",
2821                        fs_info->chunk_root->nodesize,
2822                        fs_info->chunk_root->leafsize);
2823                 return -EINVAL;
2824         }
2825
2826         if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {
2827                 /*
2828                  * the way scrub is implemented, it cannot calculate the
2829                  * checksum for nodes larger than BTRFS_STRIPE_LEN. Do not
2830                  * handle this situation at all because it won't ever happen.
2831                  */
2832                 btrfs_err(fs_info,
2833                            "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
2834                        fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN);
2835                 return -EINVAL;
2836         }
2837
2838         if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {
2839                 /* not supported for data w/o checksums */
2840                 btrfs_err(fs_info,
2841                            "scrub: size assumption sectorsize != PAGE_SIZE "
2842                            "(%d != %lu) fails",
2843                        fs_info->chunk_root->sectorsize, PAGE_SIZE);
2844                 return -EINVAL;
2845         }
2846
2847         if (fs_info->chunk_root->nodesize >
2848             PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
2849             fs_info->chunk_root->sectorsize >
2850             PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
2851                 /*
2852                  * would exhaust the array bounds of pagev member in
2853                  * struct scrub_block
2854                  */
2855                 btrfs_err(fs_info, "scrub: size assumption nodesize and sectorsize "
2856                            "<= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
2857                        fs_info->chunk_root->nodesize,
2858                        SCRUB_MAX_PAGES_PER_BLOCK,
2859                        fs_info->chunk_root->sectorsize,
2860                        SCRUB_MAX_PAGES_PER_BLOCK);
2861                 return -EINVAL;
2862         }
2863
2864
2865         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2866         dev = btrfs_find_device(fs_info, devid, NULL, NULL);
2867         if (!dev || (dev->missing && !is_dev_replace)) {
2868                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2869                 return -ENODEV;
2870         }
2871
2872         mutex_lock(&fs_info->scrub_lock);
2873         if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
2874                 mutex_unlock(&fs_info->scrub_lock);
2875                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2876                 return -EIO;
2877         }
2878
2879         btrfs_dev_replace_lock(&fs_info->dev_replace);
2880         if (dev->scrub_device ||
2881             (!is_dev_replace &&
2882              btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
2883                 btrfs_dev_replace_unlock(&fs_info->dev_replace);
2884                 mutex_unlock(&fs_info->scrub_lock);
2885                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2886                 return -EINPROGRESS;
2887         }
2888         btrfs_dev_replace_unlock(&fs_info->dev_replace);
2889
2890         ret = scrub_workers_get(fs_info, is_dev_replace);
2891         if (ret) {
2892                 mutex_unlock(&fs_info->scrub_lock);
2893                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2894                 return ret;
2895         }
2896
2897         sctx = scrub_setup_ctx(dev, is_dev_replace);
2898         if (IS_ERR(sctx)) {
2899                 mutex_unlock(&fs_info->scrub_lock);
2900                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2901                 scrub_workers_put(fs_info);
2902                 return PTR_ERR(sctx);
2903         }
2904         sctx->readonly = readonly;
2905         dev->scrub_device = sctx;
2906         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2907
2908         /*
2909          * by checking @scrub_pause_req here, we can avoid a race
2910          * between transaction commit and scrubbing.
2911          */
2912         __scrub_blocked_if_needed(fs_info);
2913         atomic_inc(&fs_info->scrubs_running);
2914         mutex_unlock(&fs_info->scrub_lock);
2915
2916         if (!is_dev_replace) {
2917                 /*
2918                  * by holding the device list mutex, we cannot race with
2919                  * the super block writes done during a log tree sync.
2920                  */
2921                 mutex_lock(&fs_info->fs_devices->device_list_mutex);
2922                 ret = scrub_supers(sctx, dev);
2923                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2924         }
2925
2926         if (!ret)
2927                 ret = scrub_enumerate_chunks(sctx, dev, start, end,
2928                                              is_dev_replace);
2929
2930         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
2931         atomic_dec(&fs_info->scrubs_running);
2932         wake_up(&fs_info->scrub_pause_wait);
2933
2934         wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
2935
2936         if (progress)
2937                 memcpy(progress, &sctx->stat, sizeof(*progress));
2938
2939         mutex_lock(&fs_info->scrub_lock);
2940         dev->scrub_device = NULL;
2941         scrub_workers_put(fs_info);
2942         mutex_unlock(&fs_info->scrub_lock);
2943
2944         scrub_free_ctx(sctx);
2945
2946         return ret;
2947 }
2948
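/*
 * ask all running scrubs to pause and wait until every one of them has
 * reached the paused state
 */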
2949 void btrfs_scrub_pause(struct btrfs_root *root)
2950 {
2951         struct btrfs_fs_info *fs_info = root->fs_info;
2952
2953         mutex_lock(&fs_info->scrub_lock);
2954         atomic_inc(&fs_info->scrub_pause_req);
2955         while (atomic_read(&fs_info->scrubs_paused) !=
2956                atomic_read(&fs_info->scrubs_running)) {
2957                 mutex_unlock(&fs_info->scrub_lock);
2958                 wait_event(fs_info->scrub_pause_wait,
2959                            atomic_read(&fs_info->scrubs_paused) ==
2960                            atomic_read(&fs_info->scrubs_running));
2961                 mutex_lock(&fs_info->scrub_lock);
2962         }
2963         mutex_unlock(&fs_info->scrub_lock);
2964 }
2965
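/* let scrubs paused by btrfs_scrub_pause() continue */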
2966 void btrfs_scrub_continue(struct btrfs_root *root)
2967 {
2968         struct btrfs_fs_info *fs_info = root->fs_info;
2969
2970         atomic_dec(&fs_info->scrub_pause_req);
2971         wake_up(&fs_info->scrub_pause_wait);
2972 }
2973
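/* cancel all running scrubs and wait until they have stopped */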
2974 int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
2975 {
2976         mutex_lock(&fs_info->scrub_lock);
2977         if (!atomic_read(&fs_info->scrubs_running)) {
2978                 mutex_unlock(&fs_info->scrub_lock);
2979                 return -ENOTCONN;
2980         }
2981
2982         atomic_inc(&fs_info->scrub_cancel_req);
2983         while (atomic_read(&fs_info->scrubs_running)) {
2984                 mutex_unlock(&fs_info->scrub_lock);
2985                 wait_event(fs_info->scrub_pause_wait,
2986                            atomic_read(&fs_info->scrubs_running) == 0);
2987                 mutex_lock(&fs_info->scrub_lock);
2988         }
2989         atomic_dec(&fs_info->scrub_cancel_req);
2990         mutex_unlock(&fs_info->scrub_lock);
2991
2992         return 0;
2993 }
2994
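/* cancel the scrub running on @dev, if any, and wait until it has stopped */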
2995 int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
2996                            struct btrfs_device *dev)
2997 {
2998         struct scrub_ctx *sctx;
2999
3000         mutex_lock(&fs_info->scrub_lock);
3001         sctx = dev->scrub_device;
3002         if (!sctx) {
3003                 mutex_unlock(&fs_info->scrub_lock);
3004                 return -ENOTCONN;
3005         }
3006         atomic_inc(&sctx->cancel_req);
3007         while (dev->scrub_device) {
3008                 mutex_unlock(&fs_info->scrub_lock);
3009                 wait_event(fs_info->scrub_pause_wait,
3010                            dev->scrub_device == NULL);
3011                 mutex_lock(&fs_info->scrub_lock);
3012         }
3013         mutex_unlock(&fs_info->scrub_lock);
3014
3015         return 0;
3016 }
3017
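/*
 * copy the current scrub statistics for @devid into @progress; returns
 * -ENODEV if the device is not found and -ENOTCONN if no scrub is running
 * on it
 */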
3018 int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
3019                          struct btrfs_scrub_progress *progress)
3020 {
3021         struct btrfs_device *dev;
3022         struct scrub_ctx *sctx = NULL;
3023
3024         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
3025         dev = btrfs_find_device(root->fs_info, devid, NULL, NULL);
3026         if (dev)
3027                 sctx = dev->scrub_device;
3028         if (sctx)
3029                 memcpy(progress, &sctx->stat, sizeof(*progress));
3030         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
3031
3032         return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
3033 }
3034
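/*
 * Map @extent_logical to the physical offset, device and mirror number of
 * the first stripe returned by btrfs_map_block(). If the mapping fails or
 * covers less than @extent_len, the output parameters are left untouched.
 */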
3035 static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
3036                                u64 extent_logical, u64 extent_len,
3037                                u64 *extent_physical,
3038                                struct btrfs_device **extent_dev,
3039                                int *extent_mirror_num)
3040 {
3041         u64 mapped_length;
3042         struct btrfs_bio *bbio = NULL;
3043         int ret;
3044
3045         mapped_length = extent_len;
3046         ret = btrfs_map_block(fs_info, READ, extent_logical,
3047                               &mapped_length, &bbio, 0);
3048         if (ret || !bbio || mapped_length < extent_len ||
3049             !bbio->stripes[0].dev->bdev) {
3050                 kfree(bbio);
3051                 return;
3052         }
3053
3054         *extent_physical = bbio->stripes[0].physical;
3055         *extent_mirror_num = bbio->mirror_num;
3056         *extent_dev = bbio->stripes[0].dev;
3057         kfree(bbio);
3058 }
3059
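/*
 * set up the write context used to duplicate data onto the dev replace
 * target device; for a plain scrub there is no target and only the lock and
 * the current-bio pointer are initialized
 */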
3060 static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
3061                               struct scrub_wr_ctx *wr_ctx,
3062                               struct btrfs_fs_info *fs_info,
3063                               struct btrfs_device *dev,
3064                               int is_dev_replace)
3065 {
3066         WARN_ON(wr_ctx->wr_curr_bio != NULL);
3067
3068         mutex_init(&wr_ctx->wr_lock);
3069         wr_ctx->wr_curr_bio = NULL;
3070         if (!is_dev_replace)
3071                 return 0;
3072
3073         WARN_ON(!dev->bdev);
3074         wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO,
3075                                          bio_get_nr_vecs(dev->bdev));
3076         wr_ctx->tgtdev = dev;
3077         atomic_set(&wr_ctx->flush_all_writes, 0);
3078         return 0;
3079 }
3080
3081 static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx)
3082 {
3083         mutex_lock(&wr_ctx->wr_lock);
3084         kfree(wr_ctx->wr_curr_bio);
3085         wr_ctx->wr_curr_bio = NULL;
3086         mutex_unlock(&wr_ctx->wr_lock);
3087 }
3088
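/*
 * Queue a worker that looks up every inode referencing the nodatacow extent
 * at @logical and copies the covered pages to the dev replace target device.
 */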
3089 static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
3090                             int mirror_num, u64 physical_for_dev_replace)
3091 {
3092         struct scrub_copy_nocow_ctx *nocow_ctx;
3093         struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
3094
3095         nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
3096         if (!nocow_ctx) {
3097                 spin_lock(&sctx->stat_lock);
3098                 sctx->stat.malloc_errors++;
3099                 spin_unlock(&sctx->stat_lock);
3100                 return -ENOMEM;
3101         }
3102
3103         scrub_pending_trans_workers_inc(sctx);
3104
3105         nocow_ctx->sctx = sctx;
3106         nocow_ctx->logical = logical;
3107         nocow_ctx->len = len;
3108         nocow_ctx->mirror_num = mirror_num;
3109         nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
3110         nocow_ctx->work.func = copy_nocow_pages_worker;
3111         INIT_LIST_HEAD(&nocow_ctx->inodes);
3112         btrfs_queue_worker(&fs_info->scrub_nocow_workers,
3113                            &nocow_ctx->work);
3114
3115         return 0;
3116 }
3117
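/*
 * backref iteration callback: remember every (root, inode, offset) that
 * references the nocow extent on the nocow_ctx inode list
 */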
3118 static int record_inode_for_nocow(u64 inum, u64 offset, u64 root, void *ctx)
3119 {
3120         struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
3121         struct scrub_nocow_inode *nocow_inode;
3122
3123         nocow_inode = kzalloc(sizeof(*nocow_inode), GFP_NOFS);
3124         if (!nocow_inode)
3125                 return -ENOMEM;
3126         nocow_inode->inum = inum;
3127         nocow_inode->offset = offset;
3128         nocow_inode->root = root;
3129         list_add_tail(&nocow_inode->list, &nocow_ctx->inodes);
3130         return 0;
3131 }
3132
3133 #define COPY_COMPLETE 1
3134
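/*
 * Worker for copy_nocow_pages(): iterate over all inodes that reference the
 * extent and copy their pages to the target device. If nothing could be
 * written, the failure is accounted as an uncorrectable read error in the
 * dev replace statistics.
 */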
3135 static void copy_nocow_pages_worker(struct btrfs_work *work)
3136 {
3137         struct scrub_copy_nocow_ctx *nocow_ctx =
3138                 container_of(work, struct scrub_copy_nocow_ctx, work);
3139         struct scrub_ctx *sctx = nocow_ctx->sctx;
3140         u64 logical = nocow_ctx->logical;
3141         u64 len = nocow_ctx->len;
3142         int mirror_num = nocow_ctx->mirror_num;
3143         u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3144         int ret;
3145         struct btrfs_trans_handle *trans = NULL;
3146         struct btrfs_fs_info *fs_info;
3147         struct btrfs_path *path;
3148         struct btrfs_root *root;
3149         int not_written = 0;
3150
3151         fs_info = sctx->dev_root->fs_info;
3152         root = fs_info->extent_root;
3153
3154         path = btrfs_alloc_path();
3155         if (!path) {
3156                 spin_lock(&sctx->stat_lock);
3157                 sctx->stat.malloc_errors++;
3158                 spin_unlock(&sctx->stat_lock);
3159                 not_written = 1;
3160                 goto out;
3161         }
3162
3163         trans = btrfs_join_transaction(root);
3164         if (IS_ERR(trans)) {
3165                 not_written = 1;
3166                 goto out;
3167         }
3168
3169         ret = iterate_inodes_from_logical(logical, fs_info, path,
3170                                           record_inode_for_nocow, nocow_ctx);
3171         if (ret != 0 && ret != -ENOENT) {
3172                 btrfs_warn(fs_info, "iterate_inodes_from_logical() failed: log %llu, "
3173                         "phys %llu, len %llu, mir %u, ret %d",
3174                         logical, physical_for_dev_replace, len, mirror_num,
3175                         ret);
3176                 not_written = 1;
3177                 goto out;
3178         }
3179
3180         btrfs_end_transaction(trans, root);
3181         trans = NULL;
3182         while (!list_empty(&nocow_ctx->inodes)) {
3183                 struct scrub_nocow_inode *entry;
3184                 entry = list_first_entry(&nocow_ctx->inodes,
3185                                          struct scrub_nocow_inode,
3186                                          list);
3187                 list_del_init(&entry->list);
3188                 ret = copy_nocow_pages_for_inode(entry->inum, entry->offset,
3189                                                  entry->root, nocow_ctx);
3190                 kfree(entry);
3191                 if (ret == COPY_COMPLETE) {
3192                         ret = 0;
3193                         break;
3194                 } else if (ret) {
3195                         break;
3196                 }
3197         }
3198 out:
3199         while (!list_empty(&nocow_ctx->inodes)) {
3200                 struct scrub_nocow_inode *entry;
3201                 entry = list_first_entry(&nocow_ctx->inodes,
3202                                          struct scrub_nocow_inode,
3203                                          list);
3204                 list_del_init(&entry->list);
3205                 kfree(entry);
3206         }
3207         if (trans && !IS_ERR(trans))
3208                 btrfs_end_transaction(trans, root);
3209         if (not_written)
3210                 btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
3211                                             num_uncorrectable_read_errors);
3212
3213         btrfs_free_path(path);
3214         kfree(nocow_ctx);
3215
3216         scrub_pending_trans_workers_dec(sctx);
3217 }
3218
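/*
 * Copy one inode's pages covering the nocow extent to the target device.
 * The inode is locked against truncate/dio, the range is locked in the io
 * tree and re-checked against the original extent; pages are read in if
 * necessary and then written out page by page with write_page_nocow().
 */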
3219 static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
3220                                       struct scrub_copy_nocow_ctx *nocow_ctx)
3221 {
3222         struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
3223         struct btrfs_key key;
3224         struct inode *inode;
3225         struct page *page;
3226         struct btrfs_root *local_root;
3227         struct btrfs_ordered_extent *ordered;
3228         struct extent_map *em;
3229         struct extent_state *cached_state = NULL;
3230         struct extent_io_tree *io_tree;
3231         u64 physical_for_dev_replace;
3232         u64 len = nocow_ctx->len;
3233         u64 lockstart = offset, lockend = offset + len - 1;
3234         unsigned long index;
3235         int srcu_index;
3236         int ret = 0;
3237         int err = 0;
3238
3239         key.objectid = root;
3240         key.type = BTRFS_ROOT_ITEM_KEY;
3241         key.offset = (u64)-1;
3242
3243         srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
3244
3245         local_root = btrfs_read_fs_root_no_name(fs_info, &key);
3246         if (IS_ERR(local_root)) {
3247                 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
3248                 return PTR_ERR(local_root);
3249         }
3250
3251         key.type = BTRFS_INODE_ITEM_KEY;
3252         key.objectid = inum;
3253         key.offset = 0;
3254         inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
3255         srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
3256         if (IS_ERR(inode))
3257                 return PTR_ERR(inode);
3258
3259         /* Avoid truncate/dio/punch hole.. */
3260         mutex_lock(&inode->i_mutex);
3261         inode_dio_wait(inode);
3262
3263         physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3264         io_tree = &BTRFS_I(inode)->io_tree;
3265
3266         lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
3267         ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
3268         if (ordered) {
3269                 btrfs_put_ordered_extent(ordered);
3270                 goto out_unlock;
3271         }
3272
3273         em = btrfs_get_extent(inode, NULL, 0, lockstart, len, 0);
3274         if (IS_ERR(em)) {
3275                 ret = PTR_ERR(em);
3276                 goto out_unlock;
3277         }
3278
3279         /*
3280          * This extent does not actually cover the logical extent anymore;
3281          * move on to the next inode.
3282          */
3283         if (em->block_start > nocow_ctx->logical ||
3284             em->block_start + em->block_len < nocow_ctx->logical + len) {
3285                 free_extent_map(em);
3286                 goto out_unlock;
3287         }
3288         free_extent_map(em);
3289
3290         while (len >= PAGE_CACHE_SIZE) {
3291                 index = offset >> PAGE_CACHE_SHIFT;
3292 again:
3293                 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
3294                 if (!page) {
3295                         btrfs_err(fs_info, "find_or_create_page() failed");
3296                         ret = -ENOMEM;
3297                         goto out;
3298                 }
3299
3300                 if (PageUptodate(page)) {
3301                         if (PageDirty(page))
3302                                 goto next_page;
3303                 } else {
3304                         ClearPageError(page);
3305                         err = extent_read_full_page_nolock(io_tree, page,
3306                                                            btrfs_get_extent,
3307                                                            nocow_ctx->mirror_num);
3308                         if (err) {
3309                                 ret = err;
3310                                 goto next_page;
3311                         }
3312
3313                         lock_page(page);
3314                         /*
3315                          * If the page has been removed from the page cache,
3316                          * the data on it is meaningless, because it may be
3317                          * stale; the new data may have been written into a
3318                          * new page in the page cache.
3319                          */
3320                         if (page->mapping != inode->i_mapping) {
3321                                 unlock_page(page);
3322                                 page_cache_release(page);
3323                                 goto again;
3324                         }
3325                         if (!PageUptodate(page)) {
3326                                 ret = -EIO;
3327                                 goto next_page;
3328                         }
3329                 }
3330                 err = write_page_nocow(nocow_ctx->sctx,
3331                                        physical_for_dev_replace, page);
3332                 if (err)
3333                         ret = err;
3334 next_page:
3335                 unlock_page(page);
3336                 page_cache_release(page);
3337
3338                 if (ret)
3339                         break;
3340
3341                 offset += PAGE_CACHE_SIZE;
3342                 physical_for_dev_replace += PAGE_CACHE_SIZE;
3343                 len -= PAGE_CACHE_SIZE;
3344         }
3345         ret = COPY_COMPLETE;
3346 out_unlock:
3347         unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
3348                              GFP_NOFS);
3349 out:
3350         mutex_unlock(&inode->i_mutex);
3351         iput(inode);
3352         return ret;
3353 }
3354
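/*
 * synchronously write a single page to @physical_for_dev_replace on the
 * dev replace target device
 */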
3355 static int write_page_nocow(struct scrub_ctx *sctx,
3356                             u64 physical_for_dev_replace, struct page *page)
3357 {
3358         struct bio *bio;
3359         struct btrfs_device *dev;
3360         int ret;
3361
3362         dev = sctx->wr_ctx.tgtdev;
3363         if (!dev)
3364                 return -EIO;
3365         if (!dev->bdev) {
3366                 printk_ratelimited(KERN_WARNING
3367                         "BTRFS: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
3368                 return -EIO;
3369         }
3370         bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
3371         if (!bio) {
3372                 spin_lock(&sctx->stat_lock);
3373                 sctx->stat.malloc_errors++;
3374                 spin_unlock(&sctx->stat_lock);
3375                 return -ENOMEM;
3376         }
3377         bio->bi_iter.bi_size = 0;
3378         bio->bi_iter.bi_sector = physical_for_dev_replace >> 9;
3379         bio->bi_bdev = dev->bdev;
3380         ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
3381         if (ret != PAGE_CACHE_SIZE) {
3382 leave_with_eio:
3383                 bio_put(bio);
3384                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
3385                 return -EIO;
3386         }
3387
3388         if (btrfsic_submit_bio_wait(WRITE_SYNC, bio))
3389                 goto leave_with_eio;
3390
3391         bio_put(bio);
3392         return 0;
3393 }