1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
4  */
5
6 #include <linux/blkdev.h>
7 #include <linux/ratelimit.h>
8 #include <linux/sched/mm.h>
9 #include <crypto/hash.h>
10 #include "ctree.h"
11 #include "discard.h"
12 #include "volumes.h"
13 #include "disk-io.h"
14 #include "ordered-data.h"
15 #include "transaction.h"
16 #include "backref.h"
17 #include "extent_io.h"
18 #include "dev-replace.h"
19 #include "check-integrity.h"
20 #include "rcu-string.h"
21 #include "raid56.h"
22 #include "block-group.h"
23 #include "zoned.h"
24
25 /*
26  * This is only the first step towards a full-featured scrub. It reads all
27  * extents and super blocks and verifies the checksums. In case a bad checksum
28  * is found or the extent cannot be read, good data will be written back if
29  * any can be found.
30  *
31  * Future enhancements:
32  *  - In case an unrepairable extent is encountered, track which files are
33  *    affected and report them
34  *  - track and record media errors, throw out bad devices
35  *  - add a mode to also read unallocated space
36  */
37
38 struct scrub_block;
39 struct scrub_ctx;
40
41 /*
42  * The following three values only influence performance.
43  * The last one configures the number of parallel and outstanding I/O
44  * operations. The first two values configure an upper limit for the number
45  * of (dynamically allocated) pages that are added to a bio.
46  */
47 #define SCRUB_PAGES_PER_RD_BIO  32      /* 128k per bio */
48 #define SCRUB_PAGES_PER_WR_BIO  32      /* 128k per bio */
49 #define SCRUB_BIOS_PER_SCTX     64      /* 8MB per device in flight */
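
/*
 * A quick sanity check of the sizes quoted in the comments above, assuming
 * 4KiB pages (PAGE_SIZE may differ on some architectures):
 *
 *	SCRUB_PAGES_PER_RD_BIO * PAGE_SIZE = 32 * 4KiB   = 128KiB per read bio
 *	SCRUB_PAGES_PER_WR_BIO * PAGE_SIZE = 32 * 4KiB   = 128KiB per write bio
 *	SCRUB_BIOS_PER_SCTX * 128KiB       = 64 * 128KiB = 8MiB in flight
 */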
50
51 /*
52  * the following value times PAGE_SIZE needs to be large enough to match the
53  * largest node/leaf/sector size that shall be supported.
54  * Values larger than BTRFS_STRIPE_LEN are not supported.
55  */
56 #define SCRUB_MAX_PAGES_PER_BLOCK       16      /* 64k per node/leaf/sector */
57
58 struct scrub_recover {
59         refcount_t              refs;
60         struct btrfs_bio        *bbio;
61         u64                     map_length;
62 };
63
64 struct scrub_page {
65         struct scrub_block      *sblock;
66         struct page             *page;
67         struct btrfs_device     *dev;
68         struct list_head        list;
69         u64                     flags;  /* extent flags */
70         u64                     generation;
71         u64                     logical;
72         u64                     physical;
73         u64                     physical_for_dev_replace;
74         atomic_t                refs;
75         u8                      mirror_num;
76         int                     have_csum:1;
77         int                     io_error:1;
78         u8                      csum[BTRFS_CSUM_SIZE];
79
80         struct scrub_recover    *recover;
81 };
82
83 struct scrub_bio {
84         int                     index;
85         struct scrub_ctx        *sctx;
86         struct btrfs_device     *dev;
87         struct bio              *bio;
88         blk_status_t            status;
89         u64                     logical;
90         u64                     physical;
91 #if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
92         struct scrub_page       *pagev[SCRUB_PAGES_PER_WR_BIO];
93 #else
94         struct scrub_page       *pagev[SCRUB_PAGES_PER_RD_BIO];
95 #endif
96         int                     page_count;
97         int                     next_free;
98         struct btrfs_work       work;
99 };
100
101 struct scrub_block {
102         struct scrub_page       *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
103         int                     page_count;
104         atomic_t                outstanding_pages;
105         refcount_t              refs; /* free mem on transition to zero */
106         struct scrub_ctx        *sctx;
107         struct scrub_parity     *sparity;
108         struct {
109                 unsigned int    header_error:1;
110                 unsigned int    checksum_error:1;
111                 unsigned int    no_io_error_seen:1;
112                 unsigned int    generation_error:1; /* also sets header_error */
113
114                 /* The following is for the data used to check parity */
115                 /* It is for the data that has a checksum */
116                 unsigned int    data_corrected:1;
117         };
118         struct btrfs_work       work;
119 };
120
121 /* Used for the chunks with parity stripes, such as RAID5/6 */
122 struct scrub_parity {
123         struct scrub_ctx        *sctx;
124
125         struct btrfs_device     *scrub_dev;
126
127         u64                     logic_start;
128
129         u64                     logic_end;
130
131         int                     nsectors;
132
133         u32                     stripe_len;
134
135         refcount_t              refs;
136
137         struct list_head        spages;
138
139         /* Work of parity check and repair */
140         struct btrfs_work       work;
141
142         /* Mark the parity blocks which have data */
143         unsigned long           *dbitmap;
144
145         /*
146          * Mark the parity blocks which have data but where errors happened
147          * when reading or checking the data
148          */
149         unsigned long           *ebitmap;
150
151         unsigned long           bitmap[];
152 };
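
/*
 * Note: dbitmap and ebitmap above both point into the trailing bitmap[]
 * flexible array; the allocation reserves room for both (see
 * scrub_parity_alloc() further down in this file).
 */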
153
154 struct scrub_ctx {
155         struct scrub_bio        *bios[SCRUB_BIOS_PER_SCTX];
156         struct btrfs_fs_info    *fs_info;
157         int                     first_free;
158         int                     curr;
159         atomic_t                bios_in_flight;
160         atomic_t                workers_pending;
161         spinlock_t              list_lock;
162         wait_queue_head_t       list_wait;
163         struct list_head        csum_list;
164         atomic_t                cancel_req;
165         int                     readonly;
166         int                     pages_per_rd_bio;
167
168         /* State of IO submission throttling affecting the associated device */
169         ktime_t                 throttle_deadline;
170         u64                     throttle_sent;
171
172         int                     is_dev_replace;
173         u64                     write_pointer;
174
175         struct scrub_bio        *wr_curr_bio;
176         struct mutex            wr_lock;
177         int                     pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
178         struct btrfs_device     *wr_tgtdev;
179         bool                    flush_all_writes;
180
181         /*
182          * statistics
183          */
184         struct btrfs_scrub_progress stat;
185         spinlock_t              stat_lock;
186
187         /*
188          * Use a ref counter to avoid use-after-free issues. Scrub workers
189          * decrement bios_in_flight and workers_pending and then do a wakeup
190          * on the list_wait wait queue. We must ensure the main scrub task
191          * doesn't free the scrub context before or while the workers are
192          * doing the wakeup() call.
193          */
194         refcount_t              refs;
195 };
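
/*
 * A minimal sketch of the refcounting scheme described above (illustrative
 * only): a submitter pairs scrub_pending_bio_inc()/scrub_pending_bio_dec()
 * around every in-flight bio, and the main scrub task waits for completion
 * with something like
 *
 *	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
 *
 * before finally dropping its own reference with scrub_put_ctx().
 */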
196
197 struct scrub_warning {
198         struct btrfs_path       *path;
199         u64                     extent_item_size;
200         const char              *errstr;
201         u64                     physical;
202         u64                     logical;
203         struct btrfs_device     *dev;
204 };
205
206 struct full_stripe_lock {
207         struct rb_node node;
208         u64 logical;
209         u64 refs;
210         struct mutex mutex;
211 };
212
213 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
214                                      struct scrub_block *sblocks_for_recheck);
215 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
216                                 struct scrub_block *sblock,
217                                 int retry_failed_mirror);
218 static void scrub_recheck_block_checksum(struct scrub_block *sblock);
219 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
220                                              struct scrub_block *sblock_good);
221 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
222                                             struct scrub_block *sblock_good,
223                                             int page_num, int force_write);
224 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
225 static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
226                                            int page_num);
227 static int scrub_checksum_data(struct scrub_block *sblock);
228 static int scrub_checksum_tree_block(struct scrub_block *sblock);
229 static int scrub_checksum_super(struct scrub_block *sblock);
230 static void scrub_block_put(struct scrub_block *sblock);
231 static void scrub_page_get(struct scrub_page *spage);
232 static void scrub_page_put(struct scrub_page *spage);
233 static void scrub_parity_get(struct scrub_parity *sparity);
234 static void scrub_parity_put(struct scrub_parity *sparity);
235 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len,
236                        u64 physical, struct btrfs_device *dev, u64 flags,
237                        u64 gen, int mirror_num, u8 *csum,
238                        u64 physical_for_dev_replace);
239 static void scrub_bio_end_io(struct bio *bio);
240 static void scrub_bio_end_io_worker(struct btrfs_work *work);
241 static void scrub_block_complete(struct scrub_block *sblock);
242 static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
243                                u64 extent_logical, u32 extent_len,
244                                u64 *extent_physical,
245                                struct btrfs_device **extent_dev,
246                                int *extent_mirror_num);
247 static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
248                                     struct scrub_page *spage);
249 static void scrub_wr_submit(struct scrub_ctx *sctx);
250 static void scrub_wr_bio_end_io(struct bio *bio);
251 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
252 static void scrub_put_ctx(struct scrub_ctx *sctx);
253
254 static inline int scrub_is_page_on_raid56(struct scrub_page *spage)
255 {
256         return spage->recover &&
257                (spage->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
258 }
259
260 static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
261 {
262         refcount_inc(&sctx->refs);
263         atomic_inc(&sctx->bios_in_flight);
264 }
265
266 static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
267 {
268         atomic_dec(&sctx->bios_in_flight);
269         wake_up(&sctx->list_wait);
270         scrub_put_ctx(sctx);
271 }
272
273 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
274 {
275         while (atomic_read(&fs_info->scrub_pause_req)) {
276                 mutex_unlock(&fs_info->scrub_lock);
277                 wait_event(fs_info->scrub_pause_wait,
278                    atomic_read(&fs_info->scrub_pause_req) == 0);
279                 mutex_lock(&fs_info->scrub_lock);
280         }
281 }
282
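/*
 * scrub_pause_on()/scrub_pause_off() bracket a region in which this scrub
 * may be parked by somebody else (e.g. a transaction commit calling
 * btrfs_scrub_pause()): the "on" side announces that this scrub is paused
 * and wakes any waiter, the "off" side blocks in __scrub_blocked_if_needed()
 * until the pause request is gone.
 */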
283 static void scrub_pause_on(struct btrfs_fs_info *fs_info)
284 {
285         atomic_inc(&fs_info->scrubs_paused);
286         wake_up(&fs_info->scrub_pause_wait);
287 }
288
289 static void scrub_pause_off(struct btrfs_fs_info *fs_info)
290 {
291         mutex_lock(&fs_info->scrub_lock);
292         __scrub_blocked_if_needed(fs_info);
293         atomic_dec(&fs_info->scrubs_paused);
294         mutex_unlock(&fs_info->scrub_lock);
295
296         wake_up(&fs_info->scrub_pause_wait);
297 }
298
299 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
300 {
301         scrub_pause_on(fs_info);
302         scrub_pause_off(fs_info);
303 }
304
305 /*
306  * Insert a new full stripe lock into the full stripe locks tree
307  *
308  * Return a pointer to the existing or newly inserted full_stripe_lock
309  * structure if everything works well.
310  * Return ERR_PTR(-ENOMEM) if we failed to allocate memory.
311  *
312  * NOTE: the caller must hold full_stripe_locks_root->lock before calling
313  * this function.
314  */
315 static struct full_stripe_lock *insert_full_stripe_lock(
316                 struct btrfs_full_stripe_locks_tree *locks_root,
317                 u64 fstripe_logical)
318 {
319         struct rb_node **p;
320         struct rb_node *parent = NULL;
321         struct full_stripe_lock *entry;
322         struct full_stripe_lock *ret;
323
324         lockdep_assert_held(&locks_root->lock);
325
326         p = &locks_root->root.rb_node;
327         while (*p) {
328                 parent = *p;
329                 entry = rb_entry(parent, struct full_stripe_lock, node);
330                 if (fstripe_logical < entry->logical) {
331                         p = &(*p)->rb_left;
332                 } else if (fstripe_logical > entry->logical) {
333                         p = &(*p)->rb_right;
334                 } else {
335                         entry->refs++;
336                         return entry;
337                 }
338         }
339
340         /*
341          * Insert new lock.
342          */
343         ret = kmalloc(sizeof(*ret), GFP_KERNEL);
344         if (!ret)
345                 return ERR_PTR(-ENOMEM);
346         ret->logical = fstripe_logical;
347         ret->refs = 1;
348         mutex_init(&ret->mutex);
349
350         rb_link_node(&ret->node, parent, p);
351         rb_insert_color(&ret->node, &locks_root->root);
352         return ret;
353 }
354
355 /*
356  * Search for a full stripe lock of a block group
357  *
358  * Return pointer to existing full stripe lock if found
359  * Return NULL if not found
360  */
361 static struct full_stripe_lock *search_full_stripe_lock(
362                 struct btrfs_full_stripe_locks_tree *locks_root,
363                 u64 fstripe_logical)
364 {
365         struct rb_node *node;
366         struct full_stripe_lock *entry;
367
368         lockdep_assert_held(&locks_root->lock);
369
370         node = locks_root->root.rb_node;
371         while (node) {
372                 entry = rb_entry(node, struct full_stripe_lock, node);
373                 if (fstripe_logical < entry->logical)
374                         node = node->rb_left;
375                 else if (fstripe_logical > entry->logical)
376                         node = node->rb_right;
377                 else
378                         return entry;
379         }
380         return NULL;
381 }
382
383 /*
384  * Helper to get full stripe logical from a normal bytenr.
385  *
386  * Caller must ensure @cache is a RAID56 block group.
387  */
388 static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr)
389 {
390         u64 ret;
391
392         /*
393          * Due to chunk item size limit, full stripe length should not be
394          * larger than U32_MAX. Just a sanity check here.
395          */
396         WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
397
398         /*
399          * round_down() can only handle powers of 2, while a RAID56 full
400          * stripe length can be 64KiB * n, so we need to round down manually.
401          */
402         ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) *
403                         cache->full_stripe_len + cache->start;
404         return ret;
405 }
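
/*
 * Illustrative example (made-up numbers): with cache->start == 1MiB and a
 * full_stripe_len of 192KiB (e.g. 3 data stripes of 64KiB, not a power of
 * two, which is why round_down() cannot be used), a bytenr of 1MiB + 500KiB
 * yields div64_u64(500KiB, 192KiB) == 2, so the returned full stripe start
 * is 1MiB + 2 * 192KiB == 1MiB + 384KiB.
 */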
406
407 /*
408  * Lock a full stripe to avoid concurrency of recovery and read
409  *
410  * It's only used for profiles with parity (RAID5/6); for other profiles it
411  * does nothing.
412  *
413  * Return 0 if we locked the full stripe covering @bytenr, with its mutex held,
414  * so the caller must call unlock_full_stripe() in the same context.
415  *
416  * Return <0 if an error is encountered.
417  */
418 static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
419                             bool *locked_ret)
420 {
421         struct btrfs_block_group *bg_cache;
422         struct btrfs_full_stripe_locks_tree *locks_root;
423         struct full_stripe_lock *existing;
424         u64 fstripe_start;
425         int ret = 0;
426
427         *locked_ret = false;
428         bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
429         if (!bg_cache) {
430                 ASSERT(0);
431                 return -ENOENT;
432         }
433
434         /* Profiles not based on parity don't need full stripe lock */
435         if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
436                 goto out;
437         locks_root = &bg_cache->full_stripe_locks_root;
438
439         fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
440
441         /* Now insert the full stripe lock */
442         mutex_lock(&locks_root->lock);
443         existing = insert_full_stripe_lock(locks_root, fstripe_start);
444         mutex_unlock(&locks_root->lock);
445         if (IS_ERR(existing)) {
446                 ret = PTR_ERR(existing);
447                 goto out;
448         }
449         mutex_lock(&existing->mutex);
450         *locked_ret = true;
451 out:
452         btrfs_put_block_group(bg_cache);
453         return ret;
454 }
455
456 /*
457  * Unlock a full stripe.
458  *
459  * NOTE: The caller must ensure it is the same context that called the
460  * corresponding lock_full_stripe().
461  *
462  * Return 0 if we unlocked the full stripe without problem.
463  * Return <0 for error.
464  */
465 static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
466                               bool locked)
467 {
468         struct btrfs_block_group *bg_cache;
469         struct btrfs_full_stripe_locks_tree *locks_root;
470         struct full_stripe_lock *fstripe_lock;
471         u64 fstripe_start;
472         bool freeit = false;
473         int ret = 0;
474
475         /* If we didn't acquire full stripe lock, no need to continue */
476         if (!locked)
477                 return 0;
478
479         bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
480         if (!bg_cache) {
481                 ASSERT(0);
482                 return -ENOENT;
483         }
484         if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
485                 goto out;
486
487         locks_root = &bg_cache->full_stripe_locks_root;
488         fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
489
490         mutex_lock(&locks_root->lock);
491         fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
492         /* Unpaired unlock_full_stripe() detected */
493         if (!fstripe_lock) {
494                 WARN_ON(1);
495                 ret = -ENOENT;
496                 mutex_unlock(&locks_root->lock);
497                 goto out;
498         }
499
500         if (fstripe_lock->refs == 0) {
501                 WARN_ON(1);
502                 btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
503                         fstripe_lock->logical);
504         } else {
505                 fstripe_lock->refs--;
506         }
507
508         if (fstripe_lock->refs == 0) {
509                 rb_erase(&fstripe_lock->node, &locks_root->root);
510                 freeit = true;
511         }
512         mutex_unlock(&locks_root->lock);
513
514         mutex_unlock(&fstripe_lock->mutex);
515         if (freeit)
516                 kfree(fstripe_lock);
517 out:
518         btrfs_put_block_group(bg_cache);
519         return ret;
520 }
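
/*
 * Typical usage of the lock/unlock pair above, as done by
 * scrub_handle_errored_block() below (sketch only, error handling trimmed):
 *
 *	bool full_stripe_locked = false;
 *
 *	ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
 *	if (ret < 0)
 *		return ret;
 *	... recheck and repair the blocks covered by the full stripe ...
 *	ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
 */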
521
522 static void scrub_free_csums(struct scrub_ctx *sctx)
523 {
524         while (!list_empty(&sctx->csum_list)) {
525                 struct btrfs_ordered_sum *sum;
526                 sum = list_first_entry(&sctx->csum_list,
527                                        struct btrfs_ordered_sum, list);
528                 list_del(&sum->list);
529                 kfree(sum);
530         }
531 }
532
533 static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
534 {
535         int i;
536
537         if (!sctx)
538                 return;
539
540         /* this can happen when scrub is cancelled */
541         if (sctx->curr != -1) {
542                 struct scrub_bio *sbio = sctx->bios[sctx->curr];
543
544                 for (i = 0; i < sbio->page_count; i++) {
545                         WARN_ON(!sbio->pagev[i]->page);
546                         scrub_block_put(sbio->pagev[i]->sblock);
547                 }
548                 bio_put(sbio->bio);
549         }
550
551         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
552                 struct scrub_bio *sbio = sctx->bios[i];
553
554                 if (!sbio)
555                         break;
556                 kfree(sbio);
557         }
558
559         kfree(sctx->wr_curr_bio);
560         scrub_free_csums(sctx);
561         kfree(sctx);
562 }
563
564 static void scrub_put_ctx(struct scrub_ctx *sctx)
565 {
566         if (refcount_dec_and_test(&sctx->refs))
567                 scrub_free_ctx(sctx);
568 }
569
570 static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
571                 struct btrfs_fs_info *fs_info, int is_dev_replace)
572 {
573         struct scrub_ctx *sctx;
574         int             i;
575
576         sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
577         if (!sctx)
578                 goto nomem;
579         refcount_set(&sctx->refs, 1);
580         sctx->is_dev_replace = is_dev_replace;
581         sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
582         sctx->curr = -1;
583         sctx->fs_info = fs_info;
584         INIT_LIST_HEAD(&sctx->csum_list);
585         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
586                 struct scrub_bio *sbio;
587
588                 sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
589                 if (!sbio)
590                         goto nomem;
591                 sctx->bios[i] = sbio;
592
593                 sbio->index = i;
594                 sbio->sctx = sctx;
595                 sbio->page_count = 0;
596                 btrfs_init_work(&sbio->work, scrub_bio_end_io_worker, NULL,
597                                 NULL);
598
599                 if (i != SCRUB_BIOS_PER_SCTX - 1)
600                         sctx->bios[i]->next_free = i + 1;
601                 else
602                         sctx->bios[i]->next_free = -1;
603         }
604         sctx->first_free = 0;
605         atomic_set(&sctx->bios_in_flight, 0);
606         atomic_set(&sctx->workers_pending, 0);
607         atomic_set(&sctx->cancel_req, 0);
608
609         spin_lock_init(&sctx->list_lock);
610         spin_lock_init(&sctx->stat_lock);
611         init_waitqueue_head(&sctx->list_wait);
612         sctx->throttle_deadline = 0;
613
614         WARN_ON(sctx->wr_curr_bio != NULL);
615         mutex_init(&sctx->wr_lock);
616         sctx->wr_curr_bio = NULL;
617         if (is_dev_replace) {
618                 WARN_ON(!fs_info->dev_replace.tgtdev);
619                 sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
620                 sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
621                 sctx->flush_all_writes = false;
622         }
623
624         return sctx;
625
626 nomem:
627         scrub_free_ctx(sctx);
628         return ERR_PTR(-ENOMEM);
629 }
630
631 static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
632                                      void *warn_ctx)
633 {
634         u32 nlink;
635         int ret;
636         int i;
637         unsigned nofs_flag;
638         struct extent_buffer *eb;
639         struct btrfs_inode_item *inode_item;
640         struct scrub_warning *swarn = warn_ctx;
641         struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
642         struct inode_fs_paths *ipath = NULL;
643         struct btrfs_root *local_root;
644         struct btrfs_key key;
645
646         local_root = btrfs_get_fs_root(fs_info, root, true);
647         if (IS_ERR(local_root)) {
648                 ret = PTR_ERR(local_root);
649                 goto err;
650         }
651
652         /*
653          * this makes the path point to (inum INODE_ITEM ioff)
654          */
655         key.objectid = inum;
656         key.type = BTRFS_INODE_ITEM_KEY;
657         key.offset = 0;
658
659         ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
660         if (ret) {
661                 btrfs_put_root(local_root);
662                 btrfs_release_path(swarn->path);
663                 goto err;
664         }
665
666         eb = swarn->path->nodes[0];
667         inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
668                                         struct btrfs_inode_item);
669         nlink = btrfs_inode_nlink(eb, inode_item);
670         btrfs_release_path(swarn->path);
671
672         /*
673          * init_ipath() might indirectly call vmalloc, or use GFP_KERNEL. Scrub
674          * uses GFP_NOFS in this context, so we keep it consistent but it does
675          * not seem to be strictly necessary.
676          */
677         nofs_flag = memalloc_nofs_save();
678         ipath = init_ipath(4096, local_root, swarn->path);
679         memalloc_nofs_restore(nofs_flag);
680         if (IS_ERR(ipath)) {
681                 btrfs_put_root(local_root);
682                 ret = PTR_ERR(ipath);
683                 ipath = NULL;
684                 goto err;
685         }
686         ret = paths_from_inode(inum, ipath);
687
688         if (ret < 0)
689                 goto err;
690
691         /*
692          * We deliberately ignore the fact that ipath might have been too small
693          * to hold all of the paths here
694          */
695         for (i = 0; i < ipath->fspath->elem_cnt; ++i)
696                 btrfs_warn_in_rcu(fs_info,
697 "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)",
698                                   swarn->errstr, swarn->logical,
699                                   rcu_str_deref(swarn->dev->name),
700                                   swarn->physical,
701                                   root, inum, offset,
702                                   fs_info->sectorsize, nlink,
703                                   (char *)(unsigned long)ipath->fspath->val[i]);
704
705         btrfs_put_root(local_root);
706         free_ipath(ipath);
707         return 0;
708
709 err:
710         btrfs_warn_in_rcu(fs_info,
711                           "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
712                           swarn->errstr, swarn->logical,
713                           rcu_str_deref(swarn->dev->name),
714                           swarn->physical,
715                           root, inum, offset, ret);
716
717         free_ipath(ipath);
718         return 0;
719 }
720
721 static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
722 {
723         struct btrfs_device *dev;
724         struct btrfs_fs_info *fs_info;
725         struct btrfs_path *path;
726         struct btrfs_key found_key;
727         struct extent_buffer *eb;
728         struct btrfs_extent_item *ei;
729         struct scrub_warning swarn;
730         unsigned long ptr = 0;
731         u64 extent_item_pos;
732         u64 flags = 0;
733         u64 ref_root;
734         u32 item_size;
735         u8 ref_level = 0;
736         int ret;
737
738         WARN_ON(sblock->page_count < 1);
739         dev = sblock->pagev[0]->dev;
740         fs_info = sblock->sctx->fs_info;
741
742         path = btrfs_alloc_path();
743         if (!path)
744                 return;
745
746         swarn.physical = sblock->pagev[0]->physical;
747         swarn.logical = sblock->pagev[0]->logical;
748         swarn.errstr = errstr;
749         swarn.dev = NULL;
750
751         ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
752                                   &flags);
753         if (ret < 0)
754                 goto out;
755
756         extent_item_pos = swarn.logical - found_key.objectid;
757         swarn.extent_item_size = found_key.offset;
758
759         eb = path->nodes[0];
760         ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
761         item_size = btrfs_item_size_nr(eb, path->slots[0]);
762
763         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
764                 do {
765                         ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
766                                                       item_size, &ref_root,
767                                                       &ref_level);
768                         btrfs_warn_in_rcu(fs_info,
769 "%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
770                                 errstr, swarn.logical,
771                                 rcu_str_deref(dev->name),
772                                 swarn.physical,
773                                 ref_level ? "node" : "leaf",
774                                 ret < 0 ? -1 : ref_level,
775                                 ret < 0 ? -1 : ref_root);
776                 } while (ret != 1);
777                 btrfs_release_path(path);
778         } else {
779                 btrfs_release_path(path);
780                 swarn.path = path;
781                 swarn.dev = dev;
782                 iterate_extent_inodes(fs_info, found_key.objectid,
783                                         extent_item_pos, 1,
784                                         scrub_print_warning_inode, &swarn, false);
785         }
786
787 out:
788         btrfs_free_path(path);
789 }
790
791 static inline void scrub_get_recover(struct scrub_recover *recover)
792 {
793         refcount_inc(&recover->refs);
794 }
795
796 static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
797                                      struct scrub_recover *recover)
798 {
799         if (refcount_dec_and_test(&recover->refs)) {
800                 btrfs_bio_counter_dec(fs_info);
801                 btrfs_put_bbio(recover->bbio);
802                 kfree(recover);
803         }
804 }
805
806 /*
807  * scrub_handle_errored_block gets called when either verification of the
808  * pages failed or the bio failed to read, e.g. with EIO. In the latter
809  * case, this function handles all pages in the bio, even though only one
810  * may be bad.
811  * The goal of this function is to repair the errored block by using the
812  * contents of one of the mirrors.
813  */
814 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
815 {
816         struct scrub_ctx *sctx = sblock_to_check->sctx;
817         struct btrfs_device *dev;
818         struct btrfs_fs_info *fs_info;
819         u64 logical;
820         unsigned int failed_mirror_index;
821         unsigned int is_metadata;
822         unsigned int have_csum;
823         struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
824         struct scrub_block *sblock_bad;
825         int ret;
826         int mirror_index;
827         int page_num;
828         int success;
829         bool full_stripe_locked;
830         unsigned int nofs_flag;
831         static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
832                                       DEFAULT_RATELIMIT_BURST);
833
834         BUG_ON(sblock_to_check->page_count < 1);
835         fs_info = sctx->fs_info;
836         if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
837                 /*
838                  * If we find an error in a super block, we just report it.
839                  * Super blocks get rewritten with the next transaction commit
840                  * anyway.
841                  */
842                 spin_lock(&sctx->stat_lock);
843                 ++sctx->stat.super_errors;
844                 spin_unlock(&sctx->stat_lock);
845                 return 0;
846         }
847         logical = sblock_to_check->pagev[0]->logical;
848         BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
849         failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
850         is_metadata = !(sblock_to_check->pagev[0]->flags &
851                         BTRFS_EXTENT_FLAG_DATA);
852         have_csum = sblock_to_check->pagev[0]->have_csum;
853         dev = sblock_to_check->pagev[0]->dev;
854
855         if (btrfs_is_zoned(fs_info) && !sctx->is_dev_replace)
856                 return btrfs_repair_one_zone(fs_info, logical);
857
858         /*
859          * We must use GFP_NOFS because the scrub task might be waiting for a
860          * worker task executing this function and in turn a transaction commit
861          * might be waiting for the scrub task to pause (which needs to wait for all
862          * the worker tasks to complete before pausing).
863          * We do allocations in the workers through insert_full_stripe_lock()
864          * and scrub_add_page_to_wr_bio(), which happens down the call chain of
865          * this function.
866          */
867         nofs_flag = memalloc_nofs_save();
868         /*
869          * For RAID5/6, a race can happen with a scrub thread of a different
870          * device. On data corruption, the parity and data scrub threads will
871          * both try to recover the data.
872          * The race can lead to a doubly added csum error, or even an
873          * unrecoverable error.
874          */
875         ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
876         if (ret < 0) {
877                 memalloc_nofs_restore(nofs_flag);
878                 spin_lock(&sctx->stat_lock);
879                 if (ret == -ENOMEM)
880                         sctx->stat.malloc_errors++;
881                 sctx->stat.read_errors++;
882                 sctx->stat.uncorrectable_errors++;
883                 spin_unlock(&sctx->stat_lock);
884                 return ret;
885         }
886
887         /*
888          * Read all mirrors one after the other. This includes
889          * re-reading the extent or metadata block that failed (the
890          * reason this fixup code is called), this time sector by
891          * sector, in order to know which sectors caused I/O errors
892          * and which ones are good (for all mirrors).
893          * The goal is to handle the situation when more than one
894          * mirror contains I/O errors, but the errors do not
895          * overlap, i.e. the data can be repaired by selecting the
896          * sectors from those mirrors without I/O error on the
897          * particular sectors. One example (with blocks >= 2 * sectorsize)
898          * would be that mirror #1 has an I/O error on the first sector,
899          * the second sector is good, and mirror #2 has an I/O error on
900          * the second sector, but the first sector is good.
901          * Then the first sector of the first mirror can be repaired by
902          * taking the first sector of the second mirror, and the
903          * second sector of the second mirror can be repaired by
904          * copying the contents of the 2nd sector of the 1st mirror.
905          * One more note: if the sectors of one mirror contain I/O
906          * errors, the checksum cannot be verified. In order to get
907          * the best data for repairing, the first attempt is to find
908          * a mirror without I/O errors and with a validated checksum.
909          * Only if this is not possible, the sectors are picked from
910          * mirrors with I/O errors without considering the checksum.
911          * If the latter is the case, at the end, the checksum of the
912          * repaired area is verified in order to correctly maintain
913          * the statistics.
914          */
915
916         sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
917                                       sizeof(*sblocks_for_recheck), GFP_KERNEL);
918         if (!sblocks_for_recheck) {
919                 spin_lock(&sctx->stat_lock);
920                 sctx->stat.malloc_errors++;
921                 sctx->stat.read_errors++;
922                 sctx->stat.uncorrectable_errors++;
923                 spin_unlock(&sctx->stat_lock);
924                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
925                 goto out;
926         }
927
928         /* setup the context, map the logical blocks and alloc the pages */
929         ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
930         if (ret) {
931                 spin_lock(&sctx->stat_lock);
932                 sctx->stat.read_errors++;
933                 sctx->stat.uncorrectable_errors++;
934                 spin_unlock(&sctx->stat_lock);
935                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
936                 goto out;
937         }
938         BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
939         sblock_bad = sblocks_for_recheck + failed_mirror_index;
940
941         /* build and submit the bios for the failed mirror, check checksums */
942         scrub_recheck_block(fs_info, sblock_bad, 1);
943
944         if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
945             sblock_bad->no_io_error_seen) {
946                 /*
947                  * the error disappeared after reading page by page, or
948                  * the area was part of a huge bio and other parts of the
949                  * bio caused I/O errors, or the block layer merged several
950                  * read requests into one and the error was caused by a
951                  * different bio (usually one of the two latter cases is
952                  * the cause)
953                  */
954                 spin_lock(&sctx->stat_lock);
955                 sctx->stat.unverified_errors++;
956                 sblock_to_check->data_corrected = 1;
957                 spin_unlock(&sctx->stat_lock);
958
959                 if (sctx->is_dev_replace)
960                         scrub_write_block_to_dev_replace(sblock_bad);
961                 goto out;
962         }
963
964         if (!sblock_bad->no_io_error_seen) {
965                 spin_lock(&sctx->stat_lock);
966                 sctx->stat.read_errors++;
967                 spin_unlock(&sctx->stat_lock);
968                 if (__ratelimit(&rs))
969                         scrub_print_warning("i/o error", sblock_to_check);
970                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
971         } else if (sblock_bad->checksum_error) {
972                 spin_lock(&sctx->stat_lock);
973                 sctx->stat.csum_errors++;
974                 spin_unlock(&sctx->stat_lock);
975                 if (__ratelimit(&rs))
976                         scrub_print_warning("checksum error", sblock_to_check);
977                 btrfs_dev_stat_inc_and_print(dev,
978                                              BTRFS_DEV_STAT_CORRUPTION_ERRS);
979         } else if (sblock_bad->header_error) {
980                 spin_lock(&sctx->stat_lock);
981                 sctx->stat.verify_errors++;
982                 spin_unlock(&sctx->stat_lock);
983                 if (__ratelimit(&rs))
984                         scrub_print_warning("checksum/header error",
985                                             sblock_to_check);
986                 if (sblock_bad->generation_error)
987                         btrfs_dev_stat_inc_and_print(dev,
988                                 BTRFS_DEV_STAT_GENERATION_ERRS);
989                 else
990                         btrfs_dev_stat_inc_and_print(dev,
991                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
992         }
993
994         if (sctx->readonly) {
995                 ASSERT(!sctx->is_dev_replace);
996                 goto out;
997         }
998
999         /*
1000          * now build and submit the bios for the other mirrors, check
1001          * checksums.
1002          * First try to pick the mirror which is completely without I/O
1003          * errors and also does not have a checksum error.
1004          * If one is found, and if a checksum is present, the full block
1005          * that is known to contain an error is rewritten. Afterwards
1006          * the block is known to be corrected.
1007          * If a mirror is found which is completely correct, and no
1008          * checksum is present, only those pages are rewritten that had
1009          * an I/O error in the block to be repaired, since it cannot be
1010          * determined which copy of the other pages is better (and it
1011          * could happen otherwise that a correct page would be
1012          * overwritten by a bad one).
1013          */
1014         for (mirror_index = 0; ;mirror_index++) {
1015                 struct scrub_block *sblock_other;
1016
1017                 if (mirror_index == failed_mirror_index)
1018                         continue;
1019
1020                 /* raid56's mirror count can exceed BTRFS_MAX_MIRRORS */
1021                 if (!scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
1022                         if (mirror_index >= BTRFS_MAX_MIRRORS)
1023                                 break;
1024                         if (!sblocks_for_recheck[mirror_index].page_count)
1025                                 break;
1026
1027                         sblock_other = sblocks_for_recheck + mirror_index;
1028                 } else {
1029                         struct scrub_recover *r = sblock_bad->pagev[0]->recover;
1030                         int max_allowed = r->bbio->num_stripes -
1031                                                 r->bbio->num_tgtdevs;
1032
1033                         if (mirror_index >= max_allowed)
1034                                 break;
1035                         if (!sblocks_for_recheck[1].page_count)
1036                                 break;
1037
1038                         ASSERT(failed_mirror_index == 0);
1039                         sblock_other = sblocks_for_recheck + 1;
1040                         sblock_other->pagev[0]->mirror_num = 1 + mirror_index;
1041                 }
1042
1043                 /* build and submit the bios, check checksums */
1044                 scrub_recheck_block(fs_info, sblock_other, 0);
1045
1046                 if (!sblock_other->header_error &&
1047                     !sblock_other->checksum_error &&
1048                     sblock_other->no_io_error_seen) {
1049                         if (sctx->is_dev_replace) {
1050                                 scrub_write_block_to_dev_replace(sblock_other);
1051                                 goto corrected_error;
1052                         } else {
1053                                 ret = scrub_repair_block_from_good_copy(
1054                                                 sblock_bad, sblock_other);
1055                                 if (!ret)
1056                                         goto corrected_error;
1057                         }
1058                 }
1059         }
1060
1061         if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
1062                 goto did_not_correct_error;
1063
1064         /*
1065          * In case of I/O errors in the area that is supposed to be
1066          * repaired, continue by picking good copies of those sectors.
1067          * Select the good sectors from mirrors to rewrite bad sectors from
1068          * the area to fix. Afterwards verify the checksum of the block
1069          * that is supposed to be repaired. This verification step is
1070          * only done for the purpose of statistics counting and for the
1071          * final scrub report on whether errors remain.
1072          * A perfect algorithm could make use of the checksum and try
1073          * all possible combinations of sectors from the different mirrors
1074          * until the checksum verification succeeds. For example, when
1075          * the 2nd sector of mirror #1 faces I/O errors, and the 2nd sector
1076          * of mirror #2 is readable but the final checksum test fails,
1077          * then the 2nd sector of mirror #3 could be tried, to see whether
1078          * the final checksum then succeeds. But this would be a rare
1079          * exception and is therefore not implemented. At least it is
1080          * avoided that the good copy is overwritten.
1081          * A more useful improvement would be to pick the sectors
1082          * without I/O error based on sector sizes (512 bytes on legacy
1083          * disks) instead of on sectorsize. Then maybe 512 byte of one
1084          * mirror could be repaired by taking 512 byte of a different
1085          * mirror, even if other 512 byte sectors in the same sectorsize
1086          * area are unreadable.
1087          */
1088         success = 1;
1089         for (page_num = 0; page_num < sblock_bad->page_count;
1090              page_num++) {
1091                 struct scrub_page *spage_bad = sblock_bad->pagev[page_num];
1092                 struct scrub_block *sblock_other = NULL;
1093
1094                 /* skip no-io-error page in scrub */
1095                 if (!spage_bad->io_error && !sctx->is_dev_replace)
1096                         continue;
1097
1098                 if (scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
1099                         /*
1100                          * In case of dev replace, if the raid56 rebuild process
1101                          * didn't produce correct data, then copy the content
1102                          * of sblock_bad to make sure the target device is identical
1103                          * to the source device, instead of writing garbage data from
1104                          * the sblock_for_recheck array to the target device.
1105                          */
1106                         sblock_other = NULL;
1107                 } else if (spage_bad->io_error) {
1108                         /* try to find no-io-error page in mirrors */
1109                         for (mirror_index = 0;
1110                              mirror_index < BTRFS_MAX_MIRRORS &&
1111                              sblocks_for_recheck[mirror_index].page_count > 0;
1112                              mirror_index++) {
1113                                 if (!sblocks_for_recheck[mirror_index].
1114                                     pagev[page_num]->io_error) {
1115                                         sblock_other = sblocks_for_recheck +
1116                                                        mirror_index;
1117                                         break;
1118                                 }
1119                         }
1120                         if (!sblock_other)
1121                                 success = 0;
1122                 }
1123
1124                 if (sctx->is_dev_replace) {
1125                         /*
1126                          * We did not find a mirror to fetch the page
1127                          * from. scrub_write_page_to_dev_replace()
1128                          * handles this case (page->io_error) by
1129                          * filling the block with zeros before
1130                          * submitting the write request
1131                          */
1132                         if (!sblock_other)
1133                                 sblock_other = sblock_bad;
1134
1135                         if (scrub_write_page_to_dev_replace(sblock_other,
1136                                                             page_num) != 0) {
1137                                 atomic64_inc(
1138                                         &fs_info->dev_replace.num_write_errors);
1139                                 success = 0;
1140                         }
1141                 } else if (sblock_other) {
1142                         ret = scrub_repair_page_from_good_copy(sblock_bad,
1143                                                                sblock_other,
1144                                                                page_num, 0);
1145                         if (ret == 0)
1146                                 spage_bad->io_error = 0;
1147                         else
1148                                 success = 0;
1149                 }
1150         }
1151
1152         if (success && !sctx->is_dev_replace) {
1153                 if (is_metadata || have_csum) {
1154                         /*
1155                          * need to verify the checksum now that all
1156                          * sectors on disk are repaired (the write
1157                          * request for data to be repaired is on its way).
1158                          * Just be lazy and use scrub_recheck_block()
1159                          * which re-reads the data before the checksum
1160                          * is verified, but most likely the data comes out
1161                          * of the page cache.
1162                          */
1163                         scrub_recheck_block(fs_info, sblock_bad, 1);
1164                         if (!sblock_bad->header_error &&
1165                             !sblock_bad->checksum_error &&
1166                             sblock_bad->no_io_error_seen)
1167                                 goto corrected_error;
1168                         else
1169                                 goto did_not_correct_error;
1170                 } else {
1171 corrected_error:
1172                         spin_lock(&sctx->stat_lock);
1173                         sctx->stat.corrected_errors++;
1174                         sblock_to_check->data_corrected = 1;
1175                         spin_unlock(&sctx->stat_lock);
1176                         btrfs_err_rl_in_rcu(fs_info,
1177                                 "fixed up error at logical %llu on dev %s",
1178                                 logical, rcu_str_deref(dev->name));
1179                 }
1180         } else {
1181 did_not_correct_error:
1182                 spin_lock(&sctx->stat_lock);
1183                 sctx->stat.uncorrectable_errors++;
1184                 spin_unlock(&sctx->stat_lock);
1185                 btrfs_err_rl_in_rcu(fs_info,
1186                         "unable to fixup (regular) error at logical %llu on dev %s",
1187                         logical, rcu_str_deref(dev->name));
1188         }
1189
1190 out:
1191         if (sblocks_for_recheck) {
1192                 for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1193                      mirror_index++) {
1194                         struct scrub_block *sblock = sblocks_for_recheck +
1195                                                      mirror_index;
1196                         struct scrub_recover *recover;
1197                         int page_index;
1198
1199                         for (page_index = 0; page_index < sblock->page_count;
1200                              page_index++) {
1201                                 sblock->pagev[page_index]->sblock = NULL;
1202                                 recover = sblock->pagev[page_index]->recover;
1203                                 if (recover) {
1204                                         scrub_put_recover(fs_info, recover);
1205                                         sblock->pagev[page_index]->recover =
1206                                                                         NULL;
1207                                 }
1208                                 scrub_page_put(sblock->pagev[page_index]);
1209                         }
1210                 }
1211                 kfree(sblocks_for_recheck);
1212         }
1213
1214         ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
1215         memalloc_nofs_restore(nofs_flag);
1216         if (ret < 0)
1217                 return ret;
1218         return 0;
1219 }
1220
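/*
 * For RAID5 there are two ways to obtain a sector (read the data stripe
 * directly, or rebuild it from the other data stripes + P); RAID6 adds one
 * more thanks to the second parity Q, hence the fixed "mirror" counts
 * below.  For non-parity profiles the number of stripes is the number of
 * mirrors.
 */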
1221 static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
1222 {
1223         if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
1224                 return 2;
1225         else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
1226                 return 3;
1227         else
1228                 return (int)bbio->num_stripes;
1229 }
1230
1231 static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
1232                                                  u64 *raid_map,
1233                                                  u64 mapped_length,
1234                                                  int nstripes, int mirror,
1235                                                  int *stripe_index,
1236                                                  u64 *stripe_offset)
1237 {
1238         int i;
1239
1240         if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
1241                 /* RAID5/6 */
1242                 for (i = 0; i < nstripes; i++) {
1243                         if (raid_map[i] == RAID6_Q_STRIPE ||
1244                             raid_map[i] == RAID5_P_STRIPE)
1245                                 continue;
1246
1247                         if (logical >= raid_map[i] &&
1248                             logical < raid_map[i] + mapped_length)
1249                                 break;
1250                 }
1251
1252                 *stripe_index = i;
1253                 *stripe_offset = logical - raid_map[i];
1254         } else {
1255                 /* The other RAID type */
1256                 *stripe_index = mirror;
1257                 *stripe_offset = 0;
1258         }
1259 }
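
/*
 * Illustrative example (made-up values) for the RAID5/6 branch above: with
 * nstripes == 3, raid_map == { L, L + 64KiB, RAID5_P_STRIPE } and
 * mapped_length == 64KiB, a logical of L + 80KiB falls into raid_map[1]
 * (the P entry is skipped), so *stripe_index is set to 1 and *stripe_offset
 * to 16KiB.  For the other RAID types the mirror number is used directly.
 */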
1260
1261 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
1262                                      struct scrub_block *sblocks_for_recheck)
1263 {
1264         struct scrub_ctx *sctx = original_sblock->sctx;
1265         struct btrfs_fs_info *fs_info = sctx->fs_info;
1266         u64 length = original_sblock->page_count * fs_info->sectorsize;
1267         u64 logical = original_sblock->pagev[0]->logical;
1268         u64 generation = original_sblock->pagev[0]->generation;
1269         u64 flags = original_sblock->pagev[0]->flags;
1270         u64 have_csum = original_sblock->pagev[0]->have_csum;
1271         struct scrub_recover *recover;
1272         struct btrfs_bio *bbio;
1273         u64 sublen;
1274         u64 mapped_length;
1275         u64 stripe_offset;
1276         int stripe_index;
1277         int page_index = 0;
1278         int mirror_index;
1279         int nmirrors;
1280         int ret;
1281
1282         /*
1283          * note: the two members refs and outstanding_pages
1284          * are not used (and not set) in the blocks that are used for
1285          * the recheck procedure
1286          */
1287
1288         while (length > 0) {
1289                 sublen = min_t(u64, length, fs_info->sectorsize);
1290                 mapped_length = sublen;
1291                 bbio = NULL;
1292
1293                 /*
1294                  * With a length of sectorsize, each returned stripe represents
1295                  * one mirror
1296                  */
1297                 btrfs_bio_counter_inc_blocked(fs_info);
1298                 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
1299                                 logical, &mapped_length, &bbio);
1300                 if (ret || !bbio || mapped_length < sublen) {
1301                         btrfs_put_bbio(bbio);
1302                         btrfs_bio_counter_dec(fs_info);
1303                         return -EIO;
1304                 }
1305
1306                 recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1307                 if (!recover) {
1308                         btrfs_put_bbio(bbio);
1309                         btrfs_bio_counter_dec(fs_info);
1310                         return -ENOMEM;
1311                 }
1312
1313                 refcount_set(&recover->refs, 1);
1314                 recover->bbio = bbio;
1315                 recover->map_length = mapped_length;
1316
1317                 BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK);
1318
1319                 nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);
1320
1321                 for (mirror_index = 0; mirror_index < nmirrors;
1322                      mirror_index++) {
1323                         struct scrub_block *sblock;
1324                         struct scrub_page *spage;
1325
1326                         sblock = sblocks_for_recheck + mirror_index;
1327                         sblock->sctx = sctx;
1328
1329                         spage = kzalloc(sizeof(*spage), GFP_NOFS);
1330                         if (!spage) {
1331 leave_nomem:
1332                                 spin_lock(&sctx->stat_lock);
1333                                 sctx->stat.malloc_errors++;
1334                                 spin_unlock(&sctx->stat_lock);
1335                                 scrub_put_recover(fs_info, recover);
1336                                 return -ENOMEM;
1337                         }
1338                         scrub_page_get(spage);
1339                         sblock->pagev[page_index] = spage;
1340                         spage->sblock = sblock;
1341                         spage->flags = flags;
1342                         spage->generation = generation;
1343                         spage->logical = logical;
1344                         spage->have_csum = have_csum;
1345                         if (have_csum)
1346                                 memcpy(spage->csum,
1347                                        original_sblock->pagev[0]->csum,
1348                                        sctx->fs_info->csum_size);
1349
1350                         scrub_stripe_index_and_offset(logical,
1351                                                       bbio->map_type,
1352                                                       bbio->raid_map,
1353                                                       mapped_length,
1354                                                       bbio->num_stripes -
1355                                                       bbio->num_tgtdevs,
1356                                                       mirror_index,
1357                                                       &stripe_index,
1358                                                       &stripe_offset);
1359                         spage->physical = bbio->stripes[stripe_index].physical +
1360                                          stripe_offset;
1361                         spage->dev = bbio->stripes[stripe_index].dev;
1362
1363                         BUG_ON(page_index >= original_sblock->page_count);
1364                         spage->physical_for_dev_replace =
1365                                 original_sblock->pagev[page_index]->
1366                                 physical_for_dev_replace;
1367                         /* for missing devices, dev->bdev is NULL */
1368                         spage->mirror_num = mirror_index + 1;
1369                         sblock->page_count++;
1370                         spage->page = alloc_page(GFP_NOFS);
1371                         if (!spage->page)
1372                                 goto leave_nomem;
1373
1374                         scrub_get_recover(recover);
1375                         spage->recover = recover;
1376                 }
1377                 scrub_put_recover(fs_info, recover);
1378                 length -= sublen;
1379                 logical += sublen;
1380                 page_index++;
1381         }
1382
1383         return 0;
1384 }
1385
1386 static void scrub_bio_wait_endio(struct bio *bio)
1387 {
1388         complete(bio->bi_private);
1389 }
1390
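/*
 * Submit a read that is served by RAID56 parity recovery and wait for it to
 * complete.  The recovered data for the requested mirror is placed in the
 * pages already attached to @bio.
 */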
1391 static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1392                                         struct bio *bio,
1393                                         struct scrub_page *spage)
1394 {
1395         DECLARE_COMPLETION_ONSTACK(done);
1396         int ret;
1397         int mirror_num;
1398
1399         bio->bi_iter.bi_sector = spage->logical >> 9;
1400         bio->bi_private = &done;
1401         bio->bi_end_io = scrub_bio_wait_endio;
1402
1403         mirror_num = spage->sblock->pagev[0]->mirror_num;
1404         ret = raid56_parity_recover(fs_info, bio, spage->recover->bbio,
1405                                     spage->recover->map_length,
1406                                     mirror_num, 0);
1407         if (ret)
1408                 return ret;
1409
1410         wait_for_completion_io(&done);
1411         return blk_status_to_errno(bio->bi_status);
1412 }
1413
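/*
 * Recheck a block that sits on a RAID56 stripe: read all of its pages in one
 * bio through the parity recovery path and, on success, verify the checksum.
 * On any failure, all pages are marked with an I/O error.
 */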
1414 static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
1415                                           struct scrub_block *sblock)
1416 {
1417         struct scrub_page *first_page = sblock->pagev[0];
1418         struct bio *bio;
1419         int page_num;
1420
1421         /* All pages in sblock belong to the same stripe on the same device. */
1422         ASSERT(first_page->dev);
1423         if (!first_page->dev->bdev)
1424                 goto out;
1425
1426         bio = btrfs_io_bio_alloc(BIO_MAX_VECS);
1427         bio_set_dev(bio, first_page->dev->bdev);
1428
1429         for (page_num = 0; page_num < sblock->page_count; page_num++) {
1430                 struct scrub_page *spage = sblock->pagev[page_num];
1431
1432                 WARN_ON(!spage->page);
1433                 bio_add_page(bio, spage->page, PAGE_SIZE, 0);
1434         }
1435
1436         if (scrub_submit_raid56_bio_wait(fs_info, bio, first_page)) {
1437                 bio_put(bio);
1438                 goto out;
1439         }
1440
1441         bio_put(bio);
1442
1443         scrub_recheck_block_checksum(sblock);
1444
1445         return;
1446 out:
1447         for (page_num = 0; page_num < sblock->page_count; page_num++)
1448                 sblock->pagev[page_num]->io_error = 1;
1449
1450         sblock->no_io_error_seen = 0;
1451 }
1452
1453 /*
1454  * This function checks the on-disk data for checksum errors, header
1455  * errors and read I/O errors. If any I/O error happens, the exact pages
1456  * that failed are marked as bad. The goal is to enable scrub to take
1457  * the non-errored pages from all the mirrors so that the errored pages
1458  * in the mirror just handled can be repaired.
1459  */
1460 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1461                                 struct scrub_block *sblock,
1462                                 int retry_failed_mirror)
1463 {
1464         int page_num;
1465
1466         sblock->no_io_error_seen = 1;
1467
1468         /* Shortcut for raid56 */
1469         if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->pagev[0]))
1470                 return scrub_recheck_block_on_raid56(fs_info, sblock);
1471
1472         for (page_num = 0; page_num < sblock->page_count; page_num++) {
1473                 struct bio *bio;
1474                 struct scrub_page *spage = sblock->pagev[page_num];
1475
1476                 if (spage->dev->bdev == NULL) {
1477                         spage->io_error = 1;
1478                         sblock->no_io_error_seen = 0;
1479                         continue;
1480                 }
1481
1482                 WARN_ON(!spage->page);
1483                 bio = btrfs_io_bio_alloc(1);
1484                 bio_set_dev(bio, spage->dev->bdev);
1485
1486                 bio_add_page(bio, spage->page, fs_info->sectorsize, 0);
1487                 bio->bi_iter.bi_sector = spage->physical >> 9;
1488                 bio->bi_opf = REQ_OP_READ;
1489
1490                 if (btrfsic_submit_bio_wait(bio)) {
1491                         spage->io_error = 1;
1492                         sblock->no_io_error_seen = 0;
1493                 }
1494
1495                 bio_put(bio);
1496         }
1497
1498         if (sblock->no_io_error_seen)
1499                 scrub_recheck_block_checksum(sblock);
1500 }
1501
1502 static inline int scrub_check_fsid(u8 fsid[],
1503                                    struct scrub_page *spage)
1504 {
1505         struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
1506         int ret;
1507
1508         ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1509         return !ret;
1510 }
1511
1512 static void scrub_recheck_block_checksum(struct scrub_block *sblock)
1513 {
1514         sblock->header_error = 0;
1515         sblock->checksum_error = 0;
1516         sblock->generation_error = 0;
1517
1518         if (sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA)
1519                 scrub_checksum_data(sblock);
1520         else
1521                 scrub_checksum_tree_block(sblock);
1522 }
1523
1524 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1525                                              struct scrub_block *sblock_good)
1526 {
1527         int page_num;
1528         int ret = 0;
1529
1530         for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1531                 int ret_sub;
1532
1533                 ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
1534                                                            sblock_good,
1535                                                            page_num, 1);
1536                 if (ret_sub)
1537                         ret = ret_sub;
1538         }
1539
1540         return ret;
1541 }
1542
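/*
 * Overwrite one sector of the bad mirror with data from the good mirror.
 * Unless @force_write is set, the write is only issued when the bad block
 * actually has a header or checksum error, or the page had an I/O error.
 */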
1543 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1544                                             struct scrub_block *sblock_good,
1545                                             int page_num, int force_write)
1546 {
1547         struct scrub_page *spage_bad = sblock_bad->pagev[page_num];
1548         struct scrub_page *spage_good = sblock_good->pagev[page_num];
1549         struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
1550         const u32 sectorsize = fs_info->sectorsize;
1551
1552         BUG_ON(spage_bad->page == NULL);
1553         BUG_ON(spage_good->page == NULL);
1554         if (force_write || sblock_bad->header_error ||
1555             sblock_bad->checksum_error || spage_bad->io_error) {
1556                 struct bio *bio;
1557                 int ret;
1558
1559                 if (!spage_bad->dev->bdev) {
1560                         btrfs_warn_rl(fs_info,
1561                                 "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
1562                         return -EIO;
1563                 }
1564
1565                 bio = btrfs_io_bio_alloc(1);
1566                 bio_set_dev(bio, spage_bad->dev->bdev);
1567                 bio->bi_iter.bi_sector = spage_bad->physical >> 9;
1568                 bio->bi_opf = REQ_OP_WRITE;
1569
1570                 ret = bio_add_page(bio, spage_good->page, sectorsize, 0);
1571                 if (ret != sectorsize) {
1572                         bio_put(bio);
1573                         return -EIO;
1574                 }
1575
1576                 if (btrfsic_submit_bio_wait(bio)) {
1577                         btrfs_dev_stat_inc_and_print(spage_bad->dev,
1578                                 BTRFS_DEV_STAT_WRITE_ERRS);
1579                         atomic64_inc(&fs_info->dev_replace.num_write_errors);
1580                         bio_put(bio);
1581                         return -EIO;
1582                 }
1583                 bio_put(bio);
1584         }
1585
1586         return 0;
1587 }
1588
1589 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1590 {
1591         struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
1592         int page_num;
1593
1594         /*
1595          * This block is used for checking the parity on the source device,
1596          * so the data does not need to be written to the destination device.
1597          */
1598         if (sblock->sparity)
1599                 return;
1600
1601         for (page_num = 0; page_num < sblock->page_count; page_num++) {
1602                 int ret;
1603
1604                 ret = scrub_write_page_to_dev_replace(sblock, page_num);
1605                 if (ret)
1606                         atomic64_inc(&fs_info->dev_replace.num_write_errors);
1607         }
1608 }
1609
1610 static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
1611                                            int page_num)
1612 {
1613         struct scrub_page *spage = sblock->pagev[page_num];
1614
1615         BUG_ON(spage->page == NULL);
1616         if (spage->io_error)
1617                 clear_page(page_address(spage->page));
1618
1619         return scrub_add_page_to_wr_bio(sblock->sctx, spage);
1620 }
1621
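/*
 * On a zoned dev-replace target, writes into sequential zones must advance
 * the write pointer in order.  If @physical lies beyond the current write
 * pointer, zero out the gap first so the write pointer catches up.
 */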
1622 static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
1623 {
1624         int ret = 0;
1625         u64 length;
1626
1627         if (!btrfs_is_zoned(sctx->fs_info))
1628                 return 0;
1629
1630         if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical))
1631                 return 0;
1632
1633         if (sctx->write_pointer < physical) {
1634                 length = physical - sctx->write_pointer;
1635
1636                 ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev,
1637                                                 sctx->write_pointer, length);
1638                 if (!ret)
1639                         sctx->write_pointer = physical;
1640         }
1641         return ret;
1642 }
1643
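/*
 * Queue @spage for writing to the dev-replace target.  Pages are collected
 * into the current write bio as long as they are physically and logically
 * contiguous; otherwise, or when the bio is full, the bio is submitted and
 * a new one is started.
 */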
1644 static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1645                                     struct scrub_page *spage)
1646 {
1647         struct scrub_bio *sbio;
1648         int ret;
1649         const u32 sectorsize = sctx->fs_info->sectorsize;
1650
1651         mutex_lock(&sctx->wr_lock);
1652 again:
1653         if (!sctx->wr_curr_bio) {
1654                 sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
1655                                               GFP_KERNEL);
1656                 if (!sctx->wr_curr_bio) {
1657                         mutex_unlock(&sctx->wr_lock);
1658                         return -ENOMEM;
1659                 }
1660                 sctx->wr_curr_bio->sctx = sctx;
1661                 sctx->wr_curr_bio->page_count = 0;
1662         }
1663         sbio = sctx->wr_curr_bio;
1664         if (sbio->page_count == 0) {
1665                 struct bio *bio;
1666
1667                 ret = fill_writer_pointer_gap(sctx,
1668                                               spage->physical_for_dev_replace);
1669                 if (ret) {
1670                         mutex_unlock(&sctx->wr_lock);
1671                         return ret;
1672                 }
1673
1674                 sbio->physical = spage->physical_for_dev_replace;
1675                 sbio->logical = spage->logical;
1676                 sbio->dev = sctx->wr_tgtdev;
1677                 bio = sbio->bio;
1678                 if (!bio) {
1679                         bio = btrfs_io_bio_alloc(sctx->pages_per_wr_bio);
1680                         sbio->bio = bio;
1681                 }
1682
1683                 bio->bi_private = sbio;
1684                 bio->bi_end_io = scrub_wr_bio_end_io;
1685                 bio_set_dev(bio, sbio->dev->bdev);
1686                 bio->bi_iter.bi_sector = sbio->physical >> 9;
1687                 bio->bi_opf = REQ_OP_WRITE;
1688                 sbio->status = 0;
1689         } else if (sbio->physical + sbio->page_count * sectorsize !=
1690                    spage->physical_for_dev_replace ||
1691                    sbio->logical + sbio->page_count * sectorsize !=
1692                    spage->logical) {
1693                 scrub_wr_submit(sctx);
1694                 goto again;
1695         }
1696
1697         ret = bio_add_page(sbio->bio, spage->page, sectorsize, 0);
1698         if (ret != sectorsize) {
1699                 if (sbio->page_count < 1) {
1700                         bio_put(sbio->bio);
1701                         sbio->bio = NULL;
1702                         mutex_unlock(&sctx->wr_lock);
1703                         return -EIO;
1704                 }
1705                 scrub_wr_submit(sctx);
1706                 goto again;
1707         }
1708
1709         sbio->pagev[sbio->page_count] = spage;
1710         scrub_page_get(spage);
1711         sbio->page_count++;
1712         if (sbio->page_count == sctx->pages_per_wr_bio)
1713                 scrub_wr_submit(sctx);
1714         mutex_unlock(&sctx->wr_lock);
1715
1716         return 0;
1717 }
1718
1719 static void scrub_wr_submit(struct scrub_ctx *sctx)
1720 {
1721         struct scrub_bio *sbio;
1722
1723         if (!sctx->wr_curr_bio)
1724                 return;
1725
1726         sbio = sctx->wr_curr_bio;
1727         sctx->wr_curr_bio = NULL;
1728         WARN_ON(!sbio->bio->bi_bdev);
1729         scrub_pending_bio_inc(sctx);
1730         /* Process all writes in a single worker thread. The block layer then
1731          * orders the requests before sending them to the driver, which
1732          * doubled the write performance on spinning disks when measured
1733          * with Linux 3.5. */
1734         btrfsic_submit_bio(sbio->bio);
1735
1736         if (btrfs_is_zoned(sctx->fs_info))
1737                 sctx->write_pointer = sbio->physical + sbio->page_count *
1738                         sctx->fs_info->sectorsize;
1739 }
1740
1741 static void scrub_wr_bio_end_io(struct bio *bio)
1742 {
1743         struct scrub_bio *sbio = bio->bi_private;
1744         struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
1745
1746         sbio->status = bio->bi_status;
1747         sbio->bio = bio;
1748
1749         btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL);
1750         btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
1751 }
1752
1753 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
1754 {
1755         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1756         struct scrub_ctx *sctx = sbio->sctx;
1757         int i;
1758
1759         WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
1760         if (sbio->status) {
1761                 struct btrfs_dev_replace *dev_replace =
1762                         &sbio->sctx->fs_info->dev_replace;
1763
1764                 for (i = 0; i < sbio->page_count; i++) {
1765                         struct scrub_page *spage = sbio->pagev[i];
1766
1767                         spage->io_error = 1;
1768                         atomic64_inc(&dev_replace->num_write_errors);
1769                 }
1770         }
1771
1772         for (i = 0; i < sbio->page_count; i++)
1773                 scrub_page_put(sbio->pagev[i]);
1774
1775         bio_put(sbio->bio);
1776         kfree(sbio);
1777         scrub_pending_bio_dec(sctx);
1778 }
1779
1780 static int scrub_checksum(struct scrub_block *sblock)
1781 {
1782         u64 flags;
1783         int ret;
1784
1785         /*
1786          * No need to initialize these stats currently,
1787          * because this function only uses the return value
1788          * instead of these stats values.
1789          *
1790          * Todo:
1791          * always use stats
1792          */
1793         sblock->header_error = 0;
1794         sblock->generation_error = 0;
1795         sblock->checksum_error = 0;
1796
1797         WARN_ON(sblock->page_count < 1);
1798         flags = sblock->pagev[0]->flags;
1799         ret = 0;
1800         if (flags & BTRFS_EXTENT_FLAG_DATA)
1801                 ret = scrub_checksum_data(sblock);
1802         else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1803                 ret = scrub_checksum_tree_block(sblock);
1804         else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1805                 (void)scrub_checksum_super(sblock);
1806         else
1807                 WARN_ON(1);
1808         if (ret)
1809                 scrub_handle_errored_block(sblock);
1810
1811         return ret;
1812 }
1813
1814 static int scrub_checksum_data(struct scrub_block *sblock)
1815 {
1816         struct scrub_ctx *sctx = sblock->sctx;
1817         struct btrfs_fs_info *fs_info = sctx->fs_info;
1818         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1819         u8 csum[BTRFS_CSUM_SIZE];
1820         struct scrub_page *spage;
1821         char *kaddr;
1822
1823         BUG_ON(sblock->page_count < 1);
1824         spage = sblock->pagev[0];
1825         if (!spage->have_csum)
1826                 return 0;
1827
1828         kaddr = page_address(spage->page);
1829
1830         shash->tfm = fs_info->csum_shash;
1831         crypto_shash_init(shash);
1832
1833         /*
1834          * In scrub_pages() and scrub_pages_for_parity() we ensure each spage
1835          * only contains one sector of data.
1836          */
1837         crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
1838
1839         if (memcmp(csum, spage->csum, fs_info->csum_size))
1840                 sblock->checksum_error = 1;
1841         return sblock->checksum_error;
1842 }
1843
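/*
 * Verify a metadata block: check the bytenr, generation, fsid and chunk tree
 * UUID stored in the header, then recompute the checksum over the whole node
 * (skipping the on-disk csum bytes) and compare it with the stored one.
 */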
1844 static int scrub_checksum_tree_block(struct scrub_block *sblock)
1845 {
1846         struct scrub_ctx *sctx = sblock->sctx;
1847         struct btrfs_header *h;
1848         struct btrfs_fs_info *fs_info = sctx->fs_info;
1849         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1850         u8 calculated_csum[BTRFS_CSUM_SIZE];
1851         u8 on_disk_csum[BTRFS_CSUM_SIZE];
1852         /*
1853          * This is done in sectorsize steps even for metadata as there's a
1854          * constraint for nodesize to be aligned to sectorsize. This will need
1855          * to change so we don't misuse data and metadata units like that.
1856          */
1857         const u32 sectorsize = sctx->fs_info->sectorsize;
1858         const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits;
1859         int i;
1860         struct scrub_page *spage;
1861         char *kaddr;
1862
1863         BUG_ON(sblock->page_count < 1);
1864
1865         /* Each member in pagev is just one block, not a full page */
1866         ASSERT(sblock->page_count == num_sectors);
1867
1868         spage = sblock->pagev[0];
1869         kaddr = page_address(spage->page);
1870         h = (struct btrfs_header *)kaddr;
1871         memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size);
1872
1873         /*
1874          * we don't use the getter functions here, as we
1875          * a) don't have an extent buffer and
1876          * b) the page is already kmapped
1877          */
1878         if (spage->logical != btrfs_stack_header_bytenr(h))
1879                 sblock->header_error = 1;
1880
1881         if (spage->generation != btrfs_stack_header_generation(h)) {
1882                 sblock->header_error = 1;
1883                 sblock->generation_error = 1;
1884         }
1885
1886         if (!scrub_check_fsid(h->fsid, spage))
1887                 sblock->header_error = 1;
1888
1889         if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1890                    BTRFS_UUID_SIZE))
1891                 sblock->header_error = 1;
1892
1893         shash->tfm = fs_info->csum_shash;
1894         crypto_shash_init(shash);
1895         crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
1896                             sectorsize - BTRFS_CSUM_SIZE);
1897
1898         for (i = 1; i < num_sectors; i++) {
1899                 kaddr = page_address(sblock->pagev[i]->page);
1900                 crypto_shash_update(shash, kaddr, sectorsize);
1901         }
1902
1903         crypto_shash_final(shash, calculated_csum);
1904         if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size))
1905                 sblock->checksum_error = 1;
1906
1907         return sblock->header_error || sblock->checksum_error;
1908 }
1909
1910 static int scrub_checksum_super(struct scrub_block *sblock)
1911 {
1912         struct btrfs_super_block *s;
1913         struct scrub_ctx *sctx = sblock->sctx;
1914         struct btrfs_fs_info *fs_info = sctx->fs_info;
1915         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1916         u8 calculated_csum[BTRFS_CSUM_SIZE];
1917         struct scrub_page *spage;
1918         char *kaddr;
1919         int fail_gen = 0;
1920         int fail_cor = 0;
1921
1922         BUG_ON(sblock->page_count < 1);
1923         spage = sblock->pagev[0];
1924         kaddr = page_address(spage->page);
1925         s = (struct btrfs_super_block *)kaddr;
1926
1927         if (spage->logical != btrfs_super_bytenr(s))
1928                 ++fail_cor;
1929
1930         if (spage->generation != btrfs_super_generation(s))
1931                 ++fail_gen;
1932
1933         if (!scrub_check_fsid(s->fsid, spage))
1934                 ++fail_cor;
1935
1936         shash->tfm = fs_info->csum_shash;
1937         crypto_shash_init(shash);
1938         crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE,
1939                         BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum);
1940
1941         if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size))
1942                 ++fail_cor;
1943
1944         if (fail_cor + fail_gen) {
1945                 /*
1946                  * If we find an error in a super block, we just report it.
1947                  * Super blocks get rewritten with the next transaction
1948                  * commit anyway.
1949                  */
1950                 spin_lock(&sctx->stat_lock);
1951                 ++sctx->stat.super_errors;
1952                 spin_unlock(&sctx->stat_lock);
1953                 if (fail_cor)
1954                         btrfs_dev_stat_inc_and_print(spage->dev,
1955                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1956                 else
1957                         btrfs_dev_stat_inc_and_print(spage->dev,
1958                                 BTRFS_DEV_STAT_GENERATION_ERRS);
1959         }
1960
1961         return fail_cor + fail_gen;
1962 }
1963
1964 static void scrub_block_get(struct scrub_block *sblock)
1965 {
1966         refcount_inc(&sblock->refs);
1967 }
1968
1969 static void scrub_block_put(struct scrub_block *sblock)
1970 {
1971         if (refcount_dec_and_test(&sblock->refs)) {
1972                 int i;
1973
1974                 if (sblock->sparity)
1975                         scrub_parity_put(sblock->sparity);
1976
1977                 for (i = 0; i < sblock->page_count; i++)
1978                         scrub_page_put(sblock->pagev[i]);
1979                 kfree(sblock);
1980         }
1981 }
1982
1983 static void scrub_page_get(struct scrub_page *spage)
1984 {
1985         atomic_inc(&spage->refs);
1986 }
1987
1988 static void scrub_page_put(struct scrub_page *spage)
1989 {
1990         if (atomic_dec_and_test(&spage->refs)) {
1991                 if (spage->page)
1992                         __free_page(spage->page);
1993                 kfree(spage);
1994         }
1995 }
1996
1997 /*
1998  * Throttle IO submission based on a bandwidth limit; the timeslice is 1
1999  * second.  The limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max.
2000  */
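/*
 * For example, with scrub_speed_max set to 100MiB/s: div becomes
 * min(64, 100MiB / 16MiB) = 6, so each epoch lasts about 1000/6 = 166ms and
 * allows roughly 100MiB/6 = ~16.7MiB of bio data before the submitter sleeps
 * until the end of the epoch, giving ~100MiB/s overall.
 */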
2001 static void scrub_throttle(struct scrub_ctx *sctx)
2002 {
2003         const int time_slice = 1000;
2004         struct scrub_bio *sbio;
2005         struct btrfs_device *device;
2006         s64 delta;
2007         ktime_t now;
2008         u32 div;
2009         u64 bwlimit;
2010
2011         sbio = sctx->bios[sctx->curr];
2012         device = sbio->dev;
2013         bwlimit = READ_ONCE(device->scrub_speed_max);
2014         if (bwlimit == 0)
2015                 return;
2016
2017         /*
2018          * The slice is divided into intervals in which the IO is submitted;
2019          * the number of intervals scales with bwlimit, capped at 64.
2020          */
2021         div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
2022         div = min_t(u32, 64, div);
2023
2024         /* Start new epoch, set deadline */
2025         now = ktime_get();
2026         if (sctx->throttle_deadline == 0) {
2027                 sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
2028                 sctx->throttle_sent = 0;
2029         }
2030
2031         /* Still within the current time slice? */
2032         if (ktime_before(now, sctx->throttle_deadline)) {
2033                 /* If current bio is within the limit, send it */
2034                 sctx->throttle_sent += sbio->bio->bi_iter.bi_size;
2035                 if (sctx->throttle_sent <= div_u64(bwlimit, div))
2036                         return;
2037
2038                 /* We're over the limit, sleep for the rest of the slice */
2039                 delta = ktime_ms_delta(sctx->throttle_deadline, now);
2040         } else {
2041                 /* New request after deadline, start new epoch */
2042                 delta = 0;
2043         }
2044
2045         if (delta) {
2046                 long timeout;
2047
2048                 timeout = div_u64(delta * HZ, 1000);
2049                 schedule_timeout_interruptible(timeout);
2050         }
2051
2052         /* Next call will start the deadline period */
2053         sctx->throttle_deadline = 0;
2054 }
2055
2056 static void scrub_submit(struct scrub_ctx *sctx)
2057 {
2058         struct scrub_bio *sbio;
2059
2060         if (sctx->curr == -1)
2061                 return;
2062
2063         scrub_throttle(sctx);
2064
2065         sbio = sctx->bios[sctx->curr];
2066         sctx->curr = -1;
2067         scrub_pending_bio_inc(sctx);
2068         btrfsic_submit_bio(sbio->bio);
2069 }
2070
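/*
 * Queue @spage for reading.  Like the write path, physically and logically
 * contiguous pages on the same device are batched into one read bio, which
 * is submitted once it is full or the next page does not fit the pattern.
 */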
2071 static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
2072                                     struct scrub_page *spage)
2073 {
2074         struct scrub_block *sblock = spage->sblock;
2075         struct scrub_bio *sbio;
2076         const u32 sectorsize = sctx->fs_info->sectorsize;
2077         int ret;
2078
2079 again:
2080         /*
2081          * grab a fresh bio or wait for one to become available
2082          */
2083         while (sctx->curr == -1) {
2084                 spin_lock(&sctx->list_lock);
2085                 sctx->curr = sctx->first_free;
2086                 if (sctx->curr != -1) {
2087                         sctx->first_free = sctx->bios[sctx->curr]->next_free;
2088                         sctx->bios[sctx->curr]->next_free = -1;
2089                         sctx->bios[sctx->curr]->page_count = 0;
2090                         spin_unlock(&sctx->list_lock);
2091                 } else {
2092                         spin_unlock(&sctx->list_lock);
2093                         wait_event(sctx->list_wait, sctx->first_free != -1);
2094                 }
2095         }
2096         sbio = sctx->bios[sctx->curr];
2097         if (sbio->page_count == 0) {
2098                 struct bio *bio;
2099
2100                 sbio->physical = spage->physical;
2101                 sbio->logical = spage->logical;
2102                 sbio->dev = spage->dev;
2103                 bio = sbio->bio;
2104                 if (!bio) {
2105                         bio = btrfs_io_bio_alloc(sctx->pages_per_rd_bio);
2106                         sbio->bio = bio;
2107                 }
2108
2109                 bio->bi_private = sbio;
2110                 bio->bi_end_io = scrub_bio_end_io;
2111                 bio_set_dev(bio, sbio->dev->bdev);
2112                 bio->bi_iter.bi_sector = sbio->physical >> 9;
2113                 bio->bi_opf = REQ_OP_READ;
2114                 sbio->status = 0;
2115         } else if (sbio->physical + sbio->page_count * sectorsize !=
2116                    spage->physical ||
2117                    sbio->logical + sbio->page_count * sectorsize !=
2118                    spage->logical ||
2119                    sbio->dev != spage->dev) {
2120                 scrub_submit(sctx);
2121                 goto again;
2122         }
2123
2124         sbio->pagev[sbio->page_count] = spage;
2125         ret = bio_add_page(sbio->bio, spage->page, sectorsize, 0);
2126         if (ret != sectorsize) {
2127                 if (sbio->page_count < 1) {
2128                         bio_put(sbio->bio);
2129                         sbio->bio = NULL;
2130                         return -EIO;
2131                 }
2132                 scrub_submit(sctx);
2133                 goto again;
2134         }
2135
2136         scrub_block_get(sblock); /* one for the page added to the bio */
2137         atomic_inc(&sblock->outstanding_pages);
2138         sbio->page_count++;
2139         if (sbio->page_count == sctx->pages_per_rd_bio)
2140                 scrub_submit(sctx);
2141
2142         return 0;
2143 }
2144
2145 static void scrub_missing_raid56_end_io(struct bio *bio)
2146 {
2147         struct scrub_block *sblock = bio->bi_private;
2148         struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
2149
2150         if (bio->bi_status)
2151                 sblock->no_io_error_seen = 0;
2152
2153         bio_put(bio);
2154
2155         btrfs_queue_work(fs_info->scrub_workers, &sblock->work);
2156 }
2157
2158 static void scrub_missing_raid56_worker(struct btrfs_work *work)
2159 {
2160         struct scrub_block *sblock = container_of(work, struct scrub_block, work);
2161         struct scrub_ctx *sctx = sblock->sctx;
2162         struct btrfs_fs_info *fs_info = sctx->fs_info;
2163         u64 logical;
2164         struct btrfs_device *dev;
2165
2166         logical = sblock->pagev[0]->logical;
2167         dev = sblock->pagev[0]->dev;
2168
2169         if (sblock->no_io_error_seen)
2170                 scrub_recheck_block_checksum(sblock);
2171
2172         if (!sblock->no_io_error_seen) {
2173                 spin_lock(&sctx->stat_lock);
2174                 sctx->stat.read_errors++;
2175                 spin_unlock(&sctx->stat_lock);
2176                 btrfs_err_rl_in_rcu(fs_info,
2177                         "IO error rebuilding logical %llu for dev %s",
2178                         logical, rcu_str_deref(dev->name));
2179         } else if (sblock->header_error || sblock->checksum_error) {
2180                 spin_lock(&sctx->stat_lock);
2181                 sctx->stat.uncorrectable_errors++;
2182                 spin_unlock(&sctx->stat_lock);
2183                 btrfs_err_rl_in_rcu(fs_info,
2184                         "failed to rebuild valid logical %llu for dev %s",
2185                         logical, rcu_str_deref(dev->name));
2186         } else {
2187                 scrub_write_block_to_dev_replace(sblock);
2188         }
2189
2190         if (sctx->is_dev_replace && sctx->flush_all_writes) {
2191                 mutex_lock(&sctx->wr_lock);
2192                 scrub_wr_submit(sctx);
2193                 mutex_unlock(&sctx->wr_lock);
2194         }
2195
2196         scrub_block_put(sblock);
2197         scrub_pending_bio_dec(sctx);
2198 }
2199
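/*
 * The device holding this block is missing, so its data cannot be read
 * directly.  Rebuild it from the remaining RAID56 stripes via a "missing"
 * rbio; scrub_missing_raid56_worker() then checks the result and writes it
 * to the dev-replace target.
 */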
2200 static void scrub_missing_raid56_pages(struct scrub_block *sblock)
2201 {
2202         struct scrub_ctx *sctx = sblock->sctx;
2203         struct btrfs_fs_info *fs_info = sctx->fs_info;
2204         u64 length = sblock->page_count * PAGE_SIZE;
2205         u64 logical = sblock->pagev[0]->logical;
2206         struct btrfs_bio *bbio = NULL;
2207         struct bio *bio;
2208         struct btrfs_raid_bio *rbio;
2209         int ret;
2210         int i;
2211
2212         btrfs_bio_counter_inc_blocked(fs_info);
2213         ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
2214                         &length, &bbio);
2215         if (ret || !bbio || !bbio->raid_map)
2216                 goto bbio_out;
2217
2218         if (WARN_ON(!sctx->is_dev_replace ||
2219                     !(bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
2220                 /*
2221                  * We shouldn't be scrubbing a missing device. Even for dev
2222                  * replace, we should only get here for RAID 5/6. We either
2223                  * managed to mount something with no mirrors remaining or
2224                  * there's a bug in scrub_remap_extent()/btrfs_map_block().
2225                  */
2226                 goto bbio_out;
2227         }
2228
2229         bio = btrfs_io_bio_alloc(0);
2230         bio->bi_iter.bi_sector = logical >> 9;
2231         bio->bi_private = sblock;
2232         bio->bi_end_io = scrub_missing_raid56_end_io;
2233
2234         rbio = raid56_alloc_missing_rbio(fs_info, bio, bbio, length);
2235         if (!rbio)
2236                 goto rbio_out;
2237
2238         for (i = 0; i < sblock->page_count; i++) {
2239                 struct scrub_page *spage = sblock->pagev[i];
2240
2241                 raid56_add_scrub_pages(rbio, spage->page, spage->logical);
2242         }
2243
2244         btrfs_init_work(&sblock->work, scrub_missing_raid56_worker, NULL, NULL);
2245         scrub_block_get(sblock);
2246         scrub_pending_bio_inc(sctx);
2247         raid56_submit_missing_rbio(rbio);
2248         return;
2249
2250 rbio_out:
2251         bio_put(bio);
2252 bbio_out:
2253         btrfs_bio_counter_dec(fs_info);
2254         btrfs_put_bbio(bbio);
2255         spin_lock(&sctx->stat_lock);
2256         sctx->stat.malloc_errors++;
2257         spin_unlock(&sctx->stat_lock);
2258 }
2259
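/*
 * Create a scrub_block for the range [logical, logical + len) and split it
 * into sectorsize units, one scrub_page (with its own memory page) per
 * sector.  The pages are then queued into read bios, except for blocks on a
 * missing device, which go through the RAID56 rebuild path instead.
 */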
2260 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len,
2261                        u64 physical, struct btrfs_device *dev, u64 flags,
2262                        u64 gen, int mirror_num, u8 *csum,
2263                        u64 physical_for_dev_replace)
2264 {
2265         struct scrub_block *sblock;
2266         const u32 sectorsize = sctx->fs_info->sectorsize;
2267         int index;
2268
2269         sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2270         if (!sblock) {
2271                 spin_lock(&sctx->stat_lock);
2272                 sctx->stat.malloc_errors++;
2273                 spin_unlock(&sctx->stat_lock);
2274                 return -ENOMEM;
2275         }
2276
2277         /* one ref inside this function, plus one for each page added to
2278          * a bio later on */
2279         refcount_set(&sblock->refs, 1);
2280         sblock->sctx = sctx;
2281         sblock->no_io_error_seen = 1;
2282
2283         for (index = 0; len > 0; index++) {
2284                 struct scrub_page *spage;
2285                 /*
2286                  * Here we allocate one page for each sector to scrub.
2287                  * This is fine if PAGE_SIZE == sectorsize, but costs
2288                  * more memory in the PAGE_SIZE > sectorsize case.
2289                  */
2290                 u32 l = min(sectorsize, len);
2291
2292                 spage = kzalloc(sizeof(*spage), GFP_KERNEL);
2293                 if (!spage) {
2294 leave_nomem:
2295                         spin_lock(&sctx->stat_lock);
2296                         sctx->stat.malloc_errors++;
2297                         spin_unlock(&sctx->stat_lock);
2298                         scrub_block_put(sblock);
2299                         return -ENOMEM;
2300                 }
2301                 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2302                 scrub_page_get(spage);
2303                 sblock->pagev[index] = spage;
2304                 spage->sblock = sblock;
2305                 spage->dev = dev;
2306                 spage->flags = flags;
2307                 spage->generation = gen;
2308                 spage->logical = logical;
2309                 spage->physical = physical;
2310                 spage->physical_for_dev_replace = physical_for_dev_replace;
2311                 spage->mirror_num = mirror_num;
2312                 if (csum) {
2313                         spage->have_csum = 1;
2314                         memcpy(spage->csum, csum, sctx->fs_info->csum_size);
2315                 } else {
2316                         spage->have_csum = 0;
2317                 }
2318                 sblock->page_count++;
2319                 spage->page = alloc_page(GFP_KERNEL);
2320                 if (!spage->page)
2321                         goto leave_nomem;
2322                 len -= l;
2323                 logical += l;
2324                 physical += l;
2325                 physical_for_dev_replace += l;
2326         }
2327
2328         WARN_ON(sblock->page_count == 0);
2329         if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2330                 /*
2331                  * This case should only be hit for RAID 5/6 device replace. See
2332                  * the comment in scrub_missing_raid56_pages() for details.
2333                  */
2334                 scrub_missing_raid56_pages(sblock);
2335         } else {
2336                 for (index = 0; index < sblock->page_count; index++) {
2337                         struct scrub_page *spage = sblock->pagev[index];
2338                         int ret;
2339
2340                         ret = scrub_add_page_to_rd_bio(sctx, spage);
2341                         if (ret) {
2342                                 scrub_block_put(sblock);
2343                                 return ret;
2344                         }
2345                 }
2346
2347                 if (flags & BTRFS_EXTENT_FLAG_SUPER)
2348                         scrub_submit(sctx);
2349         }
2350
2351         /* last one frees, either here or in bio completion for last page */
2352         scrub_block_put(sblock);
2353         return 0;
2354 }
2355
2356 static void scrub_bio_end_io(struct bio *bio)
2357 {
2358         struct scrub_bio *sbio = bio->bi_private;
2359         struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
2360
2361         sbio->status = bio->bi_status;
2362         sbio->bio = bio;
2363
2364         btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
2365 }
2366
2367 static void scrub_bio_end_io_worker(struct btrfs_work *work)
2368 {
2369         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2370         struct scrub_ctx *sctx = sbio->sctx;
2371         int i;
2372
2373         BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
2374         if (sbio->status) {
2375                 for (i = 0; i < sbio->page_count; i++) {
2376                         struct scrub_page *spage = sbio->pagev[i];
2377
2378                         spage->io_error = 1;
2379                         spage->sblock->no_io_error_seen = 0;
2380                 }
2381         }
2382
2383         /* now complete the scrub_block items that have all pages completed */
2384         for (i = 0; i < sbio->page_count; i++) {
2385                 struct scrub_page *spage = sbio->pagev[i];
2386                 struct scrub_block *sblock = spage->sblock;
2387
2388                 if (atomic_dec_and_test(&sblock->outstanding_pages))
2389                         scrub_block_complete(sblock);
2390                 scrub_block_put(sblock);
2391         }
2392
2393         bio_put(sbio->bio);
2394         sbio->bio = NULL;
2395         spin_lock(&sctx->list_lock);
2396         sbio->next_free = sctx->first_free;
2397         sctx->first_free = sbio->index;
2398         spin_unlock(&sctx->list_lock);
2399
2400         if (sctx->is_dev_replace && sctx->flush_all_writes) {
2401                 mutex_lock(&sctx->wr_lock);
2402                 scrub_wr_submit(sctx);
2403                 mutex_unlock(&sctx->wr_lock);
2404         }
2405
2406         scrub_pending_bio_dec(sctx);
2407 }
2408
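/*
 * Mark the sectors of [start, start + len) in the given per-stripe bitmap.
 * The byte offset is translated into a sector index within the parity
 * stripe; ranges running past the end of the stripe wrap around to the
 * beginning of the bitmap.
 */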
2409 static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2410                                        unsigned long *bitmap,
2411                                        u64 start, u32 len)
2412 {
2413         u64 offset;
2414         u32 nsectors;
2415         u32 sectorsize_bits = sparity->sctx->fs_info->sectorsize_bits;
2416
2417         if (len >= sparity->stripe_len) {
2418                 bitmap_set(bitmap, 0, sparity->nsectors);
2419                 return;
2420         }
2421
2422         start -= sparity->logic_start;
2423         start = div64_u64_rem(start, sparity->stripe_len, &offset);
2424         offset = offset >> sectorsize_bits;
2425         nsectors = len >> sectorsize_bits;
2426
2427         if (offset + nsectors <= sparity->nsectors) {
2428                 bitmap_set(bitmap, offset, nsectors);
2429                 return;
2430         }
2431
2432         bitmap_set(bitmap, offset, sparity->nsectors - offset);
2433         bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
2434 }
2435
2436 static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2437                                                    u64 start, u32 len)
2438 {
2439         __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
2440 }
2441
2442 static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2443                                                   u64 start, u32 len)
2444 {
2445         __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
2446 }
2447
2448 static void scrub_block_complete(struct scrub_block *sblock)
2449 {
2450         int corrupted = 0;
2451
2452         if (!sblock->no_io_error_seen) {
2453                 corrupted = 1;
2454                 scrub_handle_errored_block(sblock);
2455         } else {
2456                 /*
2457                  * In the dev-replace case: if the block has a checksum
2458                  * error, it is written via the repair mechanism; otherwise
2459                  * it is written to the target device here.
2460                  */
2461                 corrupted = scrub_checksum(sblock);
2462                 if (!corrupted && sblock->sctx->is_dev_replace)
2463                         scrub_write_block_to_dev_replace(sblock);
2464         }
2465
2466         if (sblock->sparity && corrupted && !sblock->data_corrected) {
2467                 u64 start = sblock->pagev[0]->logical;
2468                 u64 end = sblock->pagev[sblock->page_count - 1]->logical +
2469                           sblock->sctx->fs_info->sectorsize;
2470
2471                 ASSERT(end - start <= U32_MAX);
2472                 scrub_parity_mark_sectors_error(sblock->sparity,
2473                                                 start, end - start);
2474         }
2475 }
2476
2477 static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *sum)
2478 {
2479         sctx->stat.csum_discards += sum->len >> sctx->fs_info->sectorsize_bits;
2480         list_del(&sum->list);
2481         kfree(sum);
2482 }
2483
2484 /*
2485  * Find the desired csum for range [logical, logical + sectorsize), and store
2486  * the csum into @csum.
2487  *
2488  * The search source is sctx->csum_list, which is a pre-populated list
2489  * storing bytenr ordered csum ranges.  We're responsible for cleaning up any
2490  * range that is before @logical.
2491  *
2492  * Return 0 if there is no csum for the range.
2493  * Return 1 if there is csum for the range and copied to @csum.
2494  */
2495 static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
2496 {
2497         bool found = false;
2498
2499         while (!list_empty(&sctx->csum_list)) {
2500                 struct btrfs_ordered_sum *sum = NULL;
2501                 unsigned long index;
2502                 unsigned long num_sectors;
2503
2504                 sum = list_first_entry(&sctx->csum_list,
2505                                        struct btrfs_ordered_sum, list);
2506                 /* The current csum range is beyond our range, no csum found */
2507                 if (sum->bytenr > logical)
2508                         break;
2509
2510                 /*
2511                  * The current sum is before our bytenr. Since scrub is always
2512                  * done in bytenr order, the csum will never be used again;
2513                  * clean it up so that later calls won't bother with the range,
2514                  * and continue searching the next range.
2515                  */
2516                 if (sum->bytenr + sum->len <= logical) {
2517                         drop_csum_range(sctx, sum);
2518                         continue;
2519                 }
2520
2521                 /* Now the csum range covers our bytenr, copy the csum */
2522                 found = true;
2523                 index = (logical - sum->bytenr) >> sctx->fs_info->sectorsize_bits;
2524                 num_sectors = sum->len >> sctx->fs_info->sectorsize_bits;
2525
2526                 memcpy(csum, sum->sums + index * sctx->fs_info->csum_size,
2527                        sctx->fs_info->csum_size);
2528
2529                 /* Cleanup the range if we're at the end of the csum range */
2530                 if (index == num_sectors - 1)
2531                         drop_csum_range(sctx, sum);
2532                 break;
2533         }
2534         if (!found)
2535                 return 0;
2536         return 1;
2537 }
2538
2539 /* scrub extent tries to collect up to 64 kB for each bio */
2540 static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
2541                         u64 logical, u32 len,
2542                         u64 physical, struct btrfs_device *dev, u64 flags,
2543                         u64 gen, int mirror_num, u64 physical_for_dev_replace)
2544 {
2545         int ret;
2546         u8 csum[BTRFS_CSUM_SIZE];
2547         u32 blocksize;
2548
2549         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2550                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2551                         blocksize = map->stripe_len;
2552                 else
2553                         blocksize = sctx->fs_info->sectorsize;
2554                 spin_lock(&sctx->stat_lock);
2555                 sctx->stat.data_extents_scrubbed++;
2556                 sctx->stat.data_bytes_scrubbed += len;
2557                 spin_unlock(&sctx->stat_lock);
2558         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2559                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2560                         blocksize = map->stripe_len;
2561                 else
2562                         blocksize = sctx->fs_info->nodesize;
2563                 spin_lock(&sctx->stat_lock);
2564                 sctx->stat.tree_extents_scrubbed++;
2565                 sctx->stat.tree_bytes_scrubbed += len;
2566                 spin_unlock(&sctx->stat_lock);
2567         } else {
2568                 blocksize = sctx->fs_info->sectorsize;
2569                 WARN_ON(1);
2570         }
2571
2572         while (len) {
2573                 u32 l = min(len, blocksize);
2574                 int have_csum = 0;
2575
2576                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2577                         /* push csums to sbio */
2578                         have_csum = scrub_find_csum(sctx, logical, csum);
2579                         if (have_csum == 0)
2580                                 ++sctx->stat.no_csum;
2581                 }
2582                 ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
2583                                   mirror_num, have_csum ? csum : NULL,
2584                                   physical_for_dev_replace);
2585                 if (ret)
2586                         return ret;
2587                 len -= l;
2588                 logical += l;
2589                 physical += l;
2590                 physical_for_dev_replace += l;
2591         }
2592         return 0;
2593 }
2594
2595 static int scrub_pages_for_parity(struct scrub_parity *sparity,
2596                                   u64 logical, u32 len,
2597                                   u64 physical, struct btrfs_device *dev,
2598                                   u64 flags, u64 gen, int mirror_num, u8 *csum)
2599 {
2600         struct scrub_ctx *sctx = sparity->sctx;
2601         struct scrub_block *sblock;
2602         const u32 sectorsize = sctx->fs_info->sectorsize;
2603         int index;
2604
2605         ASSERT(IS_ALIGNED(len, sectorsize));
2606
2607         sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2608         if (!sblock) {
2609                 spin_lock(&sctx->stat_lock);
2610                 sctx->stat.malloc_errors++;
2611                 spin_unlock(&sctx->stat_lock);
2612                 return -ENOMEM;
2613         }
2614
2615         /* one ref inside this function, plus one for each page added to
2616          * a bio later on */
2617         refcount_set(&sblock->refs, 1);
2618         sblock->sctx = sctx;
2619         sblock->no_io_error_seen = 1;
2620         sblock->sparity = sparity;
2621         scrub_parity_get(sparity);
2622
2623         for (index = 0; len > 0; index++) {
2624                 struct scrub_page *spage;
2625
2626                 spage = kzalloc(sizeof(*spage), GFP_KERNEL);
2627                 if (!spage) {
2628 leave_nomem:
2629                         spin_lock(&sctx->stat_lock);
2630                         sctx->stat.malloc_errors++;
2631                         spin_unlock(&sctx->stat_lock);
2632                         scrub_block_put(sblock);
2633                         return -ENOMEM;
2634                 }
2635                 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2636                 /* For scrub block */
2637                 scrub_page_get(spage);
2638                 sblock->pagev[index] = spage;
2639                 /* For scrub parity */
2640                 scrub_page_get(spage);
2641                 list_add_tail(&spage->list, &sparity->spages);
2642                 spage->sblock = sblock;
2643                 spage->dev = dev;
2644                 spage->flags = flags;
2645                 spage->generation = gen;
2646                 spage->logical = logical;
2647                 spage->physical = physical;
2648                 spage->mirror_num = mirror_num;
2649                 if (csum) {
2650                         spage->have_csum = 1;
2651                         memcpy(spage->csum, csum, sctx->fs_info->csum_size);
2652                 } else {
2653                         spage->have_csum = 0;
2654                 }
2655                 sblock->page_count++;
2656                 spage->page = alloc_page(GFP_KERNEL);
2657                 if (!spage->page)
2658                         goto leave_nomem;
2659
2660
2661                 /* Iterate over the stripe range in sectorsize steps */
2662                 len -= sectorsize;
2663                 logical += sectorsize;
2664                 physical += sectorsize;
2665         }
2666
2667         WARN_ON(sblock->page_count == 0);
2668         for (index = 0; index < sblock->page_count; index++) {
2669                 struct scrub_page *spage = sblock->pagev[index];
2670                 int ret;
2671
2672                 ret = scrub_add_page_to_rd_bio(sctx, spage);
2673                 if (ret) {
2674                         scrub_block_put(sblock);
2675                         return ret;
2676                 }
2677         }
2678
2679         /* last one frees, either here or in bio completion for last page */
2680         scrub_block_put(sblock);
2681         return 0;
2682 }
2683
2684 static int scrub_extent_for_parity(struct scrub_parity *sparity,
2685                                    u64 logical, u32 len,
2686                                    u64 physical, struct btrfs_device *dev,
2687                                    u64 flags, u64 gen, int mirror_num)
2688 {
2689         struct scrub_ctx *sctx = sparity->sctx;
2690         int ret;
2691         u8 csum[BTRFS_CSUM_SIZE];
2692         u32 blocksize;
2693
2694         if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2695                 scrub_parity_mark_sectors_error(sparity, logical, len);
2696                 return 0;
2697         }
2698
2699         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2700                 blocksize = sparity->stripe_len;
2701         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2702                 blocksize = sparity->stripe_len;
2703         } else {
2704                 blocksize = sctx->fs_info->sectorsize;
2705                 WARN_ON(1);
2706         }
2707
2708         while (len) {
2709                 u32 l = min(len, blocksize);
2710                 int have_csum = 0;
2711
2712                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2713                         /* push csums to sbio */
2714                         have_csum = scrub_find_csum(sctx, logical, csum);
2715                         if (have_csum == 0)
2716                                 goto skip;
2717                 }
2718                 ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
2719                                              flags, gen, mirror_num,
2720                                              have_csum ? csum : NULL);
2721                 if (ret)
2722                         return ret;
2723 skip:
2724                 len -= l;
2725                 logical += l;
2726                 physical += l;
2727         }
2728         return 0;
2729 }
2730
2731 /*
2732  * Given a physical address, this calculates its
2733  * logical offset. If this is a parity stripe, it returns
2734  * the leftmost data stripe's logical offset.
2735  *
2736  * Return 0 if it is a data stripe, 1 if it is a parity stripe.
2737  */
2738 static int get_raid56_logic_offset(u64 physical, int num,
2739                                    struct map_lookup *map, u64 *offset,
2740                                    u64 *stripe_start)
2741 {
2742         int i;
2743         int j = 0;
2744         u64 stripe_nr;
2745         u64 last_offset;
2746         u32 stripe_index;
2747         u32 rot;
2748         const int data_stripes = nr_data_stripes(map);
2749
2750         last_offset = (physical - map->stripes[num].physical) * data_stripes;
2751         if (stripe_start)
2752                 *stripe_start = last_offset;
2753
2754         *offset = last_offset;
2755         for (i = 0; i < data_stripes; i++) {
2756                 *offset = last_offset + i * map->stripe_len;
2757
2758                 stripe_nr = div64_u64(*offset, map->stripe_len);
2759                 stripe_nr = div_u64(stripe_nr, data_stripes);
2760
2761                 /* Work out the disk rotation on this stripe-set */
2762                 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
2763                 /* Calculate which stripe this data is located on */
2764                 rot += i;
2765                 stripe_index = rot % map->num_stripes;
2766                 if (stripe_index == num)
2767                         return 0;
2768                 if (stripe_index < num)
2769                         j++;
2770         }
2771         *offset = last_offset + j * map->stripe_len;
2772         return 1;
2773 }
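/*
 * Illustrative user-space sketch of the rotation math in
 * get_raid56_logic_offset() above, so the mapping can be checked in
 * isolation.  The struct layout and the "relative physical" parameter are
 * simplifying assumptions for the example; this is not kernel code.
 */
#include <stdint.h>

struct example_map {
	uint64_t stripe_len;	/* e.g. 64K */
	int num_stripes;	/* data stripes plus parity stripes */
	int data_stripes;	/* e.g. num_stripes - 1 for RAID5 */
};

/* Return 0 for a data stripe, 1 for a parity stripe, like the kernel helper. */
static int example_raid56_logic_offset(uint64_t rel_physical, int num,
				       const struct example_map *map,
				       uint64_t *offset)
{
	/* rel_physical stands in for (physical - map->stripes[num].physical) */
	uint64_t last_offset = rel_physical * map->data_stripes;
	int j = 0;

	for (int i = 0; i < map->data_stripes; i++) {
		*offset = last_offset + i * map->stripe_len;

		uint64_t stripe_nr = *offset / map->stripe_len / map->data_stripes;
		/* work out the disk rotation on this stripe-set */
		int rot = (int)(stripe_nr % map->num_stripes) + i;
		int stripe_index = rot % map->num_stripes;

		if (stripe_index == num)
			return 0;
		if (stripe_index < num)
			j++;
	}
	*offset = last_offset + j * map->stripe_len;
	return 1;
}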
2774
2775 static void scrub_free_parity(struct scrub_parity *sparity)
2776 {
2777         struct scrub_ctx *sctx = sparity->sctx;
2778         struct scrub_page *curr, *next;
2779         int nbits;
2780
2781         nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
2782         if (nbits) {
2783                 spin_lock(&sctx->stat_lock);
2784                 sctx->stat.read_errors += nbits;
2785                 sctx->stat.uncorrectable_errors += nbits;
2786                 spin_unlock(&sctx->stat_lock);
2787         }
2788
2789         list_for_each_entry_safe(curr, next, &sparity->spages, list) {
2790                 list_del_init(&curr->list);
2791                 scrub_page_put(curr);
2792         }
2793
2794         kfree(sparity);
2795 }
2796
2797 static void scrub_parity_bio_endio_worker(struct btrfs_work *work)
2798 {
2799         struct scrub_parity *sparity = container_of(work, struct scrub_parity,
2800                                                     work);
2801         struct scrub_ctx *sctx = sparity->sctx;
2802
2803         scrub_free_parity(sparity);
2804         scrub_pending_bio_dec(sctx);
2805 }
2806
2807 static void scrub_parity_bio_endio(struct bio *bio)
2808 {
2809         struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
2810         struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
2811
2812         if (bio->bi_status)
2813                 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2814                           sparity->nsectors);
2815
2816         bio_put(bio);
2817
2818         btrfs_init_work(&sparity->work, scrub_parity_bio_endio_worker, NULL,
2819                         NULL);
2820         btrfs_queue_work(fs_info->scrub_parity_workers, &sparity->work);
2821 }
2822
2823 static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2824 {
2825         struct scrub_ctx *sctx = sparity->sctx;
2826         struct btrfs_fs_info *fs_info = sctx->fs_info;
2827         struct bio *bio;
2828         struct btrfs_raid_bio *rbio;
2829         struct btrfs_bio *bbio = NULL;
2830         u64 length;
2831         int ret;
2832
2833         if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
2834                            sparity->nsectors))
2835                 goto out;
2836
2837         length = sparity->logic_end - sparity->logic_start;
2838
2839         btrfs_bio_counter_inc_blocked(fs_info);
2840         ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
2841                                &length, &bbio);
2842         if (ret || !bbio || !bbio->raid_map)
2843                 goto bbio_out;
2844
2845         bio = btrfs_io_bio_alloc(0);
2846         bio->bi_iter.bi_sector = sparity->logic_start >> 9;
2847         bio->bi_private = sparity;
2848         bio->bi_end_io = scrub_parity_bio_endio;
2849
2850         rbio = raid56_parity_alloc_scrub_rbio(fs_info, bio, bbio,
2851                                               length, sparity->scrub_dev,
2852                                               sparity->dbitmap,
2853                                               sparity->nsectors);
2854         if (!rbio)
2855                 goto rbio_out;
2856
2857         scrub_pending_bio_inc(sctx);
2858         raid56_parity_submit_scrub_rbio(rbio);
2859         return;
2860
2861 rbio_out:
2862         bio_put(bio);
2863 bbio_out:
2864         btrfs_bio_counter_dec(fs_info);
2865         btrfs_put_bbio(bbio);
2866         bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2867                   sparity->nsectors);
2868         spin_lock(&sctx->stat_lock);
2869         sctx->stat.malloc_errors++;
2870         spin_unlock(&sctx->stat_lock);
2871 out:
2872         scrub_free_parity(sparity);
2873 }
2874
2875 static inline int scrub_calc_parity_bitmap_len(int nsectors)
2876 {
2877         return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long);
2878 }
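/*
 * Worked example of the sizing above, assuming 64-bit longs, a 64K stripe and
 * 4K sectors (all example values): nsectors = 64K / 4K = 16, so
 * DIV_ROUND_UP(16, 64) * sizeof(long) = 1 * 8 = 8 bytes per bitmap, and
 * scrub_raid56_parity() below allocates 2 * 8 = 16 extra bytes to hold both
 * the dbitmap and the ebitmap.
 */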
2879
2880 static void scrub_parity_get(struct scrub_parity *sparity)
2881 {
2882         refcount_inc(&sparity->refs);
2883 }
2884
2885 static void scrub_parity_put(struct scrub_parity *sparity)
2886 {
2887         if (!refcount_dec_and_test(&sparity->refs))
2888                 return;
2889
2890         scrub_parity_check_and_repair(sparity);
2891 }
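/*
 * Minimal sketch of the release pattern used by scrub_parity_put(): whoever
 * drops the last reference runs the check-and-repair step.  This user-space
 * illustration uses C11 atomics and an assumed finalize() callback; it is not
 * kernel code.
 */
#include <stdatomic.h>

struct example_obj {
	atomic_int refs;	/* starts at 1 for the creator */
};

static void example_get(struct example_obj *obj)
{
	atomic_fetch_add(&obj->refs, 1);
}

static void example_put(struct example_obj *obj,
			void (*finalize)(struct example_obj *obj))
{
	/* fetch_sub returns the old value; 1 means we dropped the last ref */
	if (atomic_fetch_sub(&obj->refs, 1) == 1)
		finalize(obj);
}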
2892
2893 static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
2894                                                   struct map_lookup *map,
2895                                                   struct btrfs_device *sdev,
2896                                                   struct btrfs_path *path,
2897                                                   u64 logic_start,
2898                                                   u64 logic_end)
2899 {
2900         struct btrfs_fs_info *fs_info = sctx->fs_info;
2901         struct btrfs_root *root = fs_info->extent_root;
2902         struct btrfs_root *csum_root = fs_info->csum_root;
2903         struct btrfs_extent_item *extent;
2904         struct btrfs_bio *bbio = NULL;
2905         u64 flags;
2906         int ret;
2907         int slot;
2908         struct extent_buffer *l;
2909         struct btrfs_key key;
2910         u64 generation;
2911         u64 extent_logical;
2912         u64 extent_physical;
2913         /* Check the comment in scrub_stripe() for why u32 is enough here */
2914         u32 extent_len;
2915         u64 mapped_length;
2916         struct btrfs_device *extent_dev;
2917         struct scrub_parity *sparity;
2918         int nsectors;
2919         int bitmap_len;
2920         int extent_mirror_num;
2921         int stop_loop = 0;
2922
2923         ASSERT(map->stripe_len <= U32_MAX);
2924         nsectors = map->stripe_len >> fs_info->sectorsize_bits;
2925         bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
2926         sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
2927                           GFP_NOFS);
2928         if (!sparity) {
2929                 spin_lock(&sctx->stat_lock);
2930                 sctx->stat.malloc_errors++;
2931                 spin_unlock(&sctx->stat_lock);
2932                 return -ENOMEM;
2933         }
2934
2935         ASSERT(map->stripe_len <= U32_MAX);
2936         sparity->stripe_len = map->stripe_len;
2937         sparity->nsectors = nsectors;
2938         sparity->sctx = sctx;
2939         sparity->scrub_dev = sdev;
2940         sparity->logic_start = logic_start;
2941         sparity->logic_end = logic_end;
2942         refcount_set(&sparity->refs, 1);
2943         INIT_LIST_HEAD(&sparity->spages);
2944         sparity->dbitmap = sparity->bitmap;
2945         sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
2946
2947         ret = 0;
2948         while (logic_start < logic_end) {
2949                 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2950                         key.type = BTRFS_METADATA_ITEM_KEY;
2951                 else
2952                         key.type = BTRFS_EXTENT_ITEM_KEY;
2953                 key.objectid = logic_start;
2954                 key.offset = (u64)-1;
2955
2956                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2957                 if (ret < 0)
2958                         goto out;
2959
2960                 if (ret > 0) {
2961                         ret = btrfs_previous_extent_item(root, path, 0);
2962                         if (ret < 0)
2963                                 goto out;
2964                         if (ret > 0) {
2965                                 btrfs_release_path(path);
2966                                 ret = btrfs_search_slot(NULL, root, &key,
2967                                                         path, 0, 0);
2968                                 if (ret < 0)
2969                                         goto out;
2970                         }
2971                 }
2972
2973                 stop_loop = 0;
2974                 while (1) {
2975                         u64 bytes;
2976
2977                         l = path->nodes[0];
2978                         slot = path->slots[0];
2979                         if (slot >= btrfs_header_nritems(l)) {
2980                                 ret = btrfs_next_leaf(root, path);
2981                                 if (ret == 0)
2982                                         continue;
2983                                 if (ret < 0)
2984                                         goto out;
2985
2986                                 stop_loop = 1;
2987                                 break;
2988                         }
2989                         btrfs_item_key_to_cpu(l, &key, slot);
2990
2991                         if (key.type != BTRFS_EXTENT_ITEM_KEY &&
2992                             key.type != BTRFS_METADATA_ITEM_KEY)
2993                                 goto next;
2994
2995                         if (key.type == BTRFS_METADATA_ITEM_KEY)
2996                                 bytes = fs_info->nodesize;
2997                         else
2998                                 bytes = key.offset;
2999
3000                         if (key.objectid + bytes <= logic_start)
3001                                 goto next;
3002
3003                         if (key.objectid >= logic_end) {
3004                                 stop_loop = 1;
3005                                 break;
3006                         }
3007
3008                         while (key.objectid >= logic_start + map->stripe_len)
3009                                 logic_start += map->stripe_len;
3010
3011                         extent = btrfs_item_ptr(l, slot,
3012                                                 struct btrfs_extent_item);
3013                         flags = btrfs_extent_flags(l, extent);
3014                         generation = btrfs_extent_generation(l, extent);
3015
3016                         if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3017                             (key.objectid < logic_start ||
3018                              key.objectid + bytes >
3019                              logic_start + map->stripe_len)) {
3020                                 btrfs_err(fs_info,
3021                                           "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
3022                                           key.objectid, logic_start);
3023                                 spin_lock(&sctx->stat_lock);
3024                                 sctx->stat.uncorrectable_errors++;
3025                                 spin_unlock(&sctx->stat_lock);
3026                                 goto next;
3027                         }
3028 again:
3029                         extent_logical = key.objectid;
3030                         ASSERT(bytes <= U32_MAX);
3031                         extent_len = bytes;
3032
3033                         if (extent_logical < logic_start) {
3034                                 extent_len -= logic_start - extent_logical;
3035                                 extent_logical = logic_start;
3036                         }
3037
3038                         if (extent_logical + extent_len >
3039                             logic_start + map->stripe_len)
3040                                 extent_len = logic_start + map->stripe_len -
3041                                              extent_logical;
3042
3043                         scrub_parity_mark_sectors_data(sparity, extent_logical,
3044                                                        extent_len);
3045
3046                         mapped_length = extent_len;
3047                         bbio = NULL;
3048                         ret = btrfs_map_block(fs_info, BTRFS_MAP_READ,
3049                                         extent_logical, &mapped_length, &bbio,
3050                                         0);
3051                         if (!ret) {
3052                                 if (!bbio || mapped_length < extent_len)
3053                                         ret = -EIO;
3054                         }
3055                         if (ret) {
3056                                 btrfs_put_bbio(bbio);
3057                                 goto out;
3058                         }
3059                         extent_physical = bbio->stripes[0].physical;
3060                         extent_mirror_num = bbio->mirror_num;
3061                         extent_dev = bbio->stripes[0].dev;
3062                         btrfs_put_bbio(bbio);
3063
3064                         ret = btrfs_lookup_csums_range(csum_root,
3065                                                 extent_logical,
3066                                                 extent_logical + extent_len - 1,
3067                                                 &sctx->csum_list, 1);
3068                         if (ret)
3069                                 goto out;
3070
3071                         ret = scrub_extent_for_parity(sparity, extent_logical,
3072                                                       extent_len,
3073                                                       extent_physical,
3074                                                       extent_dev, flags,
3075                                                       generation,
3076                                                       extent_mirror_num);
3077
3078                         scrub_free_csums(sctx);
3079
3080                         if (ret)
3081                                 goto out;
3082
3083                         if (extent_logical + extent_len <
3084                             key.objectid + bytes) {
3085                                 logic_start += map->stripe_len;
3086
3087                                 if (logic_start >= logic_end) {
3088                                         stop_loop = 1;
3089                                         break;
3090                                 }
3091
3092                                 if (logic_start < key.objectid + bytes) {
3093                                         cond_resched();
3094                                         goto again;
3095                                 }
3096                         }
3097 next:
3098                         path->slots[0]++;
3099                 }
3100
3101                 btrfs_release_path(path);
3102
3103                 if (stop_loop)
3104                         break;
3105
3106                 logic_start += map->stripe_len;
3107         }
3108 out:
3109         if (ret < 0) {
3110                 ASSERT(logic_end - logic_start <= U32_MAX);
3111                 scrub_parity_mark_sectors_error(sparity, logic_start,
3112                                                 logic_end - logic_start);
3113         }
3114         scrub_parity_put(sparity);
3115         scrub_submit(sctx);
3116         mutex_lock(&sctx->wr_lock);
3117         scrub_wr_submit(sctx);
3118         mutex_unlock(&sctx->wr_lock);
3119
3120         btrfs_release_path(path);
3121         return ret < 0 ? ret : 0;
3122 }
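/*
 * Sketch of the single-allocation bitmap split used in scrub_raid56_parity()
 * above: one buffer holds the data bitmap and the error bitmap back to back.
 * User-space illustration with assumed names and a libc allocator; not kernel
 * code.  bitmap_len is a multiple of sizeof(long), so the second half stays
 * properly aligned.
 */
#include <stdlib.h>

struct example_parity {
	unsigned long *dbitmap;
	unsigned long *ebitmap;
	unsigned long bitmap[];		/* 2 * bitmap_len bytes follow */
};

static struct example_parity *example_parity_alloc(int bitmap_len)
{
	struct example_parity *p = calloc(1, sizeof(*p) + 2 * bitmap_len);

	if (!p)
		return NULL;
	p->dbitmap = p->bitmap;
	p->ebitmap = (unsigned long *)((char *)p->bitmap + bitmap_len);
	return p;
}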
3123
3124 static void sync_replace_for_zoned(struct scrub_ctx *sctx)
3125 {
3126         if (!btrfs_is_zoned(sctx->fs_info))
3127                 return;
3128
3129         sctx->flush_all_writes = true;
3130         scrub_submit(sctx);
3131         mutex_lock(&sctx->wr_lock);
3132         scrub_wr_submit(sctx);
3133         mutex_unlock(&sctx->wr_lock);
3134
3135         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3136 }
3137
3138 static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
3139                                         u64 physical, u64 physical_end)
3140 {
3141         struct btrfs_fs_info *fs_info = sctx->fs_info;
3142         int ret = 0;
3143
3144         if (!btrfs_is_zoned(fs_info))
3145                 return 0;
3146
3147         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3148
3149         mutex_lock(&sctx->wr_lock);
3150         if (sctx->write_pointer < physical_end) {
3151                 ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical,
3152                                                     physical,
3153                                                     sctx->write_pointer);
3154                 if (ret)
3155                         btrfs_err(fs_info,
3156                                   "zoned: failed to recover write pointer");
3157         }
3158         mutex_unlock(&sctx->wr_lock);
3159         btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);
3160
3161         return ret;
3162 }
3163
3164 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
3165                                            struct map_lookup *map,
3166                                            struct btrfs_device *scrub_dev,
3167                                            int num, u64 base, u64 length,
3168                                            struct btrfs_block_group *cache)
3169 {
3170         struct btrfs_path *path, *ppath;
3171         struct btrfs_fs_info *fs_info = sctx->fs_info;
3172         struct btrfs_root *root = fs_info->extent_root;
3173         struct btrfs_root *csum_root = fs_info->csum_root;
3174         struct btrfs_extent_item *extent;
3175         struct blk_plug plug;
3176         u64 flags;
3177         int ret;
3178         int slot;
3179         u64 nstripes;
3180         struct extent_buffer *l;
3181         u64 physical;
3182         u64 logical;
3183         u64 logic_end;
3184         u64 physical_end;
3185         u64 generation;
3186         int mirror_num;
3187         struct reada_control *reada1;
3188         struct reada_control *reada2;
3189         struct btrfs_key key;
3190         struct btrfs_key key_end;
3191         u64 increment = map->stripe_len;
3192         u64 offset;
3193         u64 extent_logical;
3194         u64 extent_physical;
3195         /*
3196          * Unlike chunk length, extent length should never go beyond
3197          * BTRFS_MAX_EXTENT_SIZE, thus u32 is enough here.
3198          */
3199         u32 extent_len;
3200         u64 stripe_logical;
3201         u64 stripe_end;
3202         struct btrfs_device *extent_dev;
3203         int extent_mirror_num;
3204         int stop_loop = 0;
3205
3206         physical = map->stripes[num].physical;
3207         offset = 0;
3208         nstripes = div64_u64(length, map->stripe_len);
3209         mirror_num = 1;
3210         increment = map->stripe_len;
3211         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3212                 offset = map->stripe_len * num;
3213                 increment = map->stripe_len * map->num_stripes;
3214         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3215                 int factor = map->num_stripes / map->sub_stripes;
3216                 offset = map->stripe_len * (num / map->sub_stripes);
3217                 increment = map->stripe_len * factor;
3218                 mirror_num = num % map->sub_stripes + 1;
3219         } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
3220                 mirror_num = num % map->num_stripes + 1;
3221         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3222                 mirror_num = num % map->num_stripes + 1;
3223         } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3224                 get_raid56_logic_offset(physical, num, map, &offset, NULL);
3225                 increment = map->stripe_len * nr_data_stripes(map);
3226         }
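	/*
	 * Worked example of the geometry above, under assumed values: for
	 * RAID10 with num_stripes = 4, sub_stripes = 2, stripe_len = 64K and
	 * num = 3, factor = 2, offset = 64K * (3 / 2) = 64K, increment =
	 * 64K * 2 = 128K and mirror_num = 3 % 2 + 1 = 2.  For RAID0 with
	 * num_stripes = 3 and num = 2, offset = 128K and increment = 192K.
	 */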
3227
3228         path = btrfs_alloc_path();
3229         if (!path)
3230                 return -ENOMEM;
3231
3232         ppath = btrfs_alloc_path();
3233         if (!ppath) {
3234                 btrfs_free_path(path);
3235                 return -ENOMEM;
3236         }
3237
3238         /*
3239          * Work on the commit root. The related disk blocks are static as
3240          * long as COW is applied. This means it is safe to rewrite
3241          * them to repair disk errors without any race conditions.
3242          */
3243         path->search_commit_root = 1;
3244         path->skip_locking = 1;
3245
3246         ppath->search_commit_root = 1;
3247         ppath->skip_locking = 1;
3248         /*
3249          * Trigger the readahead for the extent tree and csum tree and wait
3250          * for completion. During readahead, the scrub is officially paused
3251          * to not hold off transaction commits.
3252          */
3253         logical = base + offset;
3254         physical_end = physical + nstripes * map->stripe_len;
3255         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3256                 get_raid56_logic_offset(physical_end, num,
3257                                         map, &logic_end, NULL);
3258                 logic_end += base;
3259         } else {
3260                 logic_end = logical + increment * nstripes;
3261         }
3262         wait_event(sctx->list_wait,
3263                    atomic_read(&sctx->bios_in_flight) == 0);
3264         scrub_blocked_if_needed(fs_info);
3265
3266         /* FIXME it might be better to start readahead at commit root */
3267         key.objectid = logical;
3268         key.type = BTRFS_EXTENT_ITEM_KEY;
3269         key.offset = (u64)0;
3270         key_end.objectid = logic_end;
3271         key_end.type = BTRFS_METADATA_ITEM_KEY;
3272         key_end.offset = (u64)-1;
3273         reada1 = btrfs_reada_add(root, &key, &key_end);
3274
3275         if (cache->flags & BTRFS_BLOCK_GROUP_DATA) {
3276                 key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3277                 key.type = BTRFS_EXTENT_CSUM_KEY;
3278                 key.offset = logical;
3279                 key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3280                 key_end.type = BTRFS_EXTENT_CSUM_KEY;
3281                 key_end.offset = logic_end;
3282                 reada2 = btrfs_reada_add(csum_root, &key, &key_end);
3283         } else {
3284                 reada2 = NULL;
3285         }
3286
3287         if (!IS_ERR(reada1))
3288                 btrfs_reada_wait(reada1);
3289         if (!IS_ERR_OR_NULL(reada2))
3290                 btrfs_reada_wait(reada2);
3291
3292
3293         /*
3294          * Collect all data csums for the stripe to avoid seeking during
3295          * the scrub. This might currently (crc32) end up being about 1MB.
3296          */
3297         blk_start_plug(&plug);
3298
3299         if (sctx->is_dev_replace &&
3300             btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) {
3301                 mutex_lock(&sctx->wr_lock);
3302                 sctx->write_pointer = physical;
3303                 mutex_unlock(&sctx->wr_lock);
3304                 sctx->flush_all_writes = true;
3305         }
3306
3307         /*
3308          * now find all extents for each stripe and scrub them
3309          */
3310         ret = 0;
3311         while (physical < physical_end) {
3312                 /*
3313                  * canceled?
3314                  */
3315                 if (atomic_read(&fs_info->scrub_cancel_req) ||
3316                     atomic_read(&sctx->cancel_req)) {
3317                         ret = -ECANCELED;
3318                         goto out;
3319                 }
3320                 /*
3321                  * check to see if we have to pause
3322                  */
3323                 if (atomic_read(&fs_info->scrub_pause_req)) {
3324                         /* push queued extents */
3325                         sctx->flush_all_writes = true;
3326                         scrub_submit(sctx);
3327                         mutex_lock(&sctx->wr_lock);
3328                         scrub_wr_submit(sctx);
3329                         mutex_unlock(&sctx->wr_lock);
3330                         wait_event(sctx->list_wait,
3331                                    atomic_read(&sctx->bios_in_flight) == 0);
3332                         sctx->flush_all_writes = false;
3333                         scrub_blocked_if_needed(fs_info);
3334                 }
3335
3336                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3337                         ret = get_raid56_logic_offset(physical, num, map,
3338                                                       &logical,
3339                                                       &stripe_logical);
3340                         logical += base;
3341                         if (ret) {
3342                                 /* it is a parity stripe */
3343                                 stripe_logical += base;
3344                                 stripe_end = stripe_logical + increment;
3345                                 ret = scrub_raid56_parity(sctx, map, scrub_dev,
3346                                                           ppath, stripe_logical,
3347                                                           stripe_end);
3348                                 if (ret)
3349                                         goto out;
3350                                 goto skip;
3351                         }
3352                 }
3353
3354                 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
3355                         key.type = BTRFS_METADATA_ITEM_KEY;
3356                 else
3357                         key.type = BTRFS_EXTENT_ITEM_KEY;
3358                 key.objectid = logical;
3359                 key.offset = (u64)-1;
3360
3361                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3362                 if (ret < 0)
3363                         goto out;
3364
3365                 if (ret > 0) {
3366                         ret = btrfs_previous_extent_item(root, path, 0);
3367                         if (ret < 0)
3368                                 goto out;
3369                         if (ret > 0) {
3370                                 /* there's no smaller item, so stick with the
3371                                  * larger one */
3372                                 btrfs_release_path(path);
3373                                 ret = btrfs_search_slot(NULL, root, &key,
3374                                                         path, 0, 0);
3375                                 if (ret < 0)
3376                                         goto out;
3377                         }
3378                 }
3379
3380                 stop_loop = 0;
3381                 while (1) {
3382                         u64 bytes;
3383
3384                         l = path->nodes[0];
3385                         slot = path->slots[0];
3386                         if (slot >= btrfs_header_nritems(l)) {
3387                                 ret = btrfs_next_leaf(root, path);
3388                                 if (ret == 0)
3389                                         continue;
3390                                 if (ret < 0)
3391                                         goto out;
3392
3393                                 stop_loop = 1;
3394                                 break;
3395                         }
3396                         btrfs_item_key_to_cpu(l, &key, slot);
3397
3398                         if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3399                             key.type != BTRFS_METADATA_ITEM_KEY)
3400                                 goto next;
3401
3402                         if (key.type == BTRFS_METADATA_ITEM_KEY)
3403                                 bytes = fs_info->nodesize;
3404                         else
3405                                 bytes = key.offset;
3406
3407                         if (key.objectid + bytes <= logical)
3408                                 goto next;
3409
3410                         if (key.objectid >= logical + map->stripe_len) {
3411                                 /* out of this device extent */
3412                                 if (key.objectid >= logic_end)
3413                                         stop_loop = 1;
3414                                 break;
3415                         }
3416
3417                         /*
3418                          * If our block group was removed in the meanwhile, just
3419                          * stop scrubbing since there is no point in continuing.
3420                          * Continuing would prevent reusing its device extents
3421                          * for new block groups for a long time.
3422                          */
3423                         spin_lock(&cache->lock);
3424                         if (cache->removed) {
3425                                 spin_unlock(&cache->lock);
3426                                 ret = 0;
3427                                 goto out;
3428                         }
3429                         spin_unlock(&cache->lock);
3430
3431                         extent = btrfs_item_ptr(l, slot,
3432                                                 struct btrfs_extent_item);
3433                         flags = btrfs_extent_flags(l, extent);
3434                         generation = btrfs_extent_generation(l, extent);
3435
3436                         if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3437                             (key.objectid < logical ||
3438                              key.objectid + bytes >
3439                              logical + map->stripe_len)) {
3440                                 btrfs_err(fs_info,
3441                                            "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
3442                                        key.objectid, logical);
3443                                 spin_lock(&sctx->stat_lock);
3444                                 sctx->stat.uncorrectable_errors++;
3445                                 spin_unlock(&sctx->stat_lock);
3446                                 goto next;
3447                         }
3448
3449 again:
3450                         extent_logical = key.objectid;
3451                         ASSERT(bytes <= U32_MAX);
3452                         extent_len = bytes;
3453
3454                         /*
3455                          * trim extent to this stripe
3456                          */
3457                         if (extent_logical < logical) {
3458                                 extent_len -= logical - extent_logical;
3459                                 extent_logical = logical;
3460                         }
3461                         if (extent_logical + extent_len >
3462                             logical + map->stripe_len) {
3463                                 extent_len = logical + map->stripe_len -
3464                                              extent_logical;
3465                         }
3466
3467                         extent_physical = extent_logical - logical + physical;
3468                         extent_dev = scrub_dev;
3469                         extent_mirror_num = mirror_num;
3470                         if (sctx->is_dev_replace)
3471                                 scrub_remap_extent(fs_info, extent_logical,
3472                                                    extent_len, &extent_physical,
3473                                                    &extent_dev,
3474                                                    &extent_mirror_num);
3475
3476                         if (flags & BTRFS_EXTENT_FLAG_DATA) {
3477                                 ret = btrfs_lookup_csums_range(csum_root,
3478                                                 extent_logical,
3479                                                 extent_logical + extent_len - 1,
3480                                                 &sctx->csum_list, 1);
3481                                 if (ret)
3482                                         goto out;
3483                         }
3484
3485                         ret = scrub_extent(sctx, map, extent_logical, extent_len,
3486                                            extent_physical, extent_dev, flags,
3487                                            generation, extent_mirror_num,
3488                                            extent_logical - logical + physical);
3489
3490                         scrub_free_csums(sctx);
3491
3492                         if (ret)
3493                                 goto out;
3494
3495                         if (sctx->is_dev_replace)
3496                                 sync_replace_for_zoned(sctx);
3497
3498                         if (extent_logical + extent_len <
3499                             key.objectid + bytes) {
3500                                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3501                                         /*
3502                                          * loop until we find next data stripe
3503                                          * or we have finished all stripes.
3504                                          */
3505 loop:
3506                                         physical += map->stripe_len;
3507                                         ret = get_raid56_logic_offset(physical,
3508                                                         num, map, &logical,
3509                                                         &stripe_logical);
3510                                         logical += base;
3511
3512                                         if (ret && physical < physical_end) {
3513                                                 stripe_logical += base;
3514                                                 stripe_end = stripe_logical +
3515                                                                 increment;
3516                                                 ret = scrub_raid56_parity(sctx,
3517                                                         map, scrub_dev, ppath,
3518                                                         stripe_logical,
3519                                                         stripe_end);
3520                                                 if (ret)
3521                                                         goto out;
3522                                                 goto loop;
3523                                         }
3524                                 } else {
3525                                         physical += map->stripe_len;
3526                                         logical += increment;
3527                                 }
3528                                 if (logical < key.objectid + bytes) {
3529                                         cond_resched();
3530                                         goto again;
3531                                 }
3532
3533                                 if (physical >= physical_end) {
3534                                         stop_loop = 1;
3535                                         break;
3536                                 }
3537                         }
3538 next:
3539                         path->slots[0]++;
3540                 }
3541                 btrfs_release_path(path);
3542 skip:
3543                 logical += increment;
3544                 physical += map->stripe_len;
3545                 spin_lock(&sctx->stat_lock);
3546                 if (stop_loop)
3547                         sctx->stat.last_physical = map->stripes[num].physical +
3548                                                    length;
3549                 else
3550                         sctx->stat.last_physical = physical;
3551                 spin_unlock(&sctx->stat_lock);
3552                 if (stop_loop)
3553                         break;
3554         }
3555 out:
3556         /* push queued extents */
3557         scrub_submit(sctx);
3558         mutex_lock(&sctx->wr_lock);
3559         scrub_wr_submit(sctx);
3560         mutex_unlock(&sctx->wr_lock);
3561
3562         blk_finish_plug(&plug);
3563         btrfs_free_path(path);
3564         btrfs_free_path(ppath);
3565
3566         if (sctx->is_dev_replace && ret >= 0) {
3567                 int ret2;
3568
3569                 ret2 = sync_write_pointer_for_zoned(sctx, base + offset,
3570                                                     map->stripes[num].physical,
3571                                                     physical_end);
3572                 if (ret2)
3573                         ret = ret2;
3574         }
3575
3576         return ret < 0 ? ret : 0;
3577 }
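/*
 * Sketch of the "trim extent to this stripe" step inside scrub_stripe():
 * clamp an extent [extent_logical, extent_logical + extent_len) to the
 * current stripe window [logical, logical + stripe_len).  The helper name is
 * an assumption for illustration; the extent is expected to overlap the
 * window, as in the caller.  Not kernel code.
 */
#include <stdint.h>

static void example_trim_to_stripe(uint64_t logical, uint64_t stripe_len,
				   uint64_t *extent_logical,
				   uint32_t *extent_len)
{
	if (*extent_logical < logical) {
		/* cut off the part before the stripe window */
		*extent_len -= logical - *extent_logical;
		*extent_logical = logical;
	}
	if (*extent_logical + *extent_len > logical + stripe_len) {
		/* cut off the part after the stripe window */
		*extent_len = logical + stripe_len - *extent_logical;
	}
}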
3578
3579 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
3580                                           struct btrfs_device *scrub_dev,
3581                                           u64 chunk_offset, u64 length,
3582                                           u64 dev_offset,
3583                                           struct btrfs_block_group *cache)
3584 {
3585         struct btrfs_fs_info *fs_info = sctx->fs_info;
3586         struct extent_map_tree *map_tree = &fs_info->mapping_tree;
3587         struct map_lookup *map;
3588         struct extent_map *em;
3589         int i;
3590         int ret = 0;
3591
3592         read_lock(&map_tree->lock);
3593         em = lookup_extent_mapping(map_tree, chunk_offset, 1);
3594         read_unlock(&map_tree->lock);
3595
3596         if (!em) {
3597                 /*
3598                  * Might have been an unused block group deleted by the cleaner
3599                  * kthread or relocation.
3600                  */
3601                 spin_lock(&cache->lock);
3602                 if (!cache->removed)
3603                         ret = -EINVAL;
3604                 spin_unlock(&cache->lock);
3605
3606                 return ret;
3607         }
3608
3609         map = em->map_lookup;
3610         if (em->start != chunk_offset)
3611                 goto out;
3612
3613         if (em->len < length)
3614                 goto out;
3615
3616         for (i = 0; i < map->num_stripes; ++i) {
3617                 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
3618                     map->stripes[i].physical == dev_offset) {
3619                         ret = scrub_stripe(sctx, map, scrub_dev, i,
3620                                            chunk_offset, length, cache);
3621                         if (ret)
3622                                 goto out;
3623                 }
3624         }
3625 out:
3626         free_extent_map(em);
3627
3628         return ret;
3629 }
3630
3631 static int finish_extent_writes_for_zoned(struct btrfs_root *root,
3632                                           struct btrfs_block_group *cache)
3633 {
3634         struct btrfs_fs_info *fs_info = cache->fs_info;
3635         struct btrfs_trans_handle *trans;
3636
3637         if (!btrfs_is_zoned(fs_info))
3638                 return 0;
3639
3640         btrfs_wait_block_group_reservations(cache);
3641         btrfs_wait_nocow_writers(cache);
3642         btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, cache->length);
3643
3644         trans = btrfs_join_transaction(root);
3645         if (IS_ERR(trans))
3646                 return PTR_ERR(trans);
3647         return btrfs_commit_transaction(trans);
3648 }
3649
3650 static noinline_for_stack
3651 int scrub_enumerate_chunks(struct scrub_ctx *sctx,
3652                            struct btrfs_device *scrub_dev, u64 start, u64 end)
3653 {
3654         struct btrfs_dev_extent *dev_extent = NULL;
3655         struct btrfs_path *path;
3656         struct btrfs_fs_info *fs_info = sctx->fs_info;
3657         struct btrfs_root *root = fs_info->dev_root;
3658         u64 length;
3659         u64 chunk_offset;
3660         int ret = 0;
3661         int ro_set;
3662         int slot;
3663         struct extent_buffer *l;
3664         struct btrfs_key key;
3665         struct btrfs_key found_key;
3666         struct btrfs_block_group *cache;
3667         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
3668
3669         path = btrfs_alloc_path();
3670         if (!path)
3671                 return -ENOMEM;
3672
3673         path->reada = READA_FORWARD;
3674         path->search_commit_root = 1;
3675         path->skip_locking = 1;
3676
3677         key.objectid = scrub_dev->devid;
3678         key.offset = 0ull;
3679         key.type = BTRFS_DEV_EXTENT_KEY;
3680
3681         while (1) {
3682                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3683                 if (ret < 0)
3684                         break;
3685                 if (ret > 0) {
3686                         if (path->slots[0] >=
3687                             btrfs_header_nritems(path->nodes[0])) {
3688                                 ret = btrfs_next_leaf(root, path);
3689                                 if (ret < 0)
3690                                         break;
3691                                 if (ret > 0) {
3692                                         ret = 0;
3693                                         break;
3694                                 }
3695                         } else {
3696                                 ret = 0;
3697                         }
3698                 }
3699
3700                 l = path->nodes[0];
3701                 slot = path->slots[0];
3702
3703                 btrfs_item_key_to_cpu(l, &found_key, slot);
3704
3705                 if (found_key.objectid != scrub_dev->devid)
3706                         break;
3707
3708                 if (found_key.type != BTRFS_DEV_EXTENT_KEY)
3709                         break;
3710
3711                 if (found_key.offset >= end)
3712                         break;
3713
3714                 if (found_key.offset < key.offset)
3715                         break;
3716
3717                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3718                 length = btrfs_dev_extent_length(l, dev_extent);
3719
3720                 if (found_key.offset + length <= start)
3721                         goto skip;
3722
3723                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3724
3725                 /*
3726                  * get a reference on the corresponding block group to prevent
3727                  * the chunk from going away while we scrub it
3728                  */
3729                 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3730
3731                 /* some chunks are removed but not committed to disk yet,
3732                  * continue scrubbing */
3733                 if (!cache)
3734                         goto skip;
3735
3736                 if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
3737                         spin_lock(&cache->lock);
3738                         if (!cache->to_copy) {
3739                                 spin_unlock(&cache->lock);
3740                                 btrfs_put_block_group(cache);
3741                                 goto skip;
3742                         }
3743                         spin_unlock(&cache->lock);
3744                 }
3745
3746                 /*
3747                  * Make sure that while we are scrubbing the corresponding block
3748                  * group doesn't get its logical address and its device extents
3749                  * reused for another block group, which can possibly be of a
3750                  * different type and different profile. We do this to prevent
3751                  * false error detections and crashes due to bogus attempts to
3752                  * repair extents.
3753                  */
3754                 spin_lock(&cache->lock);
3755                 if (cache->removed) {
3756                         spin_unlock(&cache->lock);
3757                         btrfs_put_block_group(cache);
3758                         goto skip;
3759                 }
3760                 btrfs_freeze_block_group(cache);
3761                 spin_unlock(&cache->lock);
3762
3763                 /*
3764                  * we need to call btrfs_inc_block_group_ro() with scrubs_paused,
3765                  * to avoid deadlock caused by:
3766                  * btrfs_inc_block_group_ro()
3767                  * -> btrfs_wait_for_commit()
3768                  * -> btrfs_commit_transaction()
3769                  * -> btrfs_scrub_pause()
3770                  */
3771                 scrub_pause_on(fs_info);
3772
3773                 /*
3774                  * Don't do chunk preallocation for scrub.
3775                  *
3776                  * This is especially important for SYSTEM bgs, or we can hit
3777                  * -EFBIG from btrfs_finish_chunk_alloc() like:
3778                  * 1. The only SYSTEM bg is marked RO.
3779                  *    Since SYSTEM bg is small, that's pretty common.
3780                  * 2. New SYSTEM bg will be allocated
3781                  *    Because the regular allocation path will create a new chunk.
3782                  * 3. New SYSTEM bg is empty and will get cleaned up
3783                  *    Before cleanup really happens, it's marked RO again.
3784                  * 4. Empty SYSTEM bg gets scrubbed
3785                  *    We go back to 2.
3786                  *
3787                  * This can easily boost the number of SYSTEM chunks if the cleaner
3788                  * thread can't be triggered fast enough, and use up all the space
3789                  * of btrfs_super_block::sys_chunk_array.
3790                  *
3791                  * While for dev replace, we need to try our best to mark block
3792                  * group RO, to prevent race between:
3793                  * - Write duplication
3794                  *   Contains latest data
3795                  * - Scrub copy
3796                  *   Contains data from commit tree
3797                  *
3798                  * If target block group is not marked RO, nocow writes can
3799                  * be overwritten by scrub copy, causing data corruption.
3800                  * So for dev-replace, it's not allowed to continue if a block
3801                  * group is not RO.
3802                  */
3803                 ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
3804                 if (!ret && sctx->is_dev_replace) {
3805                         ret = finish_extent_writes_for_zoned(root, cache);
3806                         if (ret) {
3807                                 btrfs_dec_block_group_ro(cache);
3808                                 scrub_pause_off(fs_info);
3809                                 btrfs_put_block_group(cache);
3810                                 break;
3811                         }
3812                 }
3813
3814                 if (ret == 0) {
3815                         ro_set = 1;
3816                 } else if (ret == -ENOSPC && !sctx->is_dev_replace) {
3817                         /*
3818                          * btrfs_inc_block_group_ro() returns -ENOSPC when it
3819                          * fails to create a new chunk for metadata.
3820                          * That is not a problem for scrub, because
3821                          * metadata is always COWed, and our scrub pauses
3822                          * transaction commits.
3823                          */
3824                         ro_set = 0;
3825                 } else if (ret == -ETXTBSY) {
3826                         btrfs_warn(fs_info,
3827                    "skipping scrub of block group %llu due to active swapfile",
3828                                    cache->start);
3829                         scrub_pause_off(fs_info);
3830                         ret = 0;
3831                         goto skip_unfreeze;
3832                 } else {
3833                         btrfs_warn(fs_info,
3834                                    "failed setting block group ro: %d", ret);
3835                         btrfs_unfreeze_block_group(cache);
3836                         btrfs_put_block_group(cache);
3837                         scrub_pause_off(fs_info);
3838                         break;
3839                 }
3840
3841                 /*
3842                  * Now the target block group is marked RO, wait for nocow writes to
3843                  * finish before dev-replace.
3844                  * COW is fine, as COW never overwrites extents in commit tree.
3845                  */
3846                 if (sctx->is_dev_replace) {
3847                         btrfs_wait_nocow_writers(cache);
3848                         btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
3849                                         cache->length);
3850                 }
3851
3852                 scrub_pause_off(fs_info);
3853                 down_write(&dev_replace->rwsem);
3854                 dev_replace->cursor_right = found_key.offset + length;
3855                 dev_replace->cursor_left = found_key.offset;
3856                 dev_replace->item_needs_writeback = 1;
3857                 up_write(&dev_replace->rwsem);
3858
3859                 ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
3860                                   found_key.offset, cache);
3861
3862                 /*
3863                  * flush, submit all pending read and write bios, afterwards
3864                  * wait for them.
3865                  * Note that in the dev replace case, a read request causes
3866                  * write requests that are submitted in the read completion
3867                  * worker. Therefore in the current situation, it is required
3868                  * that all write requests are flushed, so that all read and
3869                  * write requests are really completed when bios_in_flight
3870                  * changes to 0.
3871                  */
3872                 sctx->flush_all_writes = true;
3873                 scrub_submit(sctx);
3874                 mutex_lock(&sctx->wr_lock);
3875                 scrub_wr_submit(sctx);
3876                 mutex_unlock(&sctx->wr_lock);
3877
3878                 wait_event(sctx->list_wait,
3879                            atomic_read(&sctx->bios_in_flight) == 0);
3880
3881                 scrub_pause_on(fs_info);
3882
3883                 /*
3884                  * Must be called before we decrease @scrub_paused.
3885                  * Make sure we don't block transaction commit while
3886                  * we are waiting for pending workers to finish.
3887                  */
3888                 wait_event(sctx->list_wait,
3889                            atomic_read(&sctx->workers_pending) == 0);
3890                 sctx->flush_all_writes = false;
3891
3892                 scrub_pause_off(fs_info);
3893
3894                 if (sctx->is_dev_replace &&
3895                     !btrfs_finish_block_group_to_copy(dev_replace->srcdev,
3896                                                       cache, found_key.offset))
3897                         ro_set = 0;
3898
3899                 down_write(&dev_replace->rwsem);
3900                 dev_replace->cursor_left = dev_replace->cursor_right;
3901                 dev_replace->item_needs_writeback = 1;
3902                 up_write(&dev_replace->rwsem);
3903
3904                 if (ro_set)
3905                         btrfs_dec_block_group_ro(cache);
3906
3907                 /*
3908                  * We might have prevented the cleaner kthread from deleting
3909                  * this block group if it was already unused because we raced
3910                  * and set it to RO mode first. So add it back to the unused
3911                  * list, otherwise it might not ever be deleted unless a manual
3912                  * balance is triggered or it becomes used and unused again.
3913                  */
3914                 spin_lock(&cache->lock);
3915                 if (!cache->removed && !cache->ro && cache->reserved == 0 &&
3916                     cache->used == 0) {
3917                         spin_unlock(&cache->lock);
3918                         if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
3919                                 btrfs_discard_queue_work(&fs_info->discard_ctl,
3920                                                          cache);
3921                         else
3922                                 btrfs_mark_bg_unused(cache);
3923                 } else {
3924                         spin_unlock(&cache->lock);
3925                 }
3926 skip_unfreeze:
3927                 btrfs_unfreeze_block_group(cache);
3928                 btrfs_put_block_group(cache);
3929                 if (ret)
3930                         break;
3931                 if (sctx->is_dev_replace &&
3932                     atomic64_read(&dev_replace->num_write_errors) > 0) {
3933                         ret = -EIO;
3934                         break;
3935                 }
3936                 if (sctx->stat.malloc_errors > 0) {
3937                         ret = -ENOMEM;
3938                         break;
3939                 }
3940 skip:
3941                 key.offset = found_key.offset + length;
3942                 btrfs_release_path(path);
3943         }
3944
3945         btrfs_free_path(path);
3946
3947         return ret;
3948 }
3949
3950 static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
3951                                            struct btrfs_device *scrub_dev)
3952 {
3953         int     i;
3954         u64     bytenr;
3955         u64     gen;
3956         int     ret;
3957         struct btrfs_fs_info *fs_info = sctx->fs_info;
3958
3959         if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
3960                 return -EROFS;
3961
3962         /* Seed devices of a new filesystem have their own generation. */
3963         if (scrub_dev->fs_devices != fs_info->fs_devices)
3964                 gen = scrub_dev->generation;
3965         else
3966                 gen = fs_info->last_trans_committed;
3967
3968         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
3969                 bytenr = btrfs_sb_offset(i);
3970                 if (bytenr + BTRFS_SUPER_INFO_SIZE >
3971                     scrub_dev->commit_total_bytes)
3972                         break;
3973                 if (!btrfs_check_super_location(scrub_dev, bytenr))
3974                         continue;
3975
3976                 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
3977                                   scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
3978                                   NULL, bytenr);
3979                 if (ret)
3980                         return ret;
3981         }
3982         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3983
3984         return 0;
3985 }
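/*
 * Note on the loop above, based on the fixed btrfs superblock layout: the
 * BTRFS_SUPER_MIRROR_MAX copies live at 64KiB, 64MiB and 256GiB.  As an
 * example, on a 100GiB device only the first two copies pass the
 * commit_total_bytes check and get scrubbed.
 */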
3986
3987 static void scrub_workers_put(struct btrfs_fs_info *fs_info)
3988 {
3989         if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
3990                                         &fs_info->scrub_lock)) {
3991                 struct btrfs_workqueue *scrub_workers = NULL;
3992                 struct btrfs_workqueue *scrub_wr_comp = NULL;
3993                 struct btrfs_workqueue *scrub_parity = NULL;
3994
3995                 scrub_workers = fs_info->scrub_workers;
3996                 scrub_wr_comp = fs_info->scrub_wr_completion_workers;
3997                 scrub_parity = fs_info->scrub_parity_workers;
3998
3999                 fs_info->scrub_workers = NULL;
4000                 fs_info->scrub_wr_completion_workers = NULL;
4001                 fs_info->scrub_parity_workers = NULL;
4002                 mutex_unlock(&fs_info->scrub_lock);
4003
4004                 btrfs_destroy_workqueue(scrub_workers);
4005                 btrfs_destroy_workqueue(scrub_wr_comp);
4006                 btrfs_destroy_workqueue(scrub_parity);
4007         }
4008 }
4009
4010 /*
4011  * Get a reference count on fs_info->scrub_workers. Start workers if necessary.
4012  */
static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
                                                int is_dev_replace)
{
        struct btrfs_workqueue *scrub_workers = NULL;
        struct btrfs_workqueue *scrub_wr_comp = NULL;
        struct btrfs_workqueue *scrub_parity = NULL;
        unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
        int max_active = fs_info->thread_pool_size;
        int ret = -ENOMEM;

        if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
                return 0;

        scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub", flags,
                                              is_dev_replace ? 1 : max_active, 4);
        if (!scrub_workers)
                goto fail_scrub_workers;

        scrub_wr_comp = btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
                                              max_active, 2);
        if (!scrub_wr_comp)
                goto fail_scrub_wr_completion_workers;

        scrub_parity = btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
                                             max_active, 2);
        if (!scrub_parity)
                goto fail_scrub_parity_workers;

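        /*
         * Check again under scrub_lock: if we are still the first user,
         * install the freshly allocated workqueues; otherwise another thread
         * beat us to it and ours are destroyed below.
         */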
        mutex_lock(&fs_info->scrub_lock);
        if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
                ASSERT(fs_info->scrub_workers == NULL &&
                       fs_info->scrub_wr_completion_workers == NULL &&
                       fs_info->scrub_parity_workers == NULL);
                fs_info->scrub_workers = scrub_workers;
                fs_info->scrub_wr_completion_workers = scrub_wr_comp;
                fs_info->scrub_parity_workers = scrub_parity;
                refcount_set(&fs_info->scrub_workers_refcnt, 1);
                mutex_unlock(&fs_info->scrub_lock);
                return 0;
        }
        /* Another thread raced in and created the workers for us. */
        refcount_inc(&fs_info->scrub_workers_refcnt);
        mutex_unlock(&fs_info->scrub_lock);

        ret = 0;
        btrfs_destroy_workqueue(scrub_parity);
fail_scrub_parity_workers:
        btrfs_destroy_workqueue(scrub_wr_comp);
fail_scrub_wr_completion_workers:
        btrfs_destroy_workqueue(scrub_workers);
fail_scrub_workers:
        return ret;
}

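/*
 * Scrub a range of a single device, optionally as part of a device replace.
 *
 * @devid:          device to scrub
 * @start, @end:    byte range of the device to scrub
 * @progress:       if non-NULL, receives the final scrub statistics
 * @readonly:       detect errors but do not repair them
 * @is_dev_replace: run in dev-replace mode
 *
 * Returns 0 on success or a negative errno (e.g. -EINPROGRESS if a scrub or
 * device replace is already running on the device).
 */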
int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
                    u64 end, struct btrfs_scrub_progress *progress,
                    int readonly, int is_dev_replace)
{
        struct scrub_ctx *sctx;
        int ret;
        struct btrfs_device *dev;
        unsigned int nofs_flag;

        if (btrfs_fs_closing(fs_info))
                return -EAGAIN;

        if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
                /*
                 * In this case scrub is unable to calculate the checksum
                 * the way it is currently implemented. Do not handle this
                 * situation at all because it should never happen.
                 */
                btrfs_err(fs_info,
                           "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
                       fs_info->nodesize,
                       BTRFS_STRIPE_LEN);
                return -EINVAL;
        }

        if (fs_info->nodesize >
            PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
            fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
                /*
                 * This would exhaust the array bounds of the pagev member
                 * in struct scrub_block.
                 */
                btrfs_err(fs_info,
                          "scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
                       fs_info->nodesize,
                       SCRUB_MAX_PAGES_PER_BLOCK,
                       fs_info->sectorsize,
                       SCRUB_MAX_PAGES_PER_BLOCK);
                return -EINVAL;
        }

        /* Allocate outside of device_list_mutex */
        sctx = scrub_setup_ctx(fs_info, is_dev_replace);
        if (IS_ERR(sctx))
                return PTR_ERR(sctx);

        ret = scrub_workers_get(fs_info, is_dev_replace);
        if (ret)
                goto out_free_ctx;

        mutex_lock(&fs_info->fs_devices->device_list_mutex);
        dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
        if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
                     !is_dev_replace)) {
                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
                ret = -ENODEV;
                goto out;
        }

        if (!is_dev_replace && !readonly &&
            !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
                btrfs_err_in_rcu(fs_info,
                        "scrub on devid %llu: filesystem on %s is not writable",
                                 devid, rcu_str_deref(dev->name));
                ret = -EROFS;
                goto out;
        }

        mutex_lock(&fs_info->scrub_lock);
        if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
            test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
                mutex_unlock(&fs_info->scrub_lock);
                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
                ret = -EIO;
                goto out;
        }

        down_read(&fs_info->dev_replace.rwsem);
        if (dev->scrub_ctx ||
            (!is_dev_replace &&
             btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
                up_read(&fs_info->dev_replace.rwsem);
                mutex_unlock(&fs_info->scrub_lock);
                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
                ret = -EINPROGRESS;
                goto out;
        }
        up_read(&fs_info->dev_replace.rwsem);

        sctx->readonly = readonly;
        dev->scrub_ctx = sctx;
        mutex_unlock(&fs_info->fs_devices->device_list_mutex);

        /*
         * By checking @scrub_pause_req here, we can avoid a race between
         * transaction commit and scrubbing.
         */
        __scrub_blocked_if_needed(fs_info);
        atomic_inc(&fs_info->scrubs_running);
        mutex_unlock(&fs_info->scrub_lock);

        /*
         * In order to avoid a deadlock with reclaim when there is a
         * transaction trying to pause scrub, make sure we use GFP_NOFS for
         * all the allocations done at scrub_pages() and
         * scrub_pages_for_parity() invoked by our callees. The pausing
         * request is done when the transaction commit starts, and it blocks
         * the transaction until scrub is paused (done at specific points in
         * scrub_stripe() or right above, before incrementing
         * fs_info->scrubs_running).
         */
        nofs_flag = memalloc_nofs_save();
        if (!is_dev_replace) {
                btrfs_info(fs_info, "scrub: started on devid %llu", devid);
                /*
                 * By holding the device list mutex, we avoid racing with
                 * super block writes that can be kicked off by a log tree
                 * sync.
                 */
                mutex_lock(&fs_info->fs_devices->device_list_mutex);
                ret = scrub_supers(sctx, dev);
                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
        }

        if (!ret)
                ret = scrub_enumerate_chunks(sctx, dev, start, end);
        memalloc_nofs_restore(nofs_flag);

        wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
        atomic_dec(&fs_info->scrubs_running);
        wake_up(&fs_info->scrub_pause_wait);

        wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);

        if (progress)
                memcpy(progress, &sctx->stat, sizeof(*progress));

        if (!is_dev_replace)
                btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
                        ret ? "not finished" : "finished", devid, ret);

        mutex_lock(&fs_info->scrub_lock);
        dev->scrub_ctx = NULL;
        mutex_unlock(&fs_info->scrub_lock);

        scrub_workers_put(fs_info);
        scrub_put_ctx(sctx);

        return ret;
out:
        scrub_workers_put(fs_info);
out_free_ctx:
        scrub_free_ctx(sctx);

        return ret;
}

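/*
 * Request all running scrubs to pause and wait until they have reached a
 * pause point. Balanced by btrfs_scrub_continue().
 */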
void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
{
        mutex_lock(&fs_info->scrub_lock);
        atomic_inc(&fs_info->scrub_pause_req);
        while (atomic_read(&fs_info->scrubs_paused) !=
               atomic_read(&fs_info->scrubs_running)) {
                mutex_unlock(&fs_info->scrub_lock);
                wait_event(fs_info->scrub_pause_wait,
                           atomic_read(&fs_info->scrubs_paused) ==
                           atomic_read(&fs_info->scrubs_running));
                mutex_lock(&fs_info->scrub_lock);
        }
        mutex_unlock(&fs_info->scrub_lock);
}

void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
{
        atomic_dec(&fs_info->scrub_pause_req);
        wake_up(&fs_info->scrub_pause_wait);
}

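/*
 * Cancel all scrubs running on the filesystem and wait for them to finish.
 * Returns -ENOTCONN if no scrub is currently running.
 */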
int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
{
        mutex_lock(&fs_info->scrub_lock);
        if (!atomic_read(&fs_info->scrubs_running)) {
                mutex_unlock(&fs_info->scrub_lock);
                return -ENOTCONN;
        }

        atomic_inc(&fs_info->scrub_cancel_req);
        while (atomic_read(&fs_info->scrubs_running)) {
                mutex_unlock(&fs_info->scrub_lock);
                wait_event(fs_info->scrub_pause_wait,
                           atomic_read(&fs_info->scrubs_running) == 0);
                mutex_lock(&fs_info->scrub_lock);
        }
        atomic_dec(&fs_info->scrub_cancel_req);
        mutex_unlock(&fs_info->scrub_lock);

        return 0;
}

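/*
 * Cancel the scrub running on a single device and wait until its scrub
 * context has been detached. Returns -ENOTCONN if the device is not being
 * scrubbed.
 */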
int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
{
        struct btrfs_fs_info *fs_info = dev->fs_info;
        struct scrub_ctx *sctx;

        mutex_lock(&fs_info->scrub_lock);
        sctx = dev->scrub_ctx;
        if (!sctx) {
                mutex_unlock(&fs_info->scrub_lock);
                return -ENOTCONN;
        }
        atomic_inc(&sctx->cancel_req);
        while (dev->scrub_ctx) {
                mutex_unlock(&fs_info->scrub_lock);
                wait_event(fs_info->scrub_pause_wait,
                           dev->scrub_ctx == NULL);
                mutex_lock(&fs_info->scrub_lock);
        }
        mutex_unlock(&fs_info->scrub_lock);

        return 0;
}

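/*
 * Copy the current scrub statistics of the device @devid into @progress.
 * Returns -ENODEV if the device does not exist and -ENOTCONN if it is not
 * being scrubbed.
 */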
int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
                         struct btrfs_scrub_progress *progress)
{
        struct btrfs_device *dev;
        struct scrub_ctx *sctx = NULL;

        mutex_lock(&fs_info->fs_devices->device_list_mutex);
        dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
        if (dev)
                sctx = dev->scrub_ctx;
        if (sctx)
                memcpy(progress, &sctx->stat, sizeof(*progress));
        mutex_unlock(&fs_info->fs_devices->device_list_mutex);

        return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
}

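/*
 * Map @extent_logical to the physical offset, device and mirror number of
 * the first stripe returned by btrfs_map_block(). On failure the output
 * parameters are left untouched.
 */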
static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
                               u64 extent_logical, u32 extent_len,
                               u64 *extent_physical,
                               struct btrfs_device **extent_dev,
                               int *extent_mirror_num)
{
        u64 mapped_length;
        struct btrfs_bio *bbio = NULL;
        int ret;

        mapped_length = extent_len;
        ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
                              &mapped_length, &bbio, 0);
        if (ret || !bbio || mapped_length < extent_len ||
            !bbio->stripes[0].dev->bdev) {
                btrfs_put_bbio(bbio);
                return;
        }

        *extent_physical = bbio->stripes[0].physical;
        *extent_mirror_num = bbio->mirror_num;
        *extent_dev = bbio->stripes[0].dev;
        btrfs_put_bbio(bbio);
}