[platform/kernel/linux-rpi.git] fs/btrfs/scrub.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
4  */
5
6 #include <linux/blkdev.h>
7 #include <linux/ratelimit.h>
8 #include <linux/sched/mm.h>
9 #include <crypto/hash.h>
10 #include "ctree.h"
11 #include "discard.h"
12 #include "volumes.h"
13 #include "disk-io.h"
14 #include "ordered-data.h"
15 #include "transaction.h"
16 #include "backref.h"
17 #include "extent_io.h"
18 #include "dev-replace.h"
19 #include "check-integrity.h"
20 #include "rcu-string.h"
21 #include "raid56.h"
22 #include "block-group.h"
23 #include "zoned.h"
24
25 /*
26  * This is only the first step towards a full-featured scrub. It reads all
27  * extents and super blocks and verifies the checksums. In case a bad checksum
28  * is found or the extent cannot be read, good data will be written back if
29  * any can be found.
30  *
31  * Future enhancements:
32  *  - In case an unrepairable extent is encountered, track which files are
33  *    affected and report them
34  *  - track and record media errors, throw out bad devices
35  *  - add a mode to also read unallocated space
36  */
37
38 struct scrub_block;
39 struct scrub_ctx;
40
41 /*
42  * The following three values only influence performance.
43  * The last one configures the number of parallel and outstanding I/O
44  * operations. The first two values configure an upper limit for the number
45  * of (dynamically allocated) pages that are added to a bio.
46  */
47 #define SCRUB_PAGES_PER_RD_BIO  32      /* 128k per bio */
48 #define SCRUB_PAGES_PER_WR_BIO  32      /* 128k per bio */
49 #define SCRUB_BIOS_PER_SCTX     64      /* 8MB per device in flight */
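/*
 * Worked arithmetic behind the comments above (an added illustration, assuming
 * 4 KiB pages): 32 pages * 4 KiB = 128 KiB per bio, and 64 bios * 128 KiB
 * = 8 MiB of I/O in flight per device.
 */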
50
51 /*
52  * the following value times PAGE_SIZE needs to be large enough to match the
53  * largest node/leaf/sector size that shall be supported.
54  * Values larger than BTRFS_STRIPE_LEN are not supported.
55  */
56 #define SCRUB_MAX_PAGES_PER_BLOCK       16      /* 64k per node/leaf/sector */
57
58 struct scrub_recover {
59         refcount_t              refs;
60         struct btrfs_io_context *bioc;
61         u64                     map_length;
62 };
63
64 struct scrub_page {
65         struct scrub_block      *sblock;
66         struct page             *page;
67         struct btrfs_device     *dev;
68         struct list_head        list;
69         u64                     flags;  /* extent flags */
70         u64                     generation;
71         u64                     logical;
72         u64                     physical;
73         u64                     physical_for_dev_replace;
74         atomic_t                refs;
75         u8                      mirror_num;
76         unsigned int            have_csum:1;
77         unsigned int            io_error:1;
78         u8                      csum[BTRFS_CSUM_SIZE];
79
80         struct scrub_recover    *recover;
81 };
82
83 struct scrub_bio {
84         int                     index;
85         struct scrub_ctx        *sctx;
86         struct btrfs_device     *dev;
87         struct bio              *bio;
88         blk_status_t            status;
89         u64                     logical;
90         u64                     physical;
91 #if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
92         struct scrub_page       *pagev[SCRUB_PAGES_PER_WR_BIO];
93 #else
94         struct scrub_page       *pagev[SCRUB_PAGES_PER_RD_BIO];
95 #endif
96         int                     page_count;
97         int                     next_free;
98         struct btrfs_work       work;
99 };
100
101 struct scrub_block {
102         struct scrub_page       *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
103         int                     page_count;
104         atomic_t                outstanding_pages;
105         refcount_t              refs; /* free mem on transition to zero */
106         struct scrub_ctx        *sctx;
107         struct scrub_parity     *sparity;
108         struct {
109                 unsigned int    header_error:1;
110                 unsigned int    checksum_error:1;
111                 unsigned int    no_io_error_seen:1;
112                 unsigned int    generation_error:1; /* also sets header_error */
113
114                 /* The following is for the data used to check parity, */
115                 /* i.e. the data that has a checksum. */
116                 unsigned int    data_corrected:1;
117         };
118         struct btrfs_work       work;
119 };
120
121 /* Used for the chunks with parity stripes, such as RAID5/6 */
122 struct scrub_parity {
123         struct scrub_ctx        *sctx;
124
125         struct btrfs_device     *scrub_dev;
126
127         u64                     logic_start;
128
129         u64                     logic_end;
130
131         int                     nsectors;
132
133         u32                     stripe_len;
134
135         refcount_t              refs;
136
137         struct list_head        spages;
138
139         /* Work of parity check and repair */
140         struct btrfs_work       work;
141
142         /* Mark the parity blocks which have data */
143         unsigned long           *dbitmap;
144
145         /*
146          * Mark the parity blocks which have data, but for which errors
147          * happened when reading or checking that data
148          */
149         unsigned long           *ebitmap;
150
151         unsigned long           bitmap[];
152 };
153
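/*
 * Added note (not in the original file): in struct scrub_ctx below, the bios[]
 * entries form a free list chained through scrub_bio::next_free and headed by
 * first_free, while curr is the index of the read bio currently being filled
 * (-1 when none), as set up in scrub_setup_ctx().
 */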
154 struct scrub_ctx {
155         struct scrub_bio        *bios[SCRUB_BIOS_PER_SCTX];
156         struct btrfs_fs_info    *fs_info;
157         int                     first_free;
158         int                     curr;
159         atomic_t                bios_in_flight;
160         atomic_t                workers_pending;
161         spinlock_t              list_lock;
162         wait_queue_head_t       list_wait;
163         struct list_head        csum_list;
164         atomic_t                cancel_req;
165         int                     readonly;
166         int                     pages_per_rd_bio;
167
168         /* State of IO submission throttling affecting the associated device */
169         ktime_t                 throttle_deadline;
170         u64                     throttle_sent;
171
172         int                     is_dev_replace;
173         u64                     write_pointer;
174
175         struct scrub_bio        *wr_curr_bio;
176         struct mutex            wr_lock;
177         int                     pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
178         struct btrfs_device     *wr_tgtdev;
179         bool                    flush_all_writes;
180
181         /*
182          * statistics
183          */
184         struct btrfs_scrub_progress stat;
185         spinlock_t              stat_lock;
186
187         /*
188          * Use a ref counter to avoid use-after-free issues. Scrub workers
189          * decrement bios_in_flight and workers_pending and then do a wakeup
190          * on the list_wait wait queue. We must ensure the main scrub task
191          * doesn't free the scrub context before or while the workers are
192          * doing the wakeup() call.
193          */
194         refcount_t              refs;
195 };
196
197 struct scrub_warning {
198         struct btrfs_path       *path;
199         u64                     extent_item_size;
200         const char              *errstr;
201         u64                     physical;
202         u64                     logical;
203         struct btrfs_device     *dev;
204 };
205
206 struct full_stripe_lock {
207         struct rb_node node;
208         u64 logical;
209         u64 refs;
210         struct mutex mutex;
211 };
212
213 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
214                                      struct scrub_block *sblocks_for_recheck);
215 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
216                                 struct scrub_block *sblock,
217                                 int retry_failed_mirror);
218 static void scrub_recheck_block_checksum(struct scrub_block *sblock);
219 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
220                                              struct scrub_block *sblock_good);
221 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
222                                             struct scrub_block *sblock_good,
223                                             int page_num, int force_write);
224 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
225 static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
226                                            int page_num);
227 static int scrub_checksum_data(struct scrub_block *sblock);
228 static int scrub_checksum_tree_block(struct scrub_block *sblock);
229 static int scrub_checksum_super(struct scrub_block *sblock);
230 static void scrub_block_put(struct scrub_block *sblock);
231 static void scrub_page_get(struct scrub_page *spage);
232 static void scrub_page_put(struct scrub_page *spage);
233 static void scrub_parity_get(struct scrub_parity *sparity);
234 static void scrub_parity_put(struct scrub_parity *sparity);
235 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len,
236                        u64 physical, struct btrfs_device *dev, u64 flags,
237                        u64 gen, int mirror_num, u8 *csum,
238                        u64 physical_for_dev_replace);
239 static void scrub_bio_end_io(struct bio *bio);
240 static void scrub_bio_end_io_worker(struct btrfs_work *work);
241 static void scrub_block_complete(struct scrub_block *sblock);
242 static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
243                                u64 extent_logical, u32 extent_len,
244                                u64 *extent_physical,
245                                struct btrfs_device **extent_dev,
246                                int *extent_mirror_num);
247 static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
248                                     struct scrub_page *spage);
249 static void scrub_wr_submit(struct scrub_ctx *sctx);
250 static void scrub_wr_bio_end_io(struct bio *bio);
251 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
252 static void scrub_put_ctx(struct scrub_ctx *sctx);
253
254 static inline int scrub_is_page_on_raid56(struct scrub_page *spage)
255 {
256         return spage->recover &&
257                (spage->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
258 }
259
260 static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
261 {
262         refcount_inc(&sctx->refs);
263         atomic_inc(&sctx->bios_in_flight);
264 }
265
266 static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
267 {
268         atomic_dec(&sctx->bios_in_flight);
269         wake_up(&sctx->list_wait);
270         scrub_put_ctx(sctx);
271 }
272
273 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
274 {
275         while (atomic_read(&fs_info->scrub_pause_req)) {
276                 mutex_unlock(&fs_info->scrub_lock);
277                 wait_event(fs_info->scrub_pause_wait,
278                    atomic_read(&fs_info->scrub_pause_req) == 0);
279                 mutex_lock(&fs_info->scrub_lock);
280         }
281 }
282
283 static void scrub_pause_on(struct btrfs_fs_info *fs_info)
284 {
285         atomic_inc(&fs_info->scrubs_paused);
286         wake_up(&fs_info->scrub_pause_wait);
287 }
288
289 static void scrub_pause_off(struct btrfs_fs_info *fs_info)
290 {
291         mutex_lock(&fs_info->scrub_lock);
292         __scrub_blocked_if_needed(fs_info);
293         atomic_dec(&fs_info->scrubs_paused);
294         mutex_unlock(&fs_info->scrub_lock);
295
296         wake_up(&fs_info->scrub_pause_wait);
297 }
298
299 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
300 {
301         scrub_pause_on(fs_info);
302         scrub_pause_off(fs_info);
303 }
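/*
 * Clarifying note (added, not in the original file): the helpers above
 * implement the scrub "pause point" pattern: scrub_blocked_if_needed() briefly
 * reports this scrub as paused and, if a pause was requested (for example
 * around a transaction commit), blocks until the request is withdrawn.
 */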
304
305 /*
306  * Insert new full stripe lock into full stripe locks tree
307  *
308  * Return pointer to existing or newly inserted full_stripe_lock structure if
309  * everything works well.
310  * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
311  *
312  * NOTE: caller must hold full_stripe_locks_root->lock before calling this
313  * function
314  */
315 static struct full_stripe_lock *insert_full_stripe_lock(
316                 struct btrfs_full_stripe_locks_tree *locks_root,
317                 u64 fstripe_logical)
318 {
319         struct rb_node **p;
320         struct rb_node *parent = NULL;
321         struct full_stripe_lock *entry;
322         struct full_stripe_lock *ret;
323
324         lockdep_assert_held(&locks_root->lock);
325
326         p = &locks_root->root.rb_node;
327         while (*p) {
328                 parent = *p;
329                 entry = rb_entry(parent, struct full_stripe_lock, node);
330                 if (fstripe_logical < entry->logical) {
331                         p = &(*p)->rb_left;
332                 } else if (fstripe_logical > entry->logical) {
333                         p = &(*p)->rb_right;
334                 } else {
335                         entry->refs++;
336                         return entry;
337                 }
338         }
339
340         /*
341          * Insert new lock.
342          */
343         ret = kmalloc(sizeof(*ret), GFP_KERNEL);
344         if (!ret)
345                 return ERR_PTR(-ENOMEM);
346         ret->logical = fstripe_logical;
347         ret->refs = 1;
348         mutex_init(&ret->mutex);
349
350         rb_link_node(&ret->node, parent, p);
351         rb_insert_color(&ret->node, &locks_root->root);
352         return ret;
353 }
354
355 /*
356  * Search for a full stripe lock of a block group
357  *
358  * Return pointer to existing full stripe lock if found
359  * Return NULL if not found
360  */
361 static struct full_stripe_lock *search_full_stripe_lock(
362                 struct btrfs_full_stripe_locks_tree *locks_root,
363                 u64 fstripe_logical)
364 {
365         struct rb_node *node;
366         struct full_stripe_lock *entry;
367
368         lockdep_assert_held(&locks_root->lock);
369
370         node = locks_root->root.rb_node;
371         while (node) {
372                 entry = rb_entry(node, struct full_stripe_lock, node);
373                 if (fstripe_logical < entry->logical)
374                         node = node->rb_left;
375                 else if (fstripe_logical > entry->logical)
376                         node = node->rb_right;
377                 else
378                         return entry;
379         }
380         return NULL;
381 }
382
383 /*
384  * Helper to get full stripe logical from a normal bytenr.
385  *
386  * Caller must ensure @cache is a RAID56 block group.
387  */
388 static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr)
389 {
390         u64 ret;
391
392         /*
393          * Due to chunk item size limit, full stripe length should not be
394          * larger than U32_MAX. Just a sanity check here.
395          */
396         WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
397
398         /*
399          * round_down() can only handle power of 2, while RAID56 full
400          * stripe length can be 64KiB * n, so we need to manually round down.
401          */
402         ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) *
403                         cache->full_stripe_len + cache->start;
404         return ret;
405 }
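/*
 * Worked example (an added illustration, not in the original file): with
 * cache->start == 1 GiB and full_stripe_len == 128 KiB, a bytenr of
 * 1 GiB + 300 KiB maps to the full stripe starting at 1 GiB + 256 KiB, since
 * div64_u64(300 KiB, 128 KiB) == 2 and 2 * 128 KiB + 1 GiB == 1 GiB + 256 KiB.
 */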
406
407 /*
408  * Lock a full stripe to avoid concurrent recovery and read
409  *
410  * It's only used for profiles with parity (RAID5/6); for other profiles it
411  * does nothing.
412  *
413  * Return 0 if we locked the full stripe covering @bytenr, with a mutex held.
414  * The caller must then call unlock_full_stripe() in the same context.
415  *
416  * Return <0 on error.
417  */
418 static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
419                             bool *locked_ret)
420 {
421         struct btrfs_block_group *bg_cache;
422         struct btrfs_full_stripe_locks_tree *locks_root;
423         struct full_stripe_lock *existing;
424         u64 fstripe_start;
425         int ret = 0;
426
427         *locked_ret = false;
428         bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
429         if (!bg_cache) {
430                 ASSERT(0);
431                 return -ENOENT;
432         }
433
434         /* Profiles not based on parity don't need full stripe lock */
435         if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
436                 goto out;
437         locks_root = &bg_cache->full_stripe_locks_root;
438
439         fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
440
441         /* Now insert the full stripe lock */
442         mutex_lock(&locks_root->lock);
443         existing = insert_full_stripe_lock(locks_root, fstripe_start);
444         mutex_unlock(&locks_root->lock);
445         if (IS_ERR(existing)) {
446                 ret = PTR_ERR(existing);
447                 goto out;
448         }
449         mutex_lock(&existing->mutex);
450         *locked_ret = true;
451 out:
452         btrfs_put_block_group(bg_cache);
453         return ret;
454 }
455
456 /*
457  * Unlock a full stripe.
458  *
459  * NOTE: The caller must ensure this is called from the same context as the
460  * corresponding lock_full_stripe().
461  *
462  * Return 0 if we unlocked the full stripe without problem.
463  * Return <0 on error.
464  */
465 static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
466                               bool locked)
467 {
468         struct btrfs_block_group *bg_cache;
469         struct btrfs_full_stripe_locks_tree *locks_root;
470         struct full_stripe_lock *fstripe_lock;
471         u64 fstripe_start;
472         bool freeit = false;
473         int ret = 0;
474
475         /* If we didn't acquire full stripe lock, no need to continue */
476         if (!locked)
477                 return 0;
478
479         bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
480         if (!bg_cache) {
481                 ASSERT(0);
482                 return -ENOENT;
483         }
484         if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
485                 goto out;
486
487         locks_root = &bg_cache->full_stripe_locks_root;
488         fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
489
490         mutex_lock(&locks_root->lock);
491         fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
492         /* Unpaired unlock_full_stripe() detected */
493         if (!fstripe_lock) {
494                 WARN_ON(1);
495                 ret = -ENOENT;
496                 mutex_unlock(&locks_root->lock);
497                 goto out;
498         }
499
500         if (fstripe_lock->refs == 0) {
501                 WARN_ON(1);
502                 btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
503                         fstripe_lock->logical);
504         } else {
505                 fstripe_lock->refs--;
506         }
507
508         if (fstripe_lock->refs == 0) {
509                 rb_erase(&fstripe_lock->node, &locks_root->root);
510                 freeit = true;
511         }
512         mutex_unlock(&locks_root->lock);
513
514         mutex_unlock(&fstripe_lock->mutex);
515         if (freeit)
516                 kfree(fstripe_lock);
517 out:
518         btrfs_put_block_group(bg_cache);
519         return ret;
520 }
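/*
 * Minimal usage sketch of the locking pair above (an added illustration, not
 * part of the original file), mirroring how scrub_handle_errored_block()
 * uses it:
 *
 *	bool locked;
 *	int ret;
 *
 *	ret = lock_full_stripe(fs_info, logical, &locked);
 *	if (ret < 0)
 *		return ret;
 *	...recheck and repair the blocks covered by the full stripe...
 *	ret = unlock_full_stripe(fs_info, logical, locked);
 */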
521
522 static void scrub_free_csums(struct scrub_ctx *sctx)
523 {
524         while (!list_empty(&sctx->csum_list)) {
525                 struct btrfs_ordered_sum *sum;
526                 sum = list_first_entry(&sctx->csum_list,
527                                        struct btrfs_ordered_sum, list);
528                 list_del(&sum->list);
529                 kfree(sum);
530         }
531 }
532
533 static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
534 {
535         int i;
536
537         if (!sctx)
538                 return;
539
540         /* this can happen when scrub is cancelled */
541         if (sctx->curr != -1) {
542                 struct scrub_bio *sbio = sctx->bios[sctx->curr];
543
544                 for (i = 0; i < sbio->page_count; i++) {
545                         WARN_ON(!sbio->pagev[i]->page);
546                         scrub_block_put(sbio->pagev[i]->sblock);
547                 }
548                 bio_put(sbio->bio);
549         }
550
551         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
552                 struct scrub_bio *sbio = sctx->bios[i];
553
554                 if (!sbio)
555                         break;
556                 kfree(sbio);
557         }
558
559         kfree(sctx->wr_curr_bio);
560         scrub_free_csums(sctx);
561         kfree(sctx);
562 }
563
564 static void scrub_put_ctx(struct scrub_ctx *sctx)
565 {
566         if (refcount_dec_and_test(&sctx->refs))
567                 scrub_free_ctx(sctx);
568 }
569
570 static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
571                 struct btrfs_fs_info *fs_info, int is_dev_replace)
572 {
573         struct scrub_ctx *sctx;
574         int             i;
575
576         sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
577         if (!sctx)
578                 goto nomem;
579         refcount_set(&sctx->refs, 1);
580         sctx->is_dev_replace = is_dev_replace;
581         sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
582         sctx->curr = -1;
583         sctx->fs_info = fs_info;
584         INIT_LIST_HEAD(&sctx->csum_list);
585         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
586                 struct scrub_bio *sbio;
587
588                 sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
589                 if (!sbio)
590                         goto nomem;
591                 sctx->bios[i] = sbio;
592
593                 sbio->index = i;
594                 sbio->sctx = sctx;
595                 sbio->page_count = 0;
596                 btrfs_init_work(&sbio->work, scrub_bio_end_io_worker, NULL,
597                                 NULL);
598
599                 if (i != SCRUB_BIOS_PER_SCTX - 1)
600                         sctx->bios[i]->next_free = i + 1;
601                 else
602                         sctx->bios[i]->next_free = -1;
603         }
604         sctx->first_free = 0;
605         atomic_set(&sctx->bios_in_flight, 0);
606         atomic_set(&sctx->workers_pending, 0);
607         atomic_set(&sctx->cancel_req, 0);
608
609         spin_lock_init(&sctx->list_lock);
610         spin_lock_init(&sctx->stat_lock);
611         init_waitqueue_head(&sctx->list_wait);
612         sctx->throttle_deadline = 0;
613
614         WARN_ON(sctx->wr_curr_bio != NULL);
615         mutex_init(&sctx->wr_lock);
616         sctx->wr_curr_bio = NULL;
617         if (is_dev_replace) {
618                 WARN_ON(!fs_info->dev_replace.tgtdev);
619                 sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
620                 sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
621                 sctx->flush_all_writes = false;
622         }
623
624         return sctx;
625
626 nomem:
627         scrub_free_ctx(sctx);
628         return ERR_PTR(-ENOMEM);
629 }
630
631 static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
632                                      void *warn_ctx)
633 {
634         u32 nlink;
635         int ret;
636         int i;
637         unsigned nofs_flag;
638         struct extent_buffer *eb;
639         struct btrfs_inode_item *inode_item;
640         struct scrub_warning *swarn = warn_ctx;
641         struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
642         struct inode_fs_paths *ipath = NULL;
643         struct btrfs_root *local_root;
644         struct btrfs_key key;
645
646         local_root = btrfs_get_fs_root(fs_info, root, true);
647         if (IS_ERR(local_root)) {
648                 ret = PTR_ERR(local_root);
649                 goto err;
650         }
651
652         /*
653          * this makes the path point to (inum INODE_ITEM ioff)
654          */
655         key.objectid = inum;
656         key.type = BTRFS_INODE_ITEM_KEY;
657         key.offset = 0;
658
659         ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
660         if (ret) {
661                 btrfs_put_root(local_root);
662                 btrfs_release_path(swarn->path);
663                 goto err;
664         }
665
666         eb = swarn->path->nodes[0];
667         inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
668                                         struct btrfs_inode_item);
669         nlink = btrfs_inode_nlink(eb, inode_item);
670         btrfs_release_path(swarn->path);
671
672         /*
673          * init_path might indirectly call vmalloc, or use GFP_KERNEL. Scrub
674          * uses GFP_NOFS in this context, so we keep it consistent but it does
675          * not seem to be strictly necessary.
676          */
677         nofs_flag = memalloc_nofs_save();
678         ipath = init_ipath(4096, local_root, swarn->path);
679         memalloc_nofs_restore(nofs_flag);
680         if (IS_ERR(ipath)) {
681                 btrfs_put_root(local_root);
682                 ret = PTR_ERR(ipath);
683                 ipath = NULL;
684                 goto err;
685         }
686         ret = paths_from_inode(inum, ipath);
687
688         if (ret < 0)
689                 goto err;
690
691         /*
692          * we deliberately ignore the fact that ipath might have been too
693          * small to hold all of the paths here
694          */
695         for (i = 0; i < ipath->fspath->elem_cnt; ++i)
696                 btrfs_warn_in_rcu(fs_info,
697 "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)",
698                                   swarn->errstr, swarn->logical,
699                                   rcu_str_deref(swarn->dev->name),
700                                   swarn->physical,
701                                   root, inum, offset,
702                                   fs_info->sectorsize, nlink,
703                                   (char *)(unsigned long)ipath->fspath->val[i]);
704
705         btrfs_put_root(local_root);
706         free_ipath(ipath);
707         return 0;
708
709 err:
710         btrfs_warn_in_rcu(fs_info,
711                           "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
712                           swarn->errstr, swarn->logical,
713                           rcu_str_deref(swarn->dev->name),
714                           swarn->physical,
715                           root, inum, offset, ret);
716
717         free_ipath(ipath);
718         return 0;
719 }
720
721 static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
722 {
723         struct btrfs_device *dev;
724         struct btrfs_fs_info *fs_info;
725         struct btrfs_path *path;
726         struct btrfs_key found_key;
727         struct extent_buffer *eb;
728         struct btrfs_extent_item *ei;
729         struct scrub_warning swarn;
730         unsigned long ptr = 0;
731         u64 extent_item_pos;
732         u64 flags = 0;
733         u64 ref_root;
734         u32 item_size;
735         u8 ref_level = 0;
736         int ret;
737
738         WARN_ON(sblock->page_count < 1);
739         dev = sblock->pagev[0]->dev;
740         fs_info = sblock->sctx->fs_info;
741
742         path = btrfs_alloc_path();
743         if (!path)
744                 return;
745
746         swarn.physical = sblock->pagev[0]->physical;
747         swarn.logical = sblock->pagev[0]->logical;
748         swarn.errstr = errstr;
749         swarn.dev = NULL;
750
751         ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
752                                   &flags);
753         if (ret < 0)
754                 goto out;
755
756         extent_item_pos = swarn.logical - found_key.objectid;
757         swarn.extent_item_size = found_key.offset;
758
759         eb = path->nodes[0];
760         ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
761         item_size = btrfs_item_size_nr(eb, path->slots[0]);
762
763         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
764                 do {
765                         ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
766                                                       item_size, &ref_root,
767                                                       &ref_level);
768                         btrfs_warn_in_rcu(fs_info,
769 "%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
770                                 errstr, swarn.logical,
771                                 rcu_str_deref(dev->name),
772                                 swarn.physical,
773                                 ref_level ? "node" : "leaf",
774                                 ret < 0 ? -1 : ref_level,
775                                 ret < 0 ? -1 : ref_root);
776                 } while (ret != 1);
777                 btrfs_release_path(path);
778         } else {
779                 btrfs_release_path(path);
780                 swarn.path = path;
781                 swarn.dev = dev;
782                 iterate_extent_inodes(fs_info, found_key.objectid,
783                                         extent_item_pos, 1,
784                                         scrub_print_warning_inode, &swarn, false);
785         }
786
787 out:
788         btrfs_free_path(path);
789 }
790
791 static inline void scrub_get_recover(struct scrub_recover *recover)
792 {
793         refcount_inc(&recover->refs);
794 }
795
796 static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
797                                      struct scrub_recover *recover)
798 {
799         if (refcount_dec_and_test(&recover->refs)) {
800                 btrfs_bio_counter_dec(fs_info);
801                 btrfs_put_bioc(recover->bioc);
802                 kfree(recover);
803         }
804 }
805
806 /*
807  * scrub_handle_errored_block gets called when either verification of the
808  * pages failed or the bio failed to read, e.g. with EIO. In the latter
809  * case, this function handles all pages in the bio, even though only one
810  * may be bad.
811  * The goal of this function is to repair the errored block by using the
812  * contents of one of the mirrors.
813  */
814 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
815 {
816         struct scrub_ctx *sctx = sblock_to_check->sctx;
817         struct btrfs_device *dev;
818         struct btrfs_fs_info *fs_info;
819         u64 logical;
820         unsigned int failed_mirror_index;
821         unsigned int is_metadata;
822         unsigned int have_csum;
823         struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
824         struct scrub_block *sblock_bad;
825         int ret;
826         int mirror_index;
827         int page_num;
828         int success;
829         bool full_stripe_locked;
830         unsigned int nofs_flag;
831         static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
832                                       DEFAULT_RATELIMIT_BURST);
833
834         BUG_ON(sblock_to_check->page_count < 1);
835         fs_info = sctx->fs_info;
836         if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
837                 /*
838                  * If we find an error in a super block, we just report it.
839                  * Super blocks will get rewritten with the next transaction
840                  * commit anyway.
841                  */
842                 spin_lock(&sctx->stat_lock);
843                 ++sctx->stat.super_errors;
844                 spin_unlock(&sctx->stat_lock);
845                 return 0;
846         }
847         logical = sblock_to_check->pagev[0]->logical;
848         BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
849         failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
850         is_metadata = !(sblock_to_check->pagev[0]->flags &
851                         BTRFS_EXTENT_FLAG_DATA);
852         have_csum = sblock_to_check->pagev[0]->have_csum;
853         dev = sblock_to_check->pagev[0]->dev;
854
855         if (btrfs_is_zoned(fs_info) && !sctx->is_dev_replace)
856                 return btrfs_repair_one_zone(fs_info, logical);
857
858         /*
859          * We must use GFP_NOFS because the scrub task might be waiting for a
860          * worker task executing this function and in turn a transaction commit
861          * might be waiting for the scrub task to pause (which needs to wait for all
862          * the worker tasks to complete before pausing).
863          * We do allocations in the workers through insert_full_stripe_lock()
864          * and scrub_add_page_to_wr_bio(), which happens down the call chain of
865          * this function.
866          */
867         nofs_flag = memalloc_nofs_save();
868         /*
869          * For RAID5/6, a race can happen with the scrub thread of a different
870          * device. On data corruption, the parity and data threads will both
871          * try to recover the data.
872          * The race can lead to a doubly counted csum error, or even an
873          * unrecoverable error.
874          */
875         ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
876         if (ret < 0) {
877                 memalloc_nofs_restore(nofs_flag);
878                 spin_lock(&sctx->stat_lock);
879                 if (ret == -ENOMEM)
880                         sctx->stat.malloc_errors++;
881                 sctx->stat.read_errors++;
882                 sctx->stat.uncorrectable_errors++;
883                 spin_unlock(&sctx->stat_lock);
884                 return ret;
885         }
886
887         /*
888          * Read all mirrors one after the other. This includes re-reading
889          * the extent or metadata block that failed (that was the reason
890          * this fixup code was called), this time sector by sector, in
891          * order to know which sectors caused I/O errors and which ones
892          * are good (for all mirrors).
893          * The goal is to handle the situation when more than one
894          * mirror contains I/O errors, but the errors do not
895          * overlap, i.e. the data can be repaired by selecting the
896          * sectors from those mirrors without I/O error on the
897          * particular sectors. One example (with blocks >= 2 * sectorsize)
898          * would be that mirror #1 has an I/O error on the first sector,
899          * the second sector is good, and mirror #2 has an I/O error on
900          * the second sector, but the first sector is good.
901          * Then the first sector of the first mirror can be repaired by
902          * taking the first sector of the second mirror, and the
903          * second sector of the second mirror can be repaired by
904          * copying the contents of the 2nd sector of the 1st mirror.
905          * One more note: if the sectors of one mirror contain I/O
906          * errors, the checksum cannot be verified. In order to get
907          * the best data for repairing, the first attempt is to find
908          * a mirror without I/O errors and with a validated checksum.
909          * Only if this is not possible, the sectors are picked from
910          * mirrors with I/O errors without considering the checksum.
911          * If the latter is the case, at the end, the checksum of the
912          * repaired area is verified in order to correctly maintain
913          * the statistics.
914          */
915
916         sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
917                                       sizeof(*sblocks_for_recheck), GFP_KERNEL);
918         if (!sblocks_for_recheck) {
919                 spin_lock(&sctx->stat_lock);
920                 sctx->stat.malloc_errors++;
921                 sctx->stat.read_errors++;
922                 sctx->stat.uncorrectable_errors++;
923                 spin_unlock(&sctx->stat_lock);
924                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
925                 goto out;
926         }
927
928         /* setup the context, map the logical blocks and alloc the pages */
929         ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
930         if (ret) {
931                 spin_lock(&sctx->stat_lock);
932                 sctx->stat.read_errors++;
933                 sctx->stat.uncorrectable_errors++;
934                 spin_unlock(&sctx->stat_lock);
935                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
936                 goto out;
937         }
938         BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
939         sblock_bad = sblocks_for_recheck + failed_mirror_index;
940
941         /* build and submit the bios for the failed mirror, check checksums */
942         scrub_recheck_block(fs_info, sblock_bad, 1);
943
944         if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
945             sblock_bad->no_io_error_seen) {
946                 /*
947                  * the error disappeared after reading page by page, or
948                  * the area was part of a huge bio and other parts of the
949                  * bio caused I/O errors, or the block layer merged several
950                  * read requests into one and the error is caused by a
951                  * different bio (usually one of the two latter cases is
952                  * the cause)
953                  */
954                 spin_lock(&sctx->stat_lock);
955                 sctx->stat.unverified_errors++;
956                 sblock_to_check->data_corrected = 1;
957                 spin_unlock(&sctx->stat_lock);
958
959                 if (sctx->is_dev_replace)
960                         scrub_write_block_to_dev_replace(sblock_bad);
961                 goto out;
962         }
963
964         if (!sblock_bad->no_io_error_seen) {
965                 spin_lock(&sctx->stat_lock);
966                 sctx->stat.read_errors++;
967                 spin_unlock(&sctx->stat_lock);
968                 if (__ratelimit(&rs))
969                         scrub_print_warning("i/o error", sblock_to_check);
970                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
971         } else if (sblock_bad->checksum_error) {
972                 spin_lock(&sctx->stat_lock);
973                 sctx->stat.csum_errors++;
974                 spin_unlock(&sctx->stat_lock);
975                 if (__ratelimit(&rs))
976                         scrub_print_warning("checksum error", sblock_to_check);
977                 btrfs_dev_stat_inc_and_print(dev,
978                                              BTRFS_DEV_STAT_CORRUPTION_ERRS);
979         } else if (sblock_bad->header_error) {
980                 spin_lock(&sctx->stat_lock);
981                 sctx->stat.verify_errors++;
982                 spin_unlock(&sctx->stat_lock);
983                 if (__ratelimit(&rs))
984                         scrub_print_warning("checksum/header error",
985                                             sblock_to_check);
986                 if (sblock_bad->generation_error)
987                         btrfs_dev_stat_inc_and_print(dev,
988                                 BTRFS_DEV_STAT_GENERATION_ERRS);
989                 else
990                         btrfs_dev_stat_inc_and_print(dev,
991                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
992         }
993
994         if (sctx->readonly) {
995                 ASSERT(!sctx->is_dev_replace);
996                 goto out;
997         }
998
999         /*
1000          * now build and submit the bios for the other mirrors, check
1001          * checksums.
1002          * First try to pick the mirror which is completely without I/O
1003          * errors and also does not have a checksum error.
1004          * If one is found, and if a checksum is present, the full block
1005          * that is known to contain an error is rewritten. Afterwards
1006          * the block is known to be corrected.
1007          * If a mirror is found which is completely correct, and no
1008          * checksum is present, only those pages are rewritten that had
1009          * an I/O error in the block to be repaired, since it cannot be
1010          * determined which copy of the other pages is better (and it
1011          * could happen otherwise that a correct page would be
1012          * overwritten by a bad one).
1013          */
1014         for (mirror_index = 0; ;mirror_index++) {
1015                 struct scrub_block *sblock_other;
1016
1017                 if (mirror_index == failed_mirror_index)
1018                         continue;
1019
1020                 /* raid56's mirror can be more than BTRFS_MAX_MIRRORS */
1021                 if (!scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
1022                         if (mirror_index >= BTRFS_MAX_MIRRORS)
1023                                 break;
1024                         if (!sblocks_for_recheck[mirror_index].page_count)
1025                                 break;
1026
1027                         sblock_other = sblocks_for_recheck + mirror_index;
1028                 } else {
1029                         struct scrub_recover *r = sblock_bad->pagev[0]->recover;
1030                         int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs;
1031
1032                         if (mirror_index >= max_allowed)
1033                                 break;
1034                         if (!sblocks_for_recheck[1].page_count)
1035                                 break;
1036
1037                         ASSERT(failed_mirror_index == 0);
1038                         sblock_other = sblocks_for_recheck + 1;
1039                         sblock_other->pagev[0]->mirror_num = 1 + mirror_index;
1040                 }
1041
1042                 /* build and submit the bios, check checksums */
1043                 scrub_recheck_block(fs_info, sblock_other, 0);
1044
1045                 if (!sblock_other->header_error &&
1046                     !sblock_other->checksum_error &&
1047                     sblock_other->no_io_error_seen) {
1048                         if (sctx->is_dev_replace) {
1049                                 scrub_write_block_to_dev_replace(sblock_other);
1050                                 goto corrected_error;
1051                         } else {
1052                                 ret = scrub_repair_block_from_good_copy(
1053                                                 sblock_bad, sblock_other);
1054                                 if (!ret)
1055                                         goto corrected_error;
1056                         }
1057                 }
1058         }
1059
1060         if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
1061                 goto did_not_correct_error;
1062
1063         /*
1064          * In case of I/O errors in the area that is supposed to be
1065          * repaired, continue by picking good copies of those sectors.
1066          * Select the good sectors from mirrors to rewrite bad sectors from
1067          * the area to fix. Afterwards verify the checksum of the block
1068          * that is supposed to be repaired. This verification step is
1069          * only done for the purpose of statistics counting and for the
1070          * final scrub report on whether errors remain.
1071          * A perfect algorithm could make use of the checksum and try
1072          * all possible combinations of sectors from the different mirrors
1073          * until the checksum verification succeeds. For example, when
1074          * the 2nd sector of mirror #1 faces I/O errors, and the 2nd sector
1075          * of mirror #2 is readable but the final checksum test fails,
1076          * then the 2nd sector of mirror #3 could be tried, to check
1077          * whether the final checksum now succeeds. But this would be a
1078          * rare exception and is therefore not implemented. At least this
1079          * avoids overwriting the good copy.
1080          * A more useful improvement would be to pick the sectors
1081          * without I/O error based on sector sizes (512 bytes on legacy
1082          * disks) instead of on sectorsize. Then maybe 512 byte of one
1083          * mirror could be repaired by taking 512 byte of a different
1084          * mirror, even if other 512 byte sectors in the same sectorsize
1085          * area are unreadable.
1086          */
1087         success = 1;
1088         for (page_num = 0; page_num < sblock_bad->page_count;
1089              page_num++) {
1090                 struct scrub_page *spage_bad = sblock_bad->pagev[page_num];
1091                 struct scrub_block *sblock_other = NULL;
1092
1093                 /* skip no-io-error page in scrub */
1094                 if (!spage_bad->io_error && !sctx->is_dev_replace)
1095                         continue;
1096
1097                 if (scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
1098                         /*
1099                          * In case of dev replace, if the raid56 rebuild process
1100                          * didn't produce correct data, then copy the content
1101                          * of sblock_bad to make sure the target device is
1102                          * identical to the source device, instead of writing
1103                          * garbage data from the sblock_for_recheck array to it.
1104                          */
1105                         sblock_other = NULL;
1106                 } else if (spage_bad->io_error) {
1107                         /* try to find no-io-error page in mirrors */
1108                         for (mirror_index = 0;
1109                              mirror_index < BTRFS_MAX_MIRRORS &&
1110                              sblocks_for_recheck[mirror_index].page_count > 0;
1111                              mirror_index++) {
1112                                 if (!sblocks_for_recheck[mirror_index].
1113                                     pagev[page_num]->io_error) {
1114                                         sblock_other = sblocks_for_recheck +
1115                                                        mirror_index;
1116                                         break;
1117                                 }
1118                         }
1119                         if (!sblock_other)
1120                                 success = 0;
1121                 }
1122
1123                 if (sctx->is_dev_replace) {
1124                         /*
1125                          * We did not find a mirror to fetch the page
1126                          * from. scrub_write_page_to_dev_replace()
1127                          * handles this case (page->io_error) by
1128                          * filling the block with zeros before
1129                          * submitting the write request
1130                          */
1131                         if (!sblock_other)
1132                                 sblock_other = sblock_bad;
1133
1134                         if (scrub_write_page_to_dev_replace(sblock_other,
1135                                                             page_num) != 0) {
1136                                 atomic64_inc(
1137                                         &fs_info->dev_replace.num_write_errors);
1138                                 success = 0;
1139                         }
1140                 } else if (sblock_other) {
1141                         ret = scrub_repair_page_from_good_copy(sblock_bad,
1142                                                                sblock_other,
1143                                                                page_num, 0);
1144                         if (ret == 0)
1145                                 spage_bad->io_error = 0;
1146                         else
1147                                 success = 0;
1148                 }
1149         }
1150
1151         if (success && !sctx->is_dev_replace) {
1152                 if (is_metadata || have_csum) {
1153                         /*
1154                          * need to verify the checksum now that all
1155                          * sectors on disk are repaired (the write
1156                          * request for data to be repaired is on its way).
1157                          * Just be lazy and use scrub_recheck_block()
1158                          * which re-reads the data before the checksum
1159                          * is verified, but most likely the data comes out
1160                          * of the page cache.
1161                          */
1162                         scrub_recheck_block(fs_info, sblock_bad, 1);
1163                         if (!sblock_bad->header_error &&
1164                             !sblock_bad->checksum_error &&
1165                             sblock_bad->no_io_error_seen)
1166                                 goto corrected_error;
1167                         else
1168                                 goto did_not_correct_error;
1169                 } else {
1170 corrected_error:
1171                         spin_lock(&sctx->stat_lock);
1172                         sctx->stat.corrected_errors++;
1173                         sblock_to_check->data_corrected = 1;
1174                         spin_unlock(&sctx->stat_lock);
1175                         btrfs_err_rl_in_rcu(fs_info,
1176                                 "fixed up error at logical %llu on dev %s",
1177                                 logical, rcu_str_deref(dev->name));
1178                 }
1179         } else {
1180 did_not_correct_error:
1181                 spin_lock(&sctx->stat_lock);
1182                 sctx->stat.uncorrectable_errors++;
1183                 spin_unlock(&sctx->stat_lock);
1184                 btrfs_err_rl_in_rcu(fs_info,
1185                         "unable to fixup (regular) error at logical %llu on dev %s",
1186                         logical, rcu_str_deref(dev->name));
1187         }
1188
1189 out:
1190         if (sblocks_for_recheck) {
1191                 for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1192                      mirror_index++) {
1193                         struct scrub_block *sblock = sblocks_for_recheck +
1194                                                      mirror_index;
1195                         struct scrub_recover *recover;
1196                         int page_index;
1197
1198                         for (page_index = 0; page_index < sblock->page_count;
1199                              page_index++) {
1200                                 sblock->pagev[page_index]->sblock = NULL;
1201                                 recover = sblock->pagev[page_index]->recover;
1202                                 if (recover) {
1203                                         scrub_put_recover(fs_info, recover);
1204                                         sblock->pagev[page_index]->recover =
1205                                                                         NULL;
1206                                 }
1207                                 scrub_page_put(sblock->pagev[page_index]);
1208                         }
1209                 }
1210                 kfree(sblocks_for_recheck);
1211         }
1212
1213         ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
1214         memalloc_nofs_restore(nofs_flag);
1215         if (ret < 0)
1216                 return ret;
1217         return 0;
1218 }
1219
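/*
 * Clarifying note for the helper below (added, not part of the original file):
 * for RAID5 a sector can be obtained either by reading it directly or by
 * rebuilding it from the remaining data stripes plus the P parity, hence 2
 * "mirrors"; RAID6 additionally has the Q parity, hence 3.
 */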
1220 static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc)
1221 {
1222         if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
1223                 return 2;
1224         else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
1225                 return 3;
1226         else
1227                 return (int)bioc->num_stripes;
1228 }
1229
1230 static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
1231                                                  u64 *raid_map,
1232                                                  u64 mapped_length,
1233                                                  int nstripes, int mirror,
1234                                                  int *stripe_index,
1235                                                  u64 *stripe_offset)
1236 {
1237         int i;
1238
1239         if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
1240                 /* RAID5/6 */
1241                 for (i = 0; i < nstripes; i++) {
1242                         if (raid_map[i] == RAID6_Q_STRIPE ||
1243                             raid_map[i] == RAID5_P_STRIPE)
1244                                 continue;
1245
1246                         if (logical >= raid_map[i] &&
1247                             logical < raid_map[i] + mapped_length)
1248                                 break;
1249                 }
1250
1251                 *stripe_index = i;
1252                 *stripe_offset = logical - raid_map[i];
1253         } else {
1254                 /* The other RAID type */
1255                 *stripe_index = mirror;
1256                 *stripe_offset = 0;
1257         }
1258 }
1259
1260 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
1261                                      struct scrub_block *sblocks_for_recheck)
1262 {
1263         struct scrub_ctx *sctx = original_sblock->sctx;
1264         struct btrfs_fs_info *fs_info = sctx->fs_info;
1265         u64 length = original_sblock->page_count * fs_info->sectorsize;
1266         u64 logical = original_sblock->pagev[0]->logical;
1267         u64 generation = original_sblock->pagev[0]->generation;
1268         u64 flags = original_sblock->pagev[0]->flags;
1269         u64 have_csum = original_sblock->pagev[0]->have_csum;
1270         struct scrub_recover *recover;
1271         struct btrfs_io_context *bioc;
1272         u64 sublen;
1273         u64 mapped_length;
1274         u64 stripe_offset;
1275         int stripe_index;
1276         int page_index = 0;
1277         int mirror_index;
1278         int nmirrors;
1279         int ret;
1280
1281         /*
1282          * note: the two members refs and outstanding_pages
1283          * are not used (and not set) in the blocks that are used for
1284          * the recheck procedure
1285          */
1286
1287         while (length > 0) {
1288                 sublen = min_t(u64, length, fs_info->sectorsize);
1289                 mapped_length = sublen;
1290                 bioc = NULL;
1291
1292                 /*
1293                  * With a length of sectorsize, each returned stripe represents
1294                  * one mirror
1295                  */
1296                 btrfs_bio_counter_inc_blocked(fs_info);
1297                 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
1298                                        logical, &mapped_length, &bioc);
1299                 if (ret || !bioc || mapped_length < sublen) {
1300                         btrfs_put_bioc(bioc);
1301                         btrfs_bio_counter_dec(fs_info);
1302                         return -EIO;
1303                 }
1304
1305                 recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1306                 if (!recover) {
1307                         btrfs_put_bioc(bioc);
1308                         btrfs_bio_counter_dec(fs_info);
1309                         return -ENOMEM;
1310                 }
1311
1312                 refcount_set(&recover->refs, 1);
1313                 recover->bioc = bioc;
1314                 recover->map_length = mapped_length;
1315
1316                 BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK);
1317
1318                 nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS);
1319
1320                 for (mirror_index = 0; mirror_index < nmirrors;
1321                      mirror_index++) {
1322                         struct scrub_block *sblock;
1323                         struct scrub_page *spage;
1324
1325                         sblock = sblocks_for_recheck + mirror_index;
1326                         sblock->sctx = sctx;
1327
1328                         spage = kzalloc(sizeof(*spage), GFP_NOFS);
1329                         if (!spage) {
1330 leave_nomem:
1331                                 spin_lock(&sctx->stat_lock);
1332                                 sctx->stat.malloc_errors++;
1333                                 spin_unlock(&sctx->stat_lock);
1334                                 scrub_put_recover(fs_info, recover);
1335                                 return -ENOMEM;
1336                         }
1337                         scrub_page_get(spage);
1338                         sblock->pagev[page_index] = spage;
1339                         spage->sblock = sblock;
1340                         spage->flags = flags;
1341                         spage->generation = generation;
1342                         spage->logical = logical;
1343                         spage->have_csum = have_csum;
1344                         if (have_csum)
1345                                 memcpy(spage->csum,
1346                                        original_sblock->pagev[0]->csum,
1347                                        sctx->fs_info->csum_size);
1348
1349                         scrub_stripe_index_and_offset(logical,
1350                                                       bioc->map_type,
1351                                                       bioc->raid_map,
1352                                                       mapped_length,
1353                                                       bioc->num_stripes -
1354                                                       bioc->num_tgtdevs,
1355                                                       mirror_index,
1356                                                       &stripe_index,
1357                                                       &stripe_offset);
1358                         spage->physical = bioc->stripes[stripe_index].physical +
1359                                          stripe_offset;
1360                         spage->dev = bioc->stripes[stripe_index].dev;
1361
1362                         BUG_ON(page_index >= original_sblock->page_count);
1363                         spage->physical_for_dev_replace =
1364                                 original_sblock->pagev[page_index]->
1365                                 physical_for_dev_replace;
1366                         /* for missing devices, dev->bdev is NULL */
1367                         spage->mirror_num = mirror_index + 1;
1368                         sblock->page_count++;
1369                         spage->page = alloc_page(GFP_NOFS);
1370                         if (!spage->page)
1371                                 goto leave_nomem;
1372
1373                         scrub_get_recover(recover);
1374                         spage->recover = recover;
1375                 }
1376                 scrub_put_recover(fs_info, recover);
1377                 length -= sublen;
1378                 logical += sublen;
1379                 page_index++;
1380         }
1381
1382         return 0;
1383 }
1384
1385 static void scrub_bio_wait_endio(struct bio *bio)
1386 {
1387         complete(bio->bi_private);
1388 }
1389
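/*
 * Submit a read that is rebuilt through the RAID56 parity recover path and
 * wait for it to complete.  The mirror number of the block's first page
 * selects which copy the rebuild targets.
 */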
1390 static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1391                                         struct bio *bio,
1392                                         struct scrub_page *spage)
1393 {
1394         DECLARE_COMPLETION_ONSTACK(done);
1395         int ret;
1396         int mirror_num;
1397
1398         bio->bi_iter.bi_sector = spage->logical >> 9;
1399         bio->bi_private = &done;
1400         bio->bi_end_io = scrub_bio_wait_endio;
1401
1402         mirror_num = spage->sblock->pagev[0]->mirror_num;
1403         ret = raid56_parity_recover(fs_info, bio, spage->recover->bioc,
1404                                     spage->recover->map_length,
1405                                     mirror_num, 0);
1406         if (ret)
1407                 return ret;
1408
1409         wait_for_completion_io(&done);
1410         return blk_status_to_errno(bio->bi_status);
1411 }
1412
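/*
 * Recheck a block that lives on a RAID56 stripe: all pages are added to a
 * single bio and read back through the parity recover path.  On any
 * failure, every page of the block is marked with an I/O error.
 */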
1413 static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
1414                                           struct scrub_block *sblock)
1415 {
1416         struct scrub_page *first_page = sblock->pagev[0];
1417         struct bio *bio;
1418         int page_num;
1419
1420         /* All pages in sblock belong to the same stripe on the same device. */
1421         ASSERT(first_page->dev);
1422         if (!first_page->dev->bdev)
1423                 goto out;
1424
1425         bio = btrfs_io_bio_alloc(BIO_MAX_VECS);
1426         bio_set_dev(bio, first_page->dev->bdev);
1427
1428         for (page_num = 0; page_num < sblock->page_count; page_num++) {
1429                 struct scrub_page *spage = sblock->pagev[page_num];
1430
1431                 WARN_ON(!spage->page);
1432                 bio_add_page(bio, spage->page, PAGE_SIZE, 0);
1433         }
1434
1435         if (scrub_submit_raid56_bio_wait(fs_info, bio, first_page)) {
1436                 bio_put(bio);
1437                 goto out;
1438         }
1439
1440         bio_put(bio);
1441
1442         scrub_recheck_block_checksum(sblock);
1443
1444         return;
1445 out:
1446         for (page_num = 0; page_num < sblock->page_count; page_num++)
1447                 sblock->pagev[page_num]->io_error = 1;
1448
1449         sblock->no_io_error_seen = 0;
1450 }
1451
1452 /*
1453  * This function checks the on-disk data for checksum errors, header errors
1454  * and read I/O errors. If any I/O error happens, the exact pages that hit
1455  * the error are marked as bad. The goal is to enable scrub to take the
1456  * pages that are not errored from all the mirrors so that the pages that
1457  * are errored in the mirror just handled can be repaired.
1458  */
1459 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1460                                 struct scrub_block *sblock,
1461                                 int retry_failed_mirror)
1462 {
1463         int page_num;
1464
1465         sblock->no_io_error_seen = 1;
1466
1467         /* short cut for raid56 */
1468         if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->pagev[0]))
1469                 return scrub_recheck_block_on_raid56(fs_info, sblock);
1470
1471         for (page_num = 0; page_num < sblock->page_count; page_num++) {
1472                 struct bio *bio;
1473                 struct scrub_page *spage = sblock->pagev[page_num];
1474
1475                 if (spage->dev->bdev == NULL) {
1476                         spage->io_error = 1;
1477                         sblock->no_io_error_seen = 0;
1478                         continue;
1479                 }
1480
1481                 WARN_ON(!spage->page);
1482                 bio = btrfs_io_bio_alloc(1);
1483                 bio_set_dev(bio, spage->dev->bdev);
1484
1485                 bio_add_page(bio, spage->page, fs_info->sectorsize, 0);
1486                 bio->bi_iter.bi_sector = spage->physical >> 9;
1487                 bio->bi_opf = REQ_OP_READ;
1488
1489                 if (btrfsic_submit_bio_wait(bio)) {
1490                         spage->io_error = 1;
1491                         sblock->no_io_error_seen = 0;
1492                 }
1493
1494                 bio_put(bio);
1495         }
1496
1497         if (sblock->no_io_error_seen)
1498                 scrub_recheck_block_checksum(sblock);
1499 }
1500
1501 static inline int scrub_check_fsid(u8 fsid[],
1502                                    struct scrub_page *spage)
1503 {
1504         struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
1505         int ret;
1506
1507         ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1508         return !ret;
1509 }
1510
1511 static void scrub_recheck_block_checksum(struct scrub_block *sblock)
1512 {
1513         sblock->header_error = 0;
1514         sblock->checksum_error = 0;
1515         sblock->generation_error = 0;
1516
1517         if (sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA)
1518                 scrub_checksum_data(sblock);
1519         else
1520                 scrub_checksum_tree_block(sblock);
1521 }
1522
1523 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1524                                              struct scrub_block *sblock_good)
1525 {
1526         int page_num;
1527         int ret = 0;
1528
1529         for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1530                 int ret_sub;
1531
1532                 ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
1533                                                            sblock_good,
1534                                                            page_num, 1);
1535                 if (ret_sub)
1536                         ret = ret_sub;
1537         }
1538
1539         return ret;
1540 }
1541
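/*
 * Rewrite a single sector of the bad mirror with the data of the good
 * mirror.  Unless @force_write is set, the write is only issued when the
 * bad block actually showed a header, checksum or I/O error.
 */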
1542 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1543                                             struct scrub_block *sblock_good,
1544                                             int page_num, int force_write)
1545 {
1546         struct scrub_page *spage_bad = sblock_bad->pagev[page_num];
1547         struct scrub_page *spage_good = sblock_good->pagev[page_num];
1548         struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
1549         const u32 sectorsize = fs_info->sectorsize;
1550
1551         BUG_ON(spage_bad->page == NULL);
1552         BUG_ON(spage_good->page == NULL);
1553         if (force_write || sblock_bad->header_error ||
1554             sblock_bad->checksum_error || spage_bad->io_error) {
1555                 struct bio *bio;
1556                 int ret;
1557
1558                 if (!spage_bad->dev->bdev) {
1559                         btrfs_warn_rl(fs_info,
1560                                 "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
1561                         return -EIO;
1562                 }
1563
1564                 bio = btrfs_io_bio_alloc(1);
1565                 bio_set_dev(bio, spage_bad->dev->bdev);
1566                 bio->bi_iter.bi_sector = spage_bad->physical >> 9;
1567                 bio->bi_opf = REQ_OP_WRITE;
1568
1569                 ret = bio_add_page(bio, spage_good->page, sectorsize, 0);
1570                 if (ret != sectorsize) {
1571                         bio_put(bio);
1572                         return -EIO;
1573                 }
1574
1575                 if (btrfsic_submit_bio_wait(bio)) {
1576                         btrfs_dev_stat_inc_and_print(spage_bad->dev,
1577                                 BTRFS_DEV_STAT_WRITE_ERRS);
1578                         atomic64_inc(&fs_info->dev_replace.num_write_errors);
1579                         bio_put(bio);
1580                         return -EIO;
1581                 }
1582                 bio_put(bio);
1583         }
1584
1585         return 0;
1586 }
1587
1588 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1589 {
1590         struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
1591         int page_num;
1592
1593         /*
1594          * This block is used for checking the parity on the source device,
1595          * so its data need not be written to the destination device.
1596          */
1597         if (sblock->sparity)
1598                 return;
1599
1600         for (page_num = 0; page_num < sblock->page_count; page_num++) {
1601                 int ret;
1602
1603                 ret = scrub_write_page_to_dev_replace(sblock, page_num);
1604                 if (ret)
1605                         atomic64_inc(&fs_info->dev_replace.num_write_errors);
1606         }
1607 }
1608
1609 static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
1610                                            int page_num)
1611 {
1612         struct scrub_page *spage = sblock->pagev[page_num];
1613
1614         BUG_ON(spage->page == NULL);
1615         if (spage->io_error)
1616                 clear_page(page_address(spage->page));
1617
1618         return scrub_add_page_to_wr_bio(sblock->sctx, spage);
1619 }
1620
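/*
 * On a zoned dev-replace target the writes must be sequential.  If the next
 * write would land past the current write pointer, zero out the gap first
 * so that the write pointer catches up with @physical.
 */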
1621 static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
1622 {
1623         int ret = 0;
1624         u64 length;
1625
1626         if (!btrfs_is_zoned(sctx->fs_info))
1627                 return 0;
1628
1629         if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical))
1630                 return 0;
1631
1632         if (sctx->write_pointer < physical) {
1633                 length = physical - sctx->write_pointer;
1634
1635                 ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev,
1636                                                 sctx->write_pointer, length);
1637                 if (!ret)
1638                         sctx->write_pointer = physical;
1639         }
1640         return ret;
1641 }
1642
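/*
 * Add one page to the current dev-replace write bio.  A new bio is set up
 * when none is pending, and the pending bio is submitted first whenever the
 * page is not physically or logically contiguous with it or the bio is
 * already full.
 */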
1643 static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1644                                     struct scrub_page *spage)
1645 {
1646         struct scrub_bio *sbio;
1647         int ret;
1648         const u32 sectorsize = sctx->fs_info->sectorsize;
1649
1650         mutex_lock(&sctx->wr_lock);
1651 again:
1652         if (!sctx->wr_curr_bio) {
1653                 sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
1654                                               GFP_KERNEL);
1655                 if (!sctx->wr_curr_bio) {
1656                         mutex_unlock(&sctx->wr_lock);
1657                         return -ENOMEM;
1658                 }
1659                 sctx->wr_curr_bio->sctx = sctx;
1660                 sctx->wr_curr_bio->page_count = 0;
1661         }
1662         sbio = sctx->wr_curr_bio;
1663         if (sbio->page_count == 0) {
1664                 struct bio *bio;
1665
1666                 ret = fill_writer_pointer_gap(sctx,
1667                                               spage->physical_for_dev_replace);
1668                 if (ret) {
1669                         mutex_unlock(&sctx->wr_lock);
1670                         return ret;
1671                 }
1672
1673                 sbio->physical = spage->physical_for_dev_replace;
1674                 sbio->logical = spage->logical;
1675                 sbio->dev = sctx->wr_tgtdev;
1676                 bio = sbio->bio;
1677                 if (!bio) {
1678                         bio = btrfs_io_bio_alloc(sctx->pages_per_wr_bio);
1679                         sbio->bio = bio;
1680                 }
1681
1682                 bio->bi_private = sbio;
1683                 bio->bi_end_io = scrub_wr_bio_end_io;
1684                 bio_set_dev(bio, sbio->dev->bdev);
1685                 bio->bi_iter.bi_sector = sbio->physical >> 9;
1686                 bio->bi_opf = REQ_OP_WRITE;
1687                 sbio->status = 0;
1688         } else if (sbio->physical + sbio->page_count * sectorsize !=
1689                    spage->physical_for_dev_replace ||
1690                    sbio->logical + sbio->page_count * sectorsize !=
1691                    spage->logical) {
1692                 scrub_wr_submit(sctx);
1693                 goto again;
1694         }
1695
1696         ret = bio_add_page(sbio->bio, spage->page, sectorsize, 0);
1697         if (ret != sectorsize) {
1698                 if (sbio->page_count < 1) {
1699                         bio_put(sbio->bio);
1700                         sbio->bio = NULL;
1701                         mutex_unlock(&sctx->wr_lock);
1702                         return -EIO;
1703                 }
1704                 scrub_wr_submit(sctx);
1705                 goto again;
1706         }
1707
1708         sbio->pagev[sbio->page_count] = spage;
1709         scrub_page_get(spage);
1710         sbio->page_count++;
1711         if (sbio->page_count == sctx->pages_per_wr_bio)
1712                 scrub_wr_submit(sctx);
1713         mutex_unlock(&sctx->wr_lock);
1714
1715         return 0;
1716 }
1717
1718 static void scrub_wr_submit(struct scrub_ctx *sctx)
1719 {
1720         struct scrub_bio *sbio;
1721
1722         if (!sctx->wr_curr_bio)
1723                 return;
1724
1725         sbio = sctx->wr_curr_bio;
1726         sctx->wr_curr_bio = NULL;
1727         WARN_ON(!sbio->bio->bi_bdev);
1728         scrub_pending_bio_inc(sctx);
1729         /* Process all writes in a single worker thread. The block layer then
1730          * orders the requests before sending them to the driver, which
1731          * doubled the write performance on spinning disks when measured
1732          * with Linux 3.5. */
1733         btrfsic_submit_bio(sbio->bio);
1734
1735         if (btrfs_is_zoned(sctx->fs_info))
1736                 sctx->write_pointer = sbio->physical + sbio->page_count *
1737                         sctx->fs_info->sectorsize;
1738 }
1739
1740 static void scrub_wr_bio_end_io(struct bio *bio)
1741 {
1742         struct scrub_bio *sbio = bio->bi_private;
1743         struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
1744
1745         sbio->status = bio->bi_status;
1746         sbio->bio = bio;
1747
1748         btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL);
1749         btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
1750 }
1751
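/*
 * Completion work for a dev-replace write bio: on failure, account a write
 * error for every page of the bio, then drop the page references and the
 * pending bio count.
 */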
1752 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
1753 {
1754         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1755         struct scrub_ctx *sctx = sbio->sctx;
1756         int i;
1757
1758         WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
1759         if (sbio->status) {
1760                 struct btrfs_dev_replace *dev_replace =
1761                         &sbio->sctx->fs_info->dev_replace;
1762
1763                 for (i = 0; i < sbio->page_count; i++) {
1764                         struct scrub_page *spage = sbio->pagev[i];
1765
1766                         spage->io_error = 1;
1767                         atomic64_inc(&dev_replace->num_write_errors);
1768                 }
1769         }
1770
1771         for (i = 0; i < sbio->page_count; i++)
1772                 scrub_page_put(sbio->pagev[i]);
1773
1774         bio_put(sbio->bio);
1775         kfree(sbio);
1776         scrub_pending_bio_dec(sctx);
1777 }
1778
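/*
 * Verify the checksum (and, for metadata, the header) of a freshly read
 * block and kick off repair for anything that fails.  Errors in super
 * blocks are only reported, not repaired here.
 */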
1779 static int scrub_checksum(struct scrub_block *sblock)
1780 {
1781         u64 flags;
1782         int ret;
1783
1784         /*
1785          * No need to initialize these stats currently,
1786          * because this function only uses the return value
1787          * instead of these stats values.
1788          *
1789          * Todo:
1790          * always use stats
1791          */
1792         sblock->header_error = 0;
1793         sblock->generation_error = 0;
1794         sblock->checksum_error = 0;
1795
1796         WARN_ON(sblock->page_count < 1);
1797         flags = sblock->pagev[0]->flags;
1798         ret = 0;
1799         if (flags & BTRFS_EXTENT_FLAG_DATA)
1800                 ret = scrub_checksum_data(sblock);
1801         else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1802                 ret = scrub_checksum_tree_block(sblock);
1803         else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1804                 (void)scrub_checksum_super(sblock);
1805         else
1806                 WARN_ON(1);
1807         if (ret)
1808                 scrub_handle_errored_block(sblock);
1809
1810         return ret;
1811 }
1812
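/*
 * Verify the data checksum of the first sector of @sblock against the csum
 * that was collected when the pages were queued.  Returns 1 and marks the
 * block on mismatch, 0 if the sector has no csum or the csum matches.
 */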
1813 static int scrub_checksum_data(struct scrub_block *sblock)
1814 {
1815         struct scrub_ctx *sctx = sblock->sctx;
1816         struct btrfs_fs_info *fs_info = sctx->fs_info;
1817         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1818         u8 csum[BTRFS_CSUM_SIZE];
1819         struct scrub_page *spage;
1820         char *kaddr;
1821
1822         BUG_ON(sblock->page_count < 1);
1823         spage = sblock->pagev[0];
1824         if (!spage->have_csum)
1825                 return 0;
1826
1827         kaddr = page_address(spage->page);
1828
1829         shash->tfm = fs_info->csum_shash;
1830         crypto_shash_init(shash);
1831
1832         /*
1833          * In scrub_pages() and scrub_pages_for_parity() we ensure each spage
1834          * only contains one sector of data.
1835          */
1836         crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
1837
1838         if (memcmp(csum, spage->csum, fs_info->csum_size))
1839                 sblock->checksum_error = 1;
1840         return sblock->checksum_error;
1841 }
1842
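/*
 * Verify a tree block: the bytenr, generation, fsid and chunk tree uuid in
 * the header are checked, then the checksum is computed over all sectors of
 * the node.  Returns non-zero if a header or checksum error was found.
 */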
1843 static int scrub_checksum_tree_block(struct scrub_block *sblock)
1844 {
1845         struct scrub_ctx *sctx = sblock->sctx;
1846         struct btrfs_header *h;
1847         struct btrfs_fs_info *fs_info = sctx->fs_info;
1848         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1849         u8 calculated_csum[BTRFS_CSUM_SIZE];
1850         u8 on_disk_csum[BTRFS_CSUM_SIZE];
1851         /*
1852          * This is done in sectorsize steps even for metadata as there's a
1853          * constraint for nodesize to be aligned to sectorsize. This will need
1854          * to change so we don't misuse data and metadata units like that.
1855          */
1856         const u32 sectorsize = sctx->fs_info->sectorsize;
1857         const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits;
1858         int i;
1859         struct scrub_page *spage;
1860         char *kaddr;
1861
1862         BUG_ON(sblock->page_count < 1);
1863
1864         /* Each member in pagev is just one block, not a full page */
1865         ASSERT(sblock->page_count == num_sectors);
1866
1867         spage = sblock->pagev[0];
1868         kaddr = page_address(spage->page);
1869         h = (struct btrfs_header *)kaddr;
1870         memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size);
1871
1872         /*
1873          * we don't use the getter functions here, as we
1874          * a) don't have an extent buffer and
1875          * b) the page is already kmapped
1876          */
1877         if (spage->logical != btrfs_stack_header_bytenr(h))
1878                 sblock->header_error = 1;
1879
1880         if (spage->generation != btrfs_stack_header_generation(h)) {
1881                 sblock->header_error = 1;
1882                 sblock->generation_error = 1;
1883         }
1884
1885         if (!scrub_check_fsid(h->fsid, spage))
1886                 sblock->header_error = 1;
1887
1888         if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1889                    BTRFS_UUID_SIZE))
1890                 sblock->header_error = 1;
1891
1892         shash->tfm = fs_info->csum_shash;
1893         crypto_shash_init(shash);
1894         crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
1895                             sectorsize - BTRFS_CSUM_SIZE);
1896
1897         for (i = 1; i < num_sectors; i++) {
1898                 kaddr = page_address(sblock->pagev[i]->page);
1899                 crypto_shash_update(shash, kaddr, sectorsize);
1900         }
1901
1902         crypto_shash_final(shash, calculated_csum);
1903         if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size))
1904                 sblock->checksum_error = 1;
1905
1906         return sblock->header_error || sblock->checksum_error;
1907 }
1908
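/*
 * Verify a super block copy: bytenr, generation, fsid and checksum.  Errors
 * are only accounted in the scrub and device statistics, nothing is
 * repaired here.
 */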
1909 static int scrub_checksum_super(struct scrub_block *sblock)
1910 {
1911         struct btrfs_super_block *s;
1912         struct scrub_ctx *sctx = sblock->sctx;
1913         struct btrfs_fs_info *fs_info = sctx->fs_info;
1914         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1915         u8 calculated_csum[BTRFS_CSUM_SIZE];
1916         struct scrub_page *spage;
1917         char *kaddr;
1918         int fail_gen = 0;
1919         int fail_cor = 0;
1920
1921         BUG_ON(sblock->page_count < 1);
1922         spage = sblock->pagev[0];
1923         kaddr = page_address(spage->page);
1924         s = (struct btrfs_super_block *)kaddr;
1925
1926         if (spage->logical != btrfs_super_bytenr(s))
1927                 ++fail_cor;
1928
1929         if (spage->generation != btrfs_super_generation(s))
1930                 ++fail_gen;
1931
1932         if (!scrub_check_fsid(s->fsid, spage))
1933                 ++fail_cor;
1934
1935         shash->tfm = fs_info->csum_shash;
1936         crypto_shash_init(shash);
1937         crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE,
1938                         BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum);
1939
1940         if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size))
1941                 ++fail_cor;
1942
1943         if (fail_cor + fail_gen) {
1944                 /*
1945                  * If we find an error in a super block, we just report it.
1946                  * The super blocks will get rewritten with the next
1947                  * transaction commit anyway.
1948                  */
1949                 spin_lock(&sctx->stat_lock);
1950                 ++sctx->stat.super_errors;
1951                 spin_unlock(&sctx->stat_lock);
1952                 if (fail_cor)
1953                         btrfs_dev_stat_inc_and_print(spage->dev,
1954                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1955                 else
1956                         btrfs_dev_stat_inc_and_print(spage->dev,
1957                                 BTRFS_DEV_STAT_GENERATION_ERRS);
1958         }
1959
1960         return fail_cor + fail_gen;
1961 }
1962
1963 static void scrub_block_get(struct scrub_block *sblock)
1964 {
1965         refcount_inc(&sblock->refs);
1966 }
1967
1968 static void scrub_block_put(struct scrub_block *sblock)
1969 {
1970         if (refcount_dec_and_test(&sblock->refs)) {
1971                 int i;
1972
1973                 if (sblock->sparity)
1974                         scrub_parity_put(sblock->sparity);
1975
1976                 for (i = 0; i < sblock->page_count; i++)
1977                         scrub_page_put(sblock->pagev[i]);
1978                 kfree(sblock);
1979         }
1980 }
1981
1982 static void scrub_page_get(struct scrub_page *spage)
1983 {
1984         atomic_inc(&spage->refs);
1985 }
1986
1987 static void scrub_page_put(struct scrub_page *spage)
1988 {
1989         if (atomic_dec_and_test(&spage->refs)) {
1990                 if (spage->page)
1991                         __free_page(spage->page);
1992                 kfree(spage);
1993         }
1994 }
1995
1996 /*
1997  * Throttling of IO submission, bandwidth-limit based, the timeslice is 1
1998  * second.  The limit can be set via /sys/fs/btrfs/UUID/devinfo/devid/scrub_speed_max.
1999  */
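/*
 * Worked example (illustrative numbers only): with scrub_speed_max set to
 * 64 MiB/s, the code below uses min(64, 64M / 16M) = 4 intervals, so each
 * deadline is 1000 / 4 = 250 ms ahead and at most 64M / 4 = 16 MiB may be
 * submitted per interval before the submitter sleeps until that deadline.
 */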
2000 static void scrub_throttle(struct scrub_ctx *sctx)
2001 {
2002         const int time_slice = 1000;
2003         struct scrub_bio *sbio;
2004         struct btrfs_device *device;
2005         s64 delta;
2006         ktime_t now;
2007         u32 div;
2008         u64 bwlimit;
2009
2010         sbio = sctx->bios[sctx->curr];
2011         device = sbio->dev;
2012         bwlimit = READ_ONCE(device->scrub_speed_max);
2013         if (bwlimit == 0)
2014                 return;
2015
2016         /*
2017          * The time slice is divided into intervals as the IO is submitted; the
2018          * interval count is derived from bwlimit and capped at 64 intervals.
2019          */
2020         div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
2021         div = min_t(u32, 64, div);
2022
2023         /* Start new epoch, set deadline */
2024         now = ktime_get();
2025         if (sctx->throttle_deadline == 0) {
2026                 sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
2027                 sctx->throttle_sent = 0;
2028         }
2029
2030         /* Still within the time slice? */
2031         if (ktime_before(now, sctx->throttle_deadline)) {
2032                 /* If current bio is within the limit, send it */
2033                 sctx->throttle_sent += sbio->bio->bi_iter.bi_size;
2034                 if (sctx->throttle_sent <= div_u64(bwlimit, div))
2035                         return;
2036
2037                 /* We're over the limit, sleep for the rest of the slice */
2038                 delta = ktime_ms_delta(sctx->throttle_deadline, now);
2039         } else {
2040                 /* New request after deadline, start new epoch */
2041                 delta = 0;
2042         }
2043
2044         if (delta) {
2045                 long timeout;
2046
2047                 timeout = div_u64(delta * HZ, 1000);
2048                 schedule_timeout_interruptible(timeout);
2049         }
2050
2051         /* Next call will start the deadline period */
2052         sctx->throttle_deadline = 0;
2053 }
2054
2055 static void scrub_submit(struct scrub_ctx *sctx)
2056 {
2057         struct scrub_bio *sbio;
2058
2059         if (sctx->curr == -1)
2060                 return;
2061
2062         scrub_throttle(sctx);
2063
2064         sbio = sctx->bios[sctx->curr];
2065         sctx->curr = -1;
2066         scrub_pending_bio_inc(sctx);
2067         btrfsic_submit_bio(sbio->bio);
2068 }
2069
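/*
 * Add one page to the current read bio, starting a new bio or submitting
 * the pending one when the page is not contiguous with it (physically,
 * logically or by device) or the bio is already full.
 */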
2070 static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
2071                                     struct scrub_page *spage)
2072 {
2073         struct scrub_block *sblock = spage->sblock;
2074         struct scrub_bio *sbio;
2075         const u32 sectorsize = sctx->fs_info->sectorsize;
2076         int ret;
2077
2078 again:
2079         /*
2080          * grab a fresh bio or wait for one to become available
2081          */
2082         while (sctx->curr == -1) {
2083                 spin_lock(&sctx->list_lock);
2084                 sctx->curr = sctx->first_free;
2085                 if (sctx->curr != -1) {
2086                         sctx->first_free = sctx->bios[sctx->curr]->next_free;
2087                         sctx->bios[sctx->curr]->next_free = -1;
2088                         sctx->bios[sctx->curr]->page_count = 0;
2089                         spin_unlock(&sctx->list_lock);
2090                 } else {
2091                         spin_unlock(&sctx->list_lock);
2092                         wait_event(sctx->list_wait, sctx->first_free != -1);
2093                 }
2094         }
2095         sbio = sctx->bios[sctx->curr];
2096         if (sbio->page_count == 0) {
2097                 struct bio *bio;
2098
2099                 sbio->physical = spage->physical;
2100                 sbio->logical = spage->logical;
2101                 sbio->dev = spage->dev;
2102                 bio = sbio->bio;
2103                 if (!bio) {
2104                         bio = btrfs_io_bio_alloc(sctx->pages_per_rd_bio);
2105                         sbio->bio = bio;
2106                 }
2107
2108                 bio->bi_private = sbio;
2109                 bio->bi_end_io = scrub_bio_end_io;
2110                 bio_set_dev(bio, sbio->dev->bdev);
2111                 bio->bi_iter.bi_sector = sbio->physical >> 9;
2112                 bio->bi_opf = REQ_OP_READ;
2113                 sbio->status = 0;
2114         } else if (sbio->physical + sbio->page_count * sectorsize !=
2115                    spage->physical ||
2116                    sbio->logical + sbio->page_count * sectorsize !=
2117                    spage->logical ||
2118                    sbio->dev != spage->dev) {
2119                 scrub_submit(sctx);
2120                 goto again;
2121         }
2122
2123         sbio->pagev[sbio->page_count] = spage;
2124         ret = bio_add_page(sbio->bio, spage->page, sectorsize, 0);
2125         if (ret != sectorsize) {
2126                 if (sbio->page_count < 1) {
2127                         bio_put(sbio->bio);
2128                         sbio->bio = NULL;
2129                         return -EIO;
2130                 }
2131                 scrub_submit(sctx);
2132                 goto again;
2133         }
2134
2135         scrub_block_get(sblock); /* one for the page added to the bio */
2136         atomic_inc(&sblock->outstanding_pages);
2137         sbio->page_count++;
2138         if (sbio->page_count == sctx->pages_per_rd_bio)
2139                 scrub_submit(sctx);
2140
2141         return 0;
2142 }
2143
2144 static void scrub_missing_raid56_end_io(struct bio *bio)
2145 {
2146         struct scrub_block *sblock = bio->bi_private;
2147         struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
2148
2149         if (bio->bi_status)
2150                 sblock->no_io_error_seen = 0;
2151
2152         bio_put(bio);
2153
2154         btrfs_queue_work(fs_info->scrub_workers, &sblock->work);
2155 }
2156
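/*
 * Completion work after rebuilding a block of a missing RAID56 device:
 * verify the rebuilt data and either write it to the dev-replace target or
 * account the read/uncorrectable error.
 */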
2157 static void scrub_missing_raid56_worker(struct btrfs_work *work)
2158 {
2159         struct scrub_block *sblock = container_of(work, struct scrub_block, work);
2160         struct scrub_ctx *sctx = sblock->sctx;
2161         struct btrfs_fs_info *fs_info = sctx->fs_info;
2162         u64 logical;
2163         struct btrfs_device *dev;
2164
2165         logical = sblock->pagev[0]->logical;
2166         dev = sblock->pagev[0]->dev;
2167
2168         if (sblock->no_io_error_seen)
2169                 scrub_recheck_block_checksum(sblock);
2170
2171         if (!sblock->no_io_error_seen) {
2172                 spin_lock(&sctx->stat_lock);
2173                 sctx->stat.read_errors++;
2174                 spin_unlock(&sctx->stat_lock);
2175                 btrfs_err_rl_in_rcu(fs_info,
2176                         "IO error rebuilding logical %llu for dev %s",
2177                         logical, rcu_str_deref(dev->name));
2178         } else if (sblock->header_error || sblock->checksum_error) {
2179                 spin_lock(&sctx->stat_lock);
2180                 sctx->stat.uncorrectable_errors++;
2181                 spin_unlock(&sctx->stat_lock);
2182                 btrfs_err_rl_in_rcu(fs_info,
2183                         "failed to rebuild valid logical %llu for dev %s",
2184                         logical, rcu_str_deref(dev->name));
2185         } else {
2186                 scrub_write_block_to_dev_replace(sblock);
2187         }
2188
2189         if (sctx->is_dev_replace && sctx->flush_all_writes) {
2190                 mutex_lock(&sctx->wr_lock);
2191                 scrub_wr_submit(sctx);
2192                 mutex_unlock(&sctx->wr_lock);
2193         }
2194
2195         scrub_block_put(sblock);
2196         scrub_pending_bio_dec(sctx);
2197 }
2198
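/*
 * The device holding this block is missing, so its data cannot be read
 * directly.  Rebuild it from the remaining stripes via a missing rbio; the
 * result is processed in scrub_missing_raid56_worker().
 */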
2199 static void scrub_missing_raid56_pages(struct scrub_block *sblock)
2200 {
2201         struct scrub_ctx *sctx = sblock->sctx;
2202         struct btrfs_fs_info *fs_info = sctx->fs_info;
2203         u64 length = sblock->page_count * PAGE_SIZE;
2204         u64 logical = sblock->pagev[0]->logical;
2205         struct btrfs_io_context *bioc = NULL;
2206         struct bio *bio;
2207         struct btrfs_raid_bio *rbio;
2208         int ret;
2209         int i;
2210
2211         btrfs_bio_counter_inc_blocked(fs_info);
2212         ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
2213                                &length, &bioc);
2214         if (ret || !bioc || !bioc->raid_map)
2215                 goto bioc_out;
2216
2217         if (WARN_ON(!sctx->is_dev_replace ||
2218                     !(bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
2219                 /*
2220                  * We shouldn't be scrubbing a missing device. Even for dev
2221                  * replace, we should only get here for RAID 5/6. We either
2222                  * managed to mount something with no mirrors remaining or
2223                  * there's a bug in scrub_remap_extent()/btrfs_map_block().
2224                  */
2225                 goto bioc_out;
2226         }
2227
2228         bio = btrfs_io_bio_alloc(0);
2229         bio->bi_iter.bi_sector = logical >> 9;
2230         bio->bi_private = sblock;
2231         bio->bi_end_io = scrub_missing_raid56_end_io;
2232
2233         rbio = raid56_alloc_missing_rbio(fs_info, bio, bioc, length);
2234         if (!rbio)
2235                 goto rbio_out;
2236
2237         for (i = 0; i < sblock->page_count; i++) {
2238                 struct scrub_page *spage = sblock->pagev[i];
2239
2240                 raid56_add_scrub_pages(rbio, spage->page, spage->logical);
2241         }
2242
2243         btrfs_init_work(&sblock->work, scrub_missing_raid56_worker, NULL, NULL);
2244         scrub_block_get(sblock);
2245         scrub_pending_bio_inc(sctx);
2246         raid56_submit_missing_rbio(rbio);
2247         return;
2248
2249 rbio_out:
2250         bio_put(bio);
2251 bioc_out:
2252         btrfs_bio_counter_dec(fs_info);
2253         btrfs_put_bioc(bioc);
2254         spin_lock(&sctx->stat_lock);
2255         sctx->stat.malloc_errors++;
2256         spin_unlock(&sctx->stat_lock);
2257 }
2258
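/*
 * Split the range [logical, logical + len) into sectors, allocate one
 * scrub_page per sector and queue them for reading.  Blocks on a missing
 * device (RAID56 dev-replace) are rebuilt instead of read.
 */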
2259 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len,
2260                        u64 physical, struct btrfs_device *dev, u64 flags,
2261                        u64 gen, int mirror_num, u8 *csum,
2262                        u64 physical_for_dev_replace)
2263 {
2264         struct scrub_block *sblock;
2265         const u32 sectorsize = sctx->fs_info->sectorsize;
2266         int index;
2267
2268         sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2269         if (!sblock) {
2270                 spin_lock(&sctx->stat_lock);
2271                 sctx->stat.malloc_errors++;
2272                 spin_unlock(&sctx->stat_lock);
2273                 return -ENOMEM;
2274         }
2275
2276         /* one ref inside this function, plus one for each page added to
2277          * a bio later on */
2278         refcount_set(&sblock->refs, 1);
2279         sblock->sctx = sctx;
2280         sblock->no_io_error_seen = 1;
2281
2282         for (index = 0; len > 0; index++) {
2283                 struct scrub_page *spage;
2284                 /*
2285                  * Here we will allocate one page for one sector to scrub.
2286                  * This is fine if PAGE_SIZE == sectorsize, but will cost
2287                  * more memory in the PAGE_SIZE > sectorsize case.
2288                  */
2289                 u32 l = min(sectorsize, len);
2290
2291                 spage = kzalloc(sizeof(*spage), GFP_KERNEL);
2292                 if (!spage) {
2293 leave_nomem:
2294                         spin_lock(&sctx->stat_lock);
2295                         sctx->stat.malloc_errors++;
2296                         spin_unlock(&sctx->stat_lock);
2297                         scrub_block_put(sblock);
2298                         return -ENOMEM;
2299                 }
2300                 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2301                 scrub_page_get(spage);
2302                 sblock->pagev[index] = spage;
2303                 spage->sblock = sblock;
2304                 spage->dev = dev;
2305                 spage->flags = flags;
2306                 spage->generation = gen;
2307                 spage->logical = logical;
2308                 spage->physical = physical;
2309                 spage->physical_for_dev_replace = physical_for_dev_replace;
2310                 spage->mirror_num = mirror_num;
2311                 if (csum) {
2312                         spage->have_csum = 1;
2313                         memcpy(spage->csum, csum, sctx->fs_info->csum_size);
2314                 } else {
2315                         spage->have_csum = 0;
2316                 }
2317                 sblock->page_count++;
2318                 spage->page = alloc_page(GFP_KERNEL);
2319                 if (!spage->page)
2320                         goto leave_nomem;
2321                 len -= l;
2322                 logical += l;
2323                 physical += l;
2324                 physical_for_dev_replace += l;
2325         }
2326
2327         WARN_ON(sblock->page_count == 0);
2328         if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2329                 /*
2330                  * This case should only be hit for RAID 5/6 device replace. See
2331                  * the comment in scrub_missing_raid56_pages() for details.
2332                  */
2333                 scrub_missing_raid56_pages(sblock);
2334         } else {
2335                 for (index = 0; index < sblock->page_count; index++) {
2336                         struct scrub_page *spage = sblock->pagev[index];
2337                         int ret;
2338
2339                         ret = scrub_add_page_to_rd_bio(sctx, spage);
2340                         if (ret) {
2341                                 scrub_block_put(sblock);
2342                                 return ret;
2343                         }
2344                 }
2345
2346                 if (flags & BTRFS_EXTENT_FLAG_SUPER)
2347                         scrub_submit(sctx);
2348         }
2349
2350         /* last one frees, either here or in bio completion for last page */
2351         scrub_block_put(sblock);
2352         return 0;
2353 }
2354
2355 static void scrub_bio_end_io(struct bio *bio)
2356 {
2357         struct scrub_bio *sbio = bio->bi_private;
2358         struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
2359
2360         sbio->status = bio->bi_status;
2361         sbio->bio = bio;
2362
2363         btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
2364 }
2365
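/*
 * Completion work for a read bio: propagate I/O errors to the affected
 * pages, complete the scrub_blocks whose last outstanding page just
 * finished and put the scrub_bio back on the free list.
 */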
2366 static void scrub_bio_end_io_worker(struct btrfs_work *work)
2367 {
2368         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2369         struct scrub_ctx *sctx = sbio->sctx;
2370         int i;
2371
2372         BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
2373         if (sbio->status) {
2374                 for (i = 0; i < sbio->page_count; i++) {
2375                         struct scrub_page *spage = sbio->pagev[i];
2376
2377                         spage->io_error = 1;
2378                         spage->sblock->no_io_error_seen = 0;
2379                 }
2380         }
2381
2382         /* now complete the scrub_block items that have all pages completed */
2383         for (i = 0; i < sbio->page_count; i++) {
2384                 struct scrub_page *spage = sbio->pagev[i];
2385                 struct scrub_block *sblock = spage->sblock;
2386
2387                 if (atomic_dec_and_test(&sblock->outstanding_pages))
2388                         scrub_block_complete(sblock);
2389                 scrub_block_put(sblock);
2390         }
2391
2392         bio_put(sbio->bio);
2393         sbio->bio = NULL;
2394         spin_lock(&sctx->list_lock);
2395         sbio->next_free = sctx->first_free;
2396         sctx->first_free = sbio->index;
2397         spin_unlock(&sctx->list_lock);
2398
2399         if (sctx->is_dev_replace && sctx->flush_all_writes) {
2400                 mutex_lock(&sctx->wr_lock);
2401                 scrub_wr_submit(sctx);
2402                 mutex_unlock(&sctx->wr_lock);
2403         }
2404
2405         scrub_pending_bio_dec(sctx);
2406 }
2407
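/*
 * Mark the sectors covered by [start, start + len) in a per-stripe bitmap.
 * The range may wrap around the end of the stripe, in which case the
 * remainder is marked from the beginning of the bitmap.
 */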
2408 static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2409                                        unsigned long *bitmap,
2410                                        u64 start, u32 len)
2411 {
2412         u64 offset;
2413         u32 nsectors;
2414         u32 sectorsize_bits = sparity->sctx->fs_info->sectorsize_bits;
2415
2416         if (len >= sparity->stripe_len) {
2417                 bitmap_set(bitmap, 0, sparity->nsectors);
2418                 return;
2419         }
2420
2421         start -= sparity->logic_start;
2422         start = div64_u64_rem(start, sparity->stripe_len, &offset);
2423         offset = offset >> sectorsize_bits;
2424         nsectors = len >> sectorsize_bits;
2425
2426         if (offset + nsectors <= sparity->nsectors) {
2427                 bitmap_set(bitmap, offset, nsectors);
2428                 return;
2429         }
2430
2431         bitmap_set(bitmap, offset, sparity->nsectors - offset);
2432         bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
2433 }
2434
2435 static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2436                                                    u64 start, u32 len)
2437 {
2438         __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
2439 }
2440
2441 static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2442                                                   u64 start, u32 len)
2443 {
2444         __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
2445 }
2446
2447 static void scrub_block_complete(struct scrub_block *sblock)
2448 {
2449         int corrupted = 0;
2450
2451         if (!sblock->no_io_error_seen) {
2452                 corrupted = 1;
2453                 scrub_handle_errored_block(sblock);
2454         } else {
2455                 /*
2456                  * If the block has a checksum error, it is written via the
2457                  * repair mechanism in the dev-replace case; otherwise it is
2458                  * written here in the dev-replace case.
2459                  */
2460                 corrupted = scrub_checksum(sblock);
2461                 if (!corrupted && sblock->sctx->is_dev_replace)
2462                         scrub_write_block_to_dev_replace(sblock);
2463         }
2464
2465         if (sblock->sparity && corrupted && !sblock->data_corrected) {
2466                 u64 start = sblock->pagev[0]->logical;
2467                 u64 end = sblock->pagev[sblock->page_count - 1]->logical +
2468                           sblock->sctx->fs_info->sectorsize;
2469
2470                 ASSERT(end - start <= U32_MAX);
2471                 scrub_parity_mark_sectors_error(sblock->sparity,
2472                                                 start, end - start);
2473         }
2474 }
2475
2476 static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *sum)
2477 {
2478         sctx->stat.csum_discards += sum->len >> sctx->fs_info->sectorsize_bits;
2479         list_del(&sum->list);
2480         kfree(sum);
2481 }
2482
2483 /*
2484  * Find the desired csum for range [logical, logical + sectorsize), and store
2485  * the csum into @csum.
2486  *
2487  * The search source is sctx->csum_list, which is a pre-populated list
2488  * storing bytenr ordered csum ranges.  We're responsible for cleaning up any
2489  * range that is before @logical.
2490  *
2491  * Return 0 if there is no csum for the range.
2492  * Return 1 if there is a csum for the range, which is copied to @csum.
2493  */
2494 static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
2495 {
2496         bool found = false;
2497
2498         while (!list_empty(&sctx->csum_list)) {
2499                 struct btrfs_ordered_sum *sum = NULL;
2500                 unsigned long index;
2501                 unsigned long num_sectors;
2502
2503                 sum = list_first_entry(&sctx->csum_list,
2504                                        struct btrfs_ordered_sum, list);
2505                 /* The current csum range is beyond our range, no csum found */
2506                 if (sum->bytenr > logical)
2507                         break;
2508
2509                 /*
2510                  * The current sum is before our bytenr. Since scrub is always
2511                  * done in bytenr order, this csum will never be used again;
2512                  * clean it up so that later calls won't bother with the range,
2513                  * and continue searching from the next range.
2514                  */
2515                 if (sum->bytenr + sum->len <= logical) {
2516                         drop_csum_range(sctx, sum);
2517                         continue;
2518                 }
2519
2520                 /* Now the csum range covers our bytenr, copy the csum */
2521                 found = true;
2522                 index = (logical - sum->bytenr) >> sctx->fs_info->sectorsize_bits;
2523                 num_sectors = sum->len >> sctx->fs_info->sectorsize_bits;
2524
2525                 memcpy(csum, sum->sums + index * sctx->fs_info->csum_size,
2526                        sctx->fs_info->csum_size);
2527
2528                 /* Cleanup the range if we're at the end of the csum range */
2529                 if (index == num_sectors - 1)
2530                         drop_csum_range(sctx, sum);
2531                 break;
2532         }
2533         if (!found)
2534                 return 0;
2535         return 1;
2536 }
2537
2538 /* scrub_extent() tries to collect up to 64 kB for each bio */
2539 static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
2540                         u64 logical, u32 len,
2541                         u64 physical, struct btrfs_device *dev, u64 flags,
2542                         u64 gen, int mirror_num, u64 physical_for_dev_replace)
2543 {
2544         int ret;
2545         u8 csum[BTRFS_CSUM_SIZE];
2546         u32 blocksize;
2547
2548         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2549                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2550                         blocksize = map->stripe_len;
2551                 else
2552                         blocksize = sctx->fs_info->sectorsize;
2553                 spin_lock(&sctx->stat_lock);
2554                 sctx->stat.data_extents_scrubbed++;
2555                 sctx->stat.data_bytes_scrubbed += len;
2556                 spin_unlock(&sctx->stat_lock);
2557         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2558                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2559                         blocksize = map->stripe_len;
2560                 else
2561                         blocksize = sctx->fs_info->nodesize;
2562                 spin_lock(&sctx->stat_lock);
2563                 sctx->stat.tree_extents_scrubbed++;
2564                 sctx->stat.tree_bytes_scrubbed += len;
2565                 spin_unlock(&sctx->stat_lock);
2566         } else {
2567                 blocksize = sctx->fs_info->sectorsize;
2568                 WARN_ON(1);
2569         }
2570
2571         while (len) {
2572                 u32 l = min(len, blocksize);
2573                 int have_csum = 0;
2574
2575                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2576                         /* push csums to sbio */
2577                         have_csum = scrub_find_csum(sctx, logical, csum);
2578                         if (have_csum == 0)
2579                                 ++sctx->stat.no_csum;
2580                 }
2581                 ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
2582                                   mirror_num, have_csum ? csum : NULL,
2583                                   physical_for_dev_replace);
2584                 if (ret)
2585                         return ret;
2586                 len -= l;
2587                 logical += l;
2588                 physical += l;
2589                 physical_for_dev_replace += l;
2590         }
2591         return 0;
2592 }
2593
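/*
 * Like scrub_pages(), but for the data stripes covered by a parity scrub:
 * each page takes an extra reference on the sparity->spages list so the
 * parity check can reuse it after the read completes.
 */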
2594 static int scrub_pages_for_parity(struct scrub_parity *sparity,
2595                                   u64 logical, u32 len,
2596                                   u64 physical, struct btrfs_device *dev,
2597                                   u64 flags, u64 gen, int mirror_num, u8 *csum)
2598 {
2599         struct scrub_ctx *sctx = sparity->sctx;
2600         struct scrub_block *sblock;
2601         const u32 sectorsize = sctx->fs_info->sectorsize;
2602         int index;
2603
2604         ASSERT(IS_ALIGNED(len, sectorsize));
2605
2606         sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2607         if (!sblock) {
2608                 spin_lock(&sctx->stat_lock);
2609                 sctx->stat.malloc_errors++;
2610                 spin_unlock(&sctx->stat_lock);
2611                 return -ENOMEM;
2612         }
2613
2614         /* one ref inside this function, plus one for each page added to
2615          * a bio later on */
2616         refcount_set(&sblock->refs, 1);
2617         sblock->sctx = sctx;
2618         sblock->no_io_error_seen = 1;
2619         sblock->sparity = sparity;
2620         scrub_parity_get(sparity);
2621
2622         for (index = 0; len > 0; index++) {
2623                 struct scrub_page *spage;
2624
2625                 spage = kzalloc(sizeof(*spage), GFP_KERNEL);
2626                 if (!spage) {
2627 leave_nomem:
2628                         spin_lock(&sctx->stat_lock);
2629                         sctx->stat.malloc_errors++;
2630                         spin_unlock(&sctx->stat_lock);
2631                         scrub_block_put(sblock);
2632                         return -ENOMEM;
2633                 }
2634                 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2635                 /* For scrub block */
2636                 scrub_page_get(spage);
2637                 sblock->pagev[index] = spage;
2638                 /* For scrub parity */
2639                 scrub_page_get(spage);
2640                 list_add_tail(&spage->list, &sparity->spages);
2641                 spage->sblock = sblock;
2642                 spage->dev = dev;
2643                 spage->flags = flags;
2644                 spage->generation = gen;
2645                 spage->logical = logical;
2646                 spage->physical = physical;
2647                 spage->mirror_num = mirror_num;
2648                 if (csum) {
2649                         spage->have_csum = 1;
2650                         memcpy(spage->csum, csum, sctx->fs_info->csum_size);
2651                 } else {
2652                         spage->have_csum = 0;
2653                 }
2654                 sblock->page_count++;
2655                 spage->page = alloc_page(GFP_KERNEL);
2656                 if (!spage->page)
2657                         goto leave_nomem;
2658
2659
2660                 /* Iterate over the stripe range in sectorsize steps */
2661                 len -= sectorsize;
2662                 logical += sectorsize;
2663                 physical += sectorsize;
2664         }
2665
2666         WARN_ON(sblock->page_count == 0);
2667         for (index = 0; index < sblock->page_count; index++) {
2668                 struct scrub_page *spage = sblock->pagev[index];
2669                 int ret;
2670
2671                 ret = scrub_add_page_to_rd_bio(sctx, spage);
2672                 if (ret) {
2673                         scrub_block_put(sblock);
2674                         return ret;
2675                 }
2676         }
2677
2678         /* last one frees, either here or in bio completion for last page */
2679         scrub_block_put(sblock);
2680         return 0;
2681 }
2682
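/*
 * Scrub one extent that belongs to a parity stripe.  Data sectors without a
 * checksum are skipped; sectors on a missing device are only marked as
 * errors in the parity bitmap.
 */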
2683 static int scrub_extent_for_parity(struct scrub_parity *sparity,
2684                                    u64 logical, u32 len,
2685                                    u64 physical, struct btrfs_device *dev,
2686                                    u64 flags, u64 gen, int mirror_num)
2687 {
2688         struct scrub_ctx *sctx = sparity->sctx;
2689         int ret;
2690         u8 csum[BTRFS_CSUM_SIZE];
2691         u32 blocksize;
2692
2693         if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2694                 scrub_parity_mark_sectors_error(sparity, logical, len);
2695                 return 0;
2696         }
2697
2698         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2699                 blocksize = sparity->stripe_len;
2700         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2701                 blocksize = sparity->stripe_len;
2702         } else {
2703                 blocksize = sctx->fs_info->sectorsize;
2704                 WARN_ON(1);
2705         }
2706
2707         while (len) {
2708                 u32 l = min(len, blocksize);
2709                 int have_csum = 0;
2710
2711                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2712                         /* push csums to sbio */
2713                         have_csum = scrub_find_csum(sctx, logical, csum);
2714                         if (have_csum == 0)
2715                                 goto skip;
2716                 }
2717                 ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
2718                                              flags, gen, mirror_num,
2719                                              have_csum ? csum : NULL);
2720                 if (ret)
2721                         return ret;
2722 skip:
2723                 len -= l;
2724                 logical += l;
2725                 physical += l;
2726         }
2727         return 0;
2728 }
2729
2730 /*
2731  * Given a physical address, this calculates its
2732  * logical offset. If this is a parity stripe, it returns
2733  * the leftmost data stripe's logical offset.
2734  *
2735  * Return 0 if it is a data stripe, 1 if it is a parity stripe.
2736  */
2737 static int get_raid56_logic_offset(u64 physical, int num,
2738                                    struct map_lookup *map, u64 *offset,
2739                                    u64 *stripe_start)
2740 {
2741         int i;
2742         int j = 0;
2743         u64 stripe_nr;
2744         u64 last_offset;
2745         u32 stripe_index;
2746         u32 rot;
2747         const int data_stripes = nr_data_stripes(map);
2748
2749         last_offset = (physical - map->stripes[num].physical) * data_stripes;
2750         if (stripe_start)
2751                 *stripe_start = last_offset;
2752
2753         *offset = last_offset;
2754         for (i = 0; i < data_stripes; i++) {
2755                 *offset = last_offset + i * map->stripe_len;
2756
2757                 stripe_nr = div64_u64(*offset, map->stripe_len);
2758                 stripe_nr = div_u64(stripe_nr, data_stripes);
2759
2760                 /* Work out the disk rotation on this stripe-set */
2761                 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
2762                 /* calculate which stripe this data is located on */
2763                 rot += i;
2764                 stripe_index = rot % map->num_stripes;
2765                 if (stripe_index == num)
2766                         return 0;
2767                 if (stripe_index < num)
2768                         j++;
2769         }
2770         *offset = last_offset + j * map->stripe_len;
2771         return 1;
2772 }
2773
2774 static void scrub_free_parity(struct scrub_parity *sparity)
2775 {
2776         struct scrub_ctx *sctx = sparity->sctx;
2777         struct scrub_page *curr, *next;
2778         int nbits;
2779
2780         nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
2781         if (nbits) {
2782                 spin_lock(&sctx->stat_lock);
2783                 sctx->stat.read_errors += nbits;
2784                 sctx->stat.uncorrectable_errors += nbits;
2785                 spin_unlock(&sctx->stat_lock);
2786         }
2787
2788         list_for_each_entry_safe(curr, next, &sparity->spages, list) {
2789                 list_del_init(&curr->list);
2790                 scrub_page_put(curr);
2791         }
2792
2793         kfree(sparity);
2794 }
2795
2796 static void scrub_parity_bio_endio_worker(struct btrfs_work *work)
2797 {
2798         struct scrub_parity *sparity = container_of(work, struct scrub_parity,
2799                                                     work);
2800         struct scrub_ctx *sctx = sparity->sctx;
2801
2802         scrub_free_parity(sparity);
2803         scrub_pending_bio_dec(sctx);
2804 }
2805
2806 static void scrub_parity_bio_endio(struct bio *bio)
2807 {
2808         struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
2809         struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
2810
2811         if (bio->bi_status)
2812                 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2813                           sparity->nsectors);
2814
2815         bio_put(bio);
2816
2817         btrfs_init_work(&sparity->work, scrub_parity_bio_endio_worker, NULL,
2818                         NULL);
2819         btrfs_queue_work(fs_info->scrub_parity_workers, &sparity->work);
2820 }
2821
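/*
 * Clear sectors that already failed reads (ebitmap) from the set of data
 * sectors to check (dbitmap), then map the full stripe and hand the
 * remaining sectors to a scrub rbio so the RAID56 layer can verify and,
 * if needed, rewrite the parity.  If nothing is left to check, or mapping
 * or allocation fails, the errors are accounted and the parity context is
 * freed.
 */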
2822 static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2823 {
2824         struct scrub_ctx *sctx = sparity->sctx;
2825         struct btrfs_fs_info *fs_info = sctx->fs_info;
2826         struct bio *bio;
2827         struct btrfs_raid_bio *rbio;
2828         struct btrfs_io_context *bioc = NULL;
2829         u64 length;
2830         int ret;
2831
2832         if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
2833                            sparity->nsectors))
2834                 goto out;
2835
2836         length = sparity->logic_end - sparity->logic_start;
2837
2838         btrfs_bio_counter_inc_blocked(fs_info);
2839         ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
2840                                &length, &bioc);
2841         if (ret || !bioc || !bioc->raid_map)
2842                 goto bioc_out;
2843
2844         bio = btrfs_io_bio_alloc(0);
2845         bio->bi_iter.bi_sector = sparity->logic_start >> 9;
2846         bio->bi_private = sparity;
2847         bio->bi_end_io = scrub_parity_bio_endio;
2848
2849         rbio = raid56_parity_alloc_scrub_rbio(fs_info, bio, bioc,
2850                                               length, sparity->scrub_dev,
2851                                               sparity->dbitmap,
2852                                               sparity->nsectors);
2853         if (!rbio)
2854                 goto rbio_out;
2855
2856         scrub_pending_bio_inc(sctx);
2857         raid56_parity_submit_scrub_rbio(rbio);
2858         return;
2859
2860 rbio_out:
2861         bio_put(bio);
2862 bioc_out:
2863         btrfs_bio_counter_dec(fs_info);
2864         btrfs_put_bioc(bioc);
2865         bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2866                   sparity->nsectors);
2867         spin_lock(&sctx->stat_lock);
2868         sctx->stat.malloc_errors++;
2869         spin_unlock(&sctx->stat_lock);
2870 out:
2871         scrub_free_parity(sparity);
2872 }
2873
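/*
 * Bytes needed for one sector bitmap covering a full stripe.  With a 64K
 * stripe_len and 4K sectors, for example, nsectors is 16, which rounds up
 * to a single unsigned long (8 bytes on 64-bit).  Two such bitmaps
 * (dbitmap and ebitmap) are allocated together with the scrub_parity.
 */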
2874 static inline int scrub_calc_parity_bitmap_len(int nsectors)
2875 {
2876         return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long);
2877 }
2878
2879 static void scrub_parity_get(struct scrub_parity *sparity)
2880 {
2881         refcount_inc(&sparity->refs);
2882 }
2883
2884 static void scrub_parity_put(struct scrub_parity *sparity)
2885 {
2886         if (!refcount_dec_and_test(&sparity->refs))
2887                 return;
2888
2889         scrub_parity_check_and_repair(sparity);
2890 }
2891
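/*
 * Scrub one full-stripe range [logic_start, logic_end) of a RAID56 chunk.
 * The extent tree is walked for extents overlapping the range, their data
 * sectors are marked in dbitmap and read via scrub_extent_for_parity(),
 * and the last scrub_parity_put() triggers the parity check and repair.
 */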
2892 static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
2893                                                   struct map_lookup *map,
2894                                                   struct btrfs_device *sdev,
2895                                                   struct btrfs_path *path,
2896                                                   u64 logic_start,
2897                                                   u64 logic_end)
2898 {
2899         struct btrfs_fs_info *fs_info = sctx->fs_info;
2900         struct btrfs_root *root = fs_info->extent_root;
2901         struct btrfs_root *csum_root = fs_info->csum_root;
2902         struct btrfs_extent_item *extent;
2903         struct btrfs_io_context *bioc = NULL;
2904         u64 flags;
2905         int ret;
2906         int slot;
2907         struct extent_buffer *l;
2908         struct btrfs_key key;
2909         u64 generation;
2910         u64 extent_logical;
2911         u64 extent_physical;
2912         /* Check the comment in scrub_stripe() for why u32 is enough here */
2913         u32 extent_len;
2914         u64 mapped_length;
2915         struct btrfs_device *extent_dev;
2916         struct scrub_parity *sparity;
2917         int nsectors;
2918         int bitmap_len;
2919         int extent_mirror_num;
2920         int stop_loop = 0;
2921
2922         ASSERT(map->stripe_len <= U32_MAX);
2923         nsectors = map->stripe_len >> fs_info->sectorsize_bits;
2924         bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
2925         sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
2926                           GFP_NOFS);
2927         if (!sparity) {
2928                 spin_lock(&sctx->stat_lock);
2929                 sctx->stat.malloc_errors++;
2930                 spin_unlock(&sctx->stat_lock);
2931                 return -ENOMEM;
2932         }
2933
2934         ASSERT(map->stripe_len <= U32_MAX);
2935         sparity->stripe_len = map->stripe_len;
2936         sparity->nsectors = nsectors;
2937         sparity->sctx = sctx;
2938         sparity->scrub_dev = sdev;
2939         sparity->logic_start = logic_start;
2940         sparity->logic_end = logic_end;
2941         refcount_set(&sparity->refs, 1);
2942         INIT_LIST_HEAD(&sparity->spages);
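        /*
         * The two bitmaps share one allocation: dbitmap marks sectors that
         * carry data to be checked, ebitmap marks sectors that hit errors.
         */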
2943         sparity->dbitmap = sparity->bitmap;
2944         sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
2945
2946         ret = 0;
2947         while (logic_start < logic_end) {
2948                 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2949                         key.type = BTRFS_METADATA_ITEM_KEY;
2950                 else
2951                         key.type = BTRFS_EXTENT_ITEM_KEY;
2952                 key.objectid = logic_start;
2953                 key.offset = (u64)-1;
2954
2955                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2956                 if (ret < 0)
2957                         goto out;
2958
2959                 if (ret > 0) {
2960                         ret = btrfs_previous_extent_item(root, path, 0);
2961                         if (ret < 0)
2962                                 goto out;
2963                         if (ret > 0) {
2964                                 btrfs_release_path(path);
2965                                 ret = btrfs_search_slot(NULL, root, &key,
2966                                                         path, 0, 0);
2967                                 if (ret < 0)
2968                                         goto out;
2969                         }
2970                 }
2971
2972                 stop_loop = 0;
2973                 while (1) {
2974                         u64 bytes;
2975
2976                         l = path->nodes[0];
2977                         slot = path->slots[0];
2978                         if (slot >= btrfs_header_nritems(l)) {
2979                                 ret = btrfs_next_leaf(root, path);
2980                                 if (ret == 0)
2981                                         continue;
2982                                 if (ret < 0)
2983                                         goto out;
2984
2985                                 stop_loop = 1;
2986                                 break;
2987                         }
2988                         btrfs_item_key_to_cpu(l, &key, slot);
2989
2990                         if (key.type != BTRFS_EXTENT_ITEM_KEY &&
2991                             key.type != BTRFS_METADATA_ITEM_KEY)
2992                                 goto next;
2993
2994                         if (key.type == BTRFS_METADATA_ITEM_KEY)
2995                                 bytes = fs_info->nodesize;
2996                         else
2997                                 bytes = key.offset;
2998
2999                         if (key.objectid + bytes <= logic_start)
3000                                 goto next;
3001
3002                         if (key.objectid >= logic_end) {
3003                                 stop_loop = 1;
3004                                 break;
3005                         }
3006
3007                         while (key.objectid >= logic_start + map->stripe_len)
3008                                 logic_start += map->stripe_len;
3009
3010                         extent = btrfs_item_ptr(l, slot,
3011                                                 struct btrfs_extent_item);
3012                         flags = btrfs_extent_flags(l, extent);
3013                         generation = btrfs_extent_generation(l, extent);
3014
3015                         if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3016                             (key.objectid < logic_start ||
3017                              key.objectid + bytes >
3018                              logic_start + map->stripe_len)) {
3019                                 btrfs_err(fs_info,
3020                                           "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
3021                                           key.objectid, logic_start);
3022                                 spin_lock(&sctx->stat_lock);
3023                                 sctx->stat.uncorrectable_errors++;
3024                                 spin_unlock(&sctx->stat_lock);
3025                                 goto next;
3026                         }
3027 again:
3028                         extent_logical = key.objectid;
3029                         ASSERT(bytes <= U32_MAX);
3030                         extent_len = bytes;
3031
3032                         if (extent_logical < logic_start) {
3033                                 extent_len -= logic_start - extent_logical;
3034                                 extent_logical = logic_start;
3035                         }
3036
3037                         if (extent_logical + extent_len >
3038                             logic_start + map->stripe_len)
3039                                 extent_len = logic_start + map->stripe_len -
3040                                              extent_logical;
3041
3042                         scrub_parity_mark_sectors_data(sparity, extent_logical,
3043                                                        extent_len);
3044
3045                         mapped_length = extent_len;
3046                         bioc = NULL;
3047                         ret = btrfs_map_block(fs_info, BTRFS_MAP_READ,
3048                                         extent_logical, &mapped_length, &bioc,
3049                                         0);
3050                         if (!ret) {
3051                                 if (!bioc || mapped_length < extent_len)
3052                                         ret = -EIO;
3053                         }
3054                         if (ret) {
3055                                 btrfs_put_bioc(bioc);
3056                                 goto out;
3057                         }
3058                         extent_physical = bioc->stripes[0].physical;
3059                         extent_mirror_num = bioc->mirror_num;
3060                         extent_dev = bioc->stripes[0].dev;
3061                         btrfs_put_bioc(bioc);
3062
3063                         ret = btrfs_lookup_csums_range(csum_root,
3064                                                 extent_logical,
3065                                                 extent_logical + extent_len - 1,
3066                                                 &sctx->csum_list, 1);
3067                         if (ret)
3068                                 goto out;
3069
3070                         ret = scrub_extent_for_parity(sparity, extent_logical,
3071                                                       extent_len,
3072                                                       extent_physical,
3073                                                       extent_dev, flags,
3074                                                       generation,
3075                                                       extent_mirror_num);
3076
3077                         scrub_free_csums(sctx);
3078
3079                         if (ret)
3080                                 goto out;
3081
3082                         if (extent_logical + extent_len <
3083                             key.objectid + bytes) {
3084                                 logic_start += map->stripe_len;
3085
3086                                 if (logic_start >= logic_end) {
3087                                         stop_loop = 1;
3088                                         break;
3089                                 }
3090
3091                                 if (logic_start < key.objectid + bytes) {
3092                                         cond_resched();
3093                                         goto again;
3094                                 }
3095                         }
3096 next:
3097                         path->slots[0]++;
3098                 }
3099
3100                 btrfs_release_path(path);
3101
3102                 if (stop_loop)
3103                         break;
3104
3105                 logic_start += map->stripe_len;
3106         }
3107 out:
3108         if (ret < 0) {
3109                 ASSERT(logic_end - logic_start <= U32_MAX);
3110                 scrub_parity_mark_sectors_error(sparity, logic_start,
3111                                                 logic_end - logic_start);
3112         }
3113         scrub_parity_put(sparity);
3114         scrub_submit(sctx);
3115         mutex_lock(&sctx->wr_lock);
3116         scrub_wr_submit(sctx);
3117         mutex_unlock(&sctx->wr_lock);
3118
3119         btrfs_release_path(path);
3120         return ret < 0 ? ret : 0;
3121 }
3122
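/*
 * On zoned filesystems the dev-replace target must be written sequentially,
 * so flush every queued read and write bio and wait until nothing is in
 * flight before the next extent is scrubbed.
 */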
3123 static void sync_replace_for_zoned(struct scrub_ctx *sctx)
3124 {
3125         if (!btrfs_is_zoned(sctx->fs_info))
3126                 return;
3127
3128         sctx->flush_all_writes = true;
3129         scrub_submit(sctx);
3130         mutex_lock(&sctx->wr_lock);
3131         scrub_wr_submit(sctx);
3132         mutex_unlock(&sctx->wr_lock);
3133
3134         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3135 }
3136
3137 static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
3138                                         u64 physical, u64 physical_end)
3139 {
3140         struct btrfs_fs_info *fs_info = sctx->fs_info;
3141         int ret = 0;
3142
3143         if (!btrfs_is_zoned(fs_info))
3144                 return 0;
3145
3146         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3147
3148         mutex_lock(&sctx->wr_lock);
3149         if (sctx->write_pointer < physical_end) {
3150                 ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical,
3151                                                     physical,
3152                                                     sctx->write_pointer);
3153                 if (ret)
3154                         btrfs_err(fs_info,
3155                                   "zoned: failed to recover write pointer");
3156         }
3157         mutex_unlock(&sctx->wr_lock);
3158         btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);
3159
3160         return ret;
3161 }
3162
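/*
 * Scrub one device stripe of a chunk.  The offset into the chunk, the
 * per-iteration increment and the mirror number are derived from the RAID
 * profile, then the extent tree is walked stripe by stripe and every extent
 * found is scrubbed; for RAID56, parity stripes are handed off to
 * scrub_raid56_parity().
 */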
3163 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
3164                                            struct map_lookup *map,
3165                                            struct btrfs_device *scrub_dev,
3166                                            int num, u64 base, u64 length,
3167                                            struct btrfs_block_group *cache)
3168 {
3169         struct btrfs_path *path, *ppath;
3170         struct btrfs_fs_info *fs_info = sctx->fs_info;
3171         struct btrfs_root *root = fs_info->extent_root;
3172         struct btrfs_root *csum_root = fs_info->csum_root;
3173         struct btrfs_extent_item *extent;
3174         struct blk_plug plug;
3175         u64 flags;
3176         int ret;
3177         int slot;
3178         u64 nstripes;
3179         struct extent_buffer *l;
3180         u64 physical;
3181         u64 logical;
3182         u64 logic_end;
3183         u64 physical_end;
3184         u64 generation;
3185         int mirror_num;
3186         struct reada_control *reada1;
3187         struct reada_control *reada2;
3188         struct btrfs_key key;
3189         struct btrfs_key key_end;
3190         u64 increment = map->stripe_len;
3191         u64 offset;
3192         u64 extent_logical;
3193         u64 extent_physical;
3194         /*
3195          * Unlike chunk length, extent length should never go beyond
3196          * BTRFS_MAX_EXTENT_SIZE, thus u32 is enough here.
3197          */
3198         u32 extent_len;
3199         u64 stripe_logical;
3200         u64 stripe_end;
3201         struct btrfs_device *extent_dev;
3202         int extent_mirror_num;
3203         int stop_loop = 0;
3204
3205         physical = map->stripes[num].physical;
3206         offset = 0;
3207         nstripes = div64_u64(length, map->stripe_len);
3208         mirror_num = 1;
3209         increment = map->stripe_len;
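        /*
         * Example of the per-profile math below (assuming RAID10 with
         * num_stripes = 4 and sub_stripes = 2): for num = 3, factor = 2,
         * so offset = stripe_len * 1, increment = stripe_len * 2 and
         * mirror_num = 2, i.e. this device holds the second copy of every
         * other chunk stripe, starting one stripe length into the chunk.
         */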
3210         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3211                 offset = map->stripe_len * num;
3212                 increment = map->stripe_len * map->num_stripes;
3213         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3214                 int factor = map->num_stripes / map->sub_stripes;
3215                 offset = map->stripe_len * (num / map->sub_stripes);
3216                 increment = map->stripe_len * factor;
3217                 mirror_num = num % map->sub_stripes + 1;
3218         } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
3219                 mirror_num = num % map->num_stripes + 1;
3220         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3221                 mirror_num = num % map->num_stripes + 1;
3222         } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3223                 get_raid56_logic_offset(physical, num, map, &offset, NULL);
3224                 increment = map->stripe_len * nr_data_stripes(map);
3225         }
3226
3227         path = btrfs_alloc_path();
3228         if (!path)
3229                 return -ENOMEM;
3230
3231         ppath = btrfs_alloc_path();
3232         if (!ppath) {
3233                 btrfs_free_path(path);
3234                 return -ENOMEM;
3235         }
3236
3237         /*
3238          * Work on the commit root. The related disk blocks are static as
3239          * long as COW is applied. This means it is safe to rewrite
3240          * them to repair disk errors without any race conditions.
3241          */
3242         path->search_commit_root = 1;
3243         path->skip_locking = 1;
3244
3245         ppath->search_commit_root = 1;
3246         ppath->skip_locking = 1;
3247         /*
3248          * Trigger readahead for the extent tree and csum tree and wait for
3249          * completion. During readahead, the scrub is officially paused
3250          * to not hold off transaction commits.
3251          */
3252         logical = base + offset;
3253         physical_end = physical + nstripes * map->stripe_len;
3254         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3255                 get_raid56_logic_offset(physical_end, num,
3256                                         map, &logic_end, NULL);
3257                 logic_end += base;
3258         } else {
3259                 logic_end = logical + increment * nstripes;
3260         }
3261         wait_event(sctx->list_wait,
3262                    atomic_read(&sctx->bios_in_flight) == 0);
3263         scrub_blocked_if_needed(fs_info);
3264
3265         /* FIXME it might be better to start readahead at commit root */
3266         key.objectid = logical;
3267         key.type = BTRFS_EXTENT_ITEM_KEY;
3268         key.offset = (u64)0;
3269         key_end.objectid = logic_end;
3270         key_end.type = BTRFS_METADATA_ITEM_KEY;
3271         key_end.offset = (u64)-1;
3272         reada1 = btrfs_reada_add(root, &key, &key_end);
3273
3274         if (cache->flags & BTRFS_BLOCK_GROUP_DATA) {
3275                 key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3276                 key.type = BTRFS_EXTENT_CSUM_KEY;
3277                 key.offset = logical;
3278                 key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3279                 key_end.type = BTRFS_EXTENT_CSUM_KEY;
3280                 key_end.offset = logic_end;
3281                 reada2 = btrfs_reada_add(csum_root, &key, &key_end);
3282         } else {
3283                 reada2 = NULL;
3284         }
3285
3286         if (!IS_ERR(reada1))
3287                 btrfs_reada_wait(reada1);
3288         if (!IS_ERR_OR_NULL(reada2))
3289                 btrfs_reada_wait(reada2);
3290
3291
3292         /*
3293          * Collect all data csums for the stripe to avoid seeking during
3294          * the scrub. This might currently (crc32) end up being about 1MB.
3295          */
3296         blk_start_plug(&plug);
3297
3298         if (sctx->is_dev_replace &&
3299             btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) {
3300                 mutex_lock(&sctx->wr_lock);
3301                 sctx->write_pointer = physical;
3302                 mutex_unlock(&sctx->wr_lock);
3303                 sctx->flush_all_writes = true;
3304         }
3305
3306         /*
3307          * now find all extents for each stripe and scrub them
3308          */
3309         ret = 0;
3310         while (physical < physical_end) {
3311                 /*
3312                  * canceled?
3313                  */
3314                 if (atomic_read(&fs_info->scrub_cancel_req) ||
3315                     atomic_read(&sctx->cancel_req)) {
3316                         ret = -ECANCELED;
3317                         goto out;
3318                 }
3319                 /*
3320                  * check to see if we have to pause
3321                  */
3322                 if (atomic_read(&fs_info->scrub_pause_req)) {
3323                         /* push queued extents */
3324                         sctx->flush_all_writes = true;
3325                         scrub_submit(sctx);
3326                         mutex_lock(&sctx->wr_lock);
3327                         scrub_wr_submit(sctx);
3328                         mutex_unlock(&sctx->wr_lock);
3329                         wait_event(sctx->list_wait,
3330                                    atomic_read(&sctx->bios_in_flight) == 0);
3331                         sctx->flush_all_writes = false;
3332                         scrub_blocked_if_needed(fs_info);
3333                 }
3334
3335                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3336                         ret = get_raid56_logic_offset(physical, num, map,
3337                                                       &logical,
3338                                                       &stripe_logical);
3339                         logical += base;
3340                         if (ret) {
3341                                 /* it is a parity stripe */
3342                                 stripe_logical += base;
3343                                 stripe_end = stripe_logical + increment;
3344                                 ret = scrub_raid56_parity(sctx, map, scrub_dev,
3345                                                           ppath, stripe_logical,
3346                                                           stripe_end);
3347                                 if (ret)
3348                                         goto out;
3349                                 goto skip;
3350                         }
3351                 }
3352
3353                 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
3354                         key.type = BTRFS_METADATA_ITEM_KEY;
3355                 else
3356                         key.type = BTRFS_EXTENT_ITEM_KEY;
3357                 key.objectid = logical;
3358                 key.offset = (u64)-1;
3359
3360                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3361                 if (ret < 0)
3362                         goto out;
3363
3364                 if (ret > 0) {
3365                         ret = btrfs_previous_extent_item(root, path, 0);
3366                         if (ret < 0)
3367                                 goto out;
3368                         if (ret > 0) {
3369                                 /* there's no smaller item, so stick with the
3370                                  * larger one */
3371                                 btrfs_release_path(path);
3372                                 ret = btrfs_search_slot(NULL, root, &key,
3373                                                         path, 0, 0);
3374                                 if (ret < 0)
3375                                         goto out;
3376                         }
3377                 }
3378
3379                 stop_loop = 0;
3380                 while (1) {
3381                         u64 bytes;
3382
3383                         l = path->nodes[0];
3384                         slot = path->slots[0];
3385                         if (slot >= btrfs_header_nritems(l)) {
3386                                 ret = btrfs_next_leaf(root, path);
3387                                 if (ret == 0)
3388                                         continue;
3389                                 if (ret < 0)
3390                                         goto out;
3391
3392                                 stop_loop = 1;
3393                                 break;
3394                         }
3395                         btrfs_item_key_to_cpu(l, &key, slot);
3396
3397                         if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3398                             key.type != BTRFS_METADATA_ITEM_KEY)
3399                                 goto next;
3400
3401                         if (key.type == BTRFS_METADATA_ITEM_KEY)
3402                                 bytes = fs_info->nodesize;
3403                         else
3404                                 bytes = key.offset;
3405
3406                         if (key.objectid + bytes <= logical)
3407                                 goto next;
3408
3409                         if (key.objectid >= logical + map->stripe_len) {
3410                                 /* out of this device extent */
3411                                 if (key.objectid >= logic_end)
3412                                         stop_loop = 1;
3413                                 break;
3414                         }
3415
3416                         /*
3417                          * If our block group was removed in the meanwhile, just
3418                          * stop scrubbing since there is no point in continuing.
3419                          * Continuing would prevent reusing its device extents
3420                          * for new block groups for a long time.
3421                          */
3422                         spin_lock(&cache->lock);
3423                         if (cache->removed) {
3424                                 spin_unlock(&cache->lock);
3425                                 ret = 0;
3426                                 goto out;
3427                         }
3428                         spin_unlock(&cache->lock);
3429
3430                         extent = btrfs_item_ptr(l, slot,
3431                                                 struct btrfs_extent_item);
3432                         flags = btrfs_extent_flags(l, extent);
3433                         generation = btrfs_extent_generation(l, extent);
3434
3435                         if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3436                             (key.objectid < logical ||
3437                              key.objectid + bytes >
3438                              logical + map->stripe_len)) {
3439                                 btrfs_err(fs_info,
3440                                            "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
3441                                        key.objectid, logical);
3442                                 spin_lock(&sctx->stat_lock);
3443                                 sctx->stat.uncorrectable_errors++;
3444                                 spin_unlock(&sctx->stat_lock);
3445                                 goto next;
3446                         }
3447
3448 again:
3449                         extent_logical = key.objectid;
3450                         ASSERT(bytes <= U32_MAX);
3451                         extent_len = bytes;
3452
3453                         /*
3454                          * trim extent to this stripe
3455                          */
3456                         if (extent_logical < logical) {
3457                                 extent_len -= logical - extent_logical;
3458                                 extent_logical = logical;
3459                         }
3460                         if (extent_logical + extent_len >
3461                             logical + map->stripe_len) {
3462                                 extent_len = logical + map->stripe_len -
3463                                              extent_logical;
3464                         }
3465
3466                         extent_physical = extent_logical - logical + physical;
3467                         extent_dev = scrub_dev;
3468                         extent_mirror_num = mirror_num;
3469                         if (sctx->is_dev_replace)
3470                                 scrub_remap_extent(fs_info, extent_logical,
3471                                                    extent_len, &extent_physical,
3472                                                    &extent_dev,
3473                                                    &extent_mirror_num);
3474
3475                         if (flags & BTRFS_EXTENT_FLAG_DATA) {
3476                                 ret = btrfs_lookup_csums_range(csum_root,
3477                                                 extent_logical,
3478                                                 extent_logical + extent_len - 1,
3479                                                 &sctx->csum_list, 1);
3480                                 if (ret)
3481                                         goto out;
3482                         }
3483
3484                         ret = scrub_extent(sctx, map, extent_logical, extent_len,
3485                                            extent_physical, extent_dev, flags,
3486                                            generation, extent_mirror_num,
3487                                            extent_logical - logical + physical);
3488
3489                         scrub_free_csums(sctx);
3490
3491                         if (ret)
3492                                 goto out;
3493
3494                         if (sctx->is_dev_replace)
3495                                 sync_replace_for_zoned(sctx);
3496
3497                         if (extent_logical + extent_len <
3498                             key.objectid + bytes) {
3499                                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3500                                         /*
3501                                          * loop until we find next data stripe
3502                                          * loop until we find the next data stripe
3503                                          */
3504 loop:
3505                                         physical += map->stripe_len;
3506                                         ret = get_raid56_logic_offset(physical,
3507                                                         num, map, &logical,
3508                                                         &stripe_logical);
3509                                         logical += base;
3510
3511                                         if (ret && physical < physical_end) {
3512                                                 stripe_logical += base;
3513                                                 stripe_end = stripe_logical +
3514                                                                 increment;
3515                                                 ret = scrub_raid56_parity(sctx,
3516                                                         map, scrub_dev, ppath,
3517                                                         stripe_logical,
3518                                                         stripe_end);
3519                                                 if (ret)
3520                                                         goto out;
3521                                                 goto loop;
3522                                         }
3523                                 } else {
3524                                         physical += map->stripe_len;
3525                                         logical += increment;
3526                                 }
3527                                 if (logical < key.objectid + bytes) {
3528                                         cond_resched();
3529                                         goto again;
3530                                 }
3531
3532                                 if (physical >= physical_end) {
3533                                         stop_loop = 1;
3534                                         break;
3535                                 }
3536                         }
3537 next:
3538                         path->slots[0]++;
3539                 }
3540                 btrfs_release_path(path);
3541 skip:
3542                 logical += increment;
3543                 physical += map->stripe_len;
3544                 spin_lock(&sctx->stat_lock);
3545                 if (stop_loop)
3546                         sctx->stat.last_physical = map->stripes[num].physical +
3547                                                    length;
3548                 else
3549                         sctx->stat.last_physical = physical;
3550                 spin_unlock(&sctx->stat_lock);
3551                 if (stop_loop)
3552                         break;
3553         }
3554 out:
3555         /* push queued extents */
3556         scrub_submit(sctx);
3557         mutex_lock(&sctx->wr_lock);
3558         scrub_wr_submit(sctx);
3559         mutex_unlock(&sctx->wr_lock);
3560
3561         blk_finish_plug(&plug);
3562         btrfs_free_path(path);
3563         btrfs_free_path(ppath);
3564
3565         if (sctx->is_dev_replace && ret >= 0) {
3566                 int ret2;
3567
3568                 ret2 = sync_write_pointer_for_zoned(sctx, base + offset,
3569                                                     map->stripes[num].physical,
3570                                                     physical_end);
3571                 if (ret2)
3572                         ret = ret2;
3573         }
3574
3575         return ret < 0 ? ret : 0;
3576 }
3577
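/*
 * Scrub the portion of a chunk that lives on @scrub_dev.  The chunk map is
 * looked up and every stripe of the map that matches this device and
 * device offset is scrubbed via scrub_stripe().
 */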
3578 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
3579                                           struct btrfs_device *scrub_dev,
3580                                           u64 chunk_offset, u64 length,
3581                                           u64 dev_offset,
3582                                           struct btrfs_block_group *cache)
3583 {
3584         struct btrfs_fs_info *fs_info = sctx->fs_info;
3585         struct extent_map_tree *map_tree = &fs_info->mapping_tree;
3586         struct map_lookup *map;
3587         struct extent_map *em;
3588         int i;
3589         int ret = 0;
3590
3591         read_lock(&map_tree->lock);
3592         em = lookup_extent_mapping(map_tree, chunk_offset, 1);
3593         read_unlock(&map_tree->lock);
3594
3595         if (!em) {
3596                 /*
3597                  * Might have been an unused block group deleted by the cleaner
3598                  * kthread or relocation.
3599                  */
3600                 spin_lock(&cache->lock);
3601                 if (!cache->removed)
3602                         ret = -EINVAL;
3603                 spin_unlock(&cache->lock);
3604
3605                 return ret;
3606         }
3607
3608         map = em->map_lookup;
3609         if (em->start != chunk_offset)
3610                 goto out;
3611
3612         if (em->len < length)
3613                 goto out;
3614
3615         for (i = 0; i < map->num_stripes; ++i) {
3616                 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
3617                     map->stripes[i].physical == dev_offset) {
3618                         ret = scrub_stripe(sctx, map, scrub_dev, i,
3619                                            chunk_offset, length, cache);
3620                         if (ret)
3621                                 goto out;
3622                 }
3623         }
3624 out:
3625         free_extent_map(em);
3626
3627         return ret;
3628 }
3629
3630 static int finish_extent_writes_for_zoned(struct btrfs_root *root,
3631                                           struct btrfs_block_group *cache)
3632 {
3633         struct btrfs_fs_info *fs_info = cache->fs_info;
3634         struct btrfs_trans_handle *trans;
3635
3636         if (!btrfs_is_zoned(fs_info))
3637                 return 0;
3638
3639         btrfs_wait_block_group_reservations(cache);
3640         btrfs_wait_nocow_writers(cache);
3641         btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, cache->length);
3642
3643         trans = btrfs_join_transaction(root);
3644         if (IS_ERR(trans))
3645                 return PTR_ERR(trans);
3646         return btrfs_commit_transaction(trans);
3647 }
3648
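/*
 * Walk all dev extents of @scrub_dev in [start, end), look up the block
 * group backing each one, set it read-only where possible, scrub the chunk
 * and then restore the block group before advancing to the next dev extent.
 */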
3649 static noinline_for_stack
3650 int scrub_enumerate_chunks(struct scrub_ctx *sctx,
3651                            struct btrfs_device *scrub_dev, u64 start, u64 end)
3652 {
3653         struct btrfs_dev_extent *dev_extent = NULL;
3654         struct btrfs_path *path;
3655         struct btrfs_fs_info *fs_info = sctx->fs_info;
3656         struct btrfs_root *root = fs_info->dev_root;
3657         u64 length;
3658         u64 chunk_offset;
3659         int ret = 0;
3660         int ro_set;
3661         int slot;
3662         struct extent_buffer *l;
3663         struct btrfs_key key;
3664         struct btrfs_key found_key;
3665         struct btrfs_block_group *cache;
3666         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
3667
3668         path = btrfs_alloc_path();
3669         if (!path)
3670                 return -ENOMEM;
3671
3672         path->reada = READA_FORWARD;
3673         path->search_commit_root = 1;
3674         path->skip_locking = 1;
3675
3676         key.objectid = scrub_dev->devid;
3677         key.offset = 0ull;
3678         key.type = BTRFS_DEV_EXTENT_KEY;
3679
3680         while (1) {
3681                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3682                 if (ret < 0)
3683                         break;
3684                 if (ret > 0) {
3685                         if (path->slots[0] >=
3686                             btrfs_header_nritems(path->nodes[0])) {
3687                                 ret = btrfs_next_leaf(root, path);
3688                                 if (ret < 0)
3689                                         break;
3690                                 if (ret > 0) {
3691                                         ret = 0;
3692                                         break;
3693                                 }
3694                         } else {
3695                                 ret = 0;
3696                         }
3697                 }
3698
3699                 l = path->nodes[0];
3700                 slot = path->slots[0];
3701
3702                 btrfs_item_key_to_cpu(l, &found_key, slot);
3703
3704                 if (found_key.objectid != scrub_dev->devid)
3705                         break;
3706
3707                 if (found_key.type != BTRFS_DEV_EXTENT_KEY)
3708                         break;
3709
3710                 if (found_key.offset >= end)
3711                         break;
3712
3713                 if (found_key.offset < key.offset)
3714                         break;
3715
3716                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3717                 length = btrfs_dev_extent_length(l, dev_extent);
3718
3719                 if (found_key.offset + length <= start)
3720                         goto skip;
3721
3722                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3723
3724                 /*
3725                  * get a reference on the corresponding block group to prevent
3726                  * the chunk from going away while we scrub it
3727                  */
3728                 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3729
3730                 /* some chunks are removed but not committed to disk yet,
3731                  * continue scrubbing */
3732                 if (!cache)
3733                         goto skip;
3734
3735                 if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
3736                         spin_lock(&cache->lock);
3737                         if (!cache->to_copy) {
3738                                 spin_unlock(&cache->lock);
3739                                 btrfs_put_block_group(cache);
3740                                 goto skip;
3741                         }
3742                         spin_unlock(&cache->lock);
3743                 }
3744
3745                 /*
3746                  * Make sure that while we are scrubbing the corresponding block
3747                  * group doesn't get its logical address and its device extents
3748                  * reused for another block group, which can possibly be of a
3749                  * different type and different profile. We do this to prevent
3750                  * false error detections and crashes due to bogus attempts to
3751                  * repair extents.
3752                  */
3753                 spin_lock(&cache->lock);
3754                 if (cache->removed) {
3755                         spin_unlock(&cache->lock);
3756                         btrfs_put_block_group(cache);
3757                         goto skip;
3758                 }
3759                 btrfs_freeze_block_group(cache);
3760                 spin_unlock(&cache->lock);
3761
3762                 /*
3763                  * we need to call btrfs_inc_block_group_ro() with scrubs_paused,
3764                  * to avoid deadlock caused by:
3765                  * btrfs_inc_block_group_ro()
3766                  * -> btrfs_wait_for_commit()
3767                  * -> btrfs_commit_transaction()
3768                  * -> btrfs_scrub_pause()
3769                  */
3770                 scrub_pause_on(fs_info);
3771
3772                 /*
3773                  * Don't do chunk preallocation for scrub.
3774                  *
3775                  * This is especially important for SYSTEM bgs, or we can hit
3776                  * -EFBIG from btrfs_finish_chunk_alloc() like:
3777                  * 1. The only SYSTEM bg is marked RO.
3778                  *    Since SYSTEM bg is small, that's pretty common.
3779                  * 2. New SYSTEM bg will be allocated
3780                  *    Because the regular path will allocate a new chunk.
3781                  * 3. New SYSTEM bg is empty and will get cleaned up
3782                  *    Before cleanup really happens, it's marked RO again.
3783                  * 4. Empty SYSTEM bg gets scrubbed
3784                  *    We go back to 2.
3785                  *
3786                  * This can easily boost the amount of SYSTEM chunks if the
3787                  * cleaner thread can't be triggered fast enough, and use up
3788                  * all the space of btrfs_super_block::sys_chunk_array.
3789                  *
3790                  * While for dev replace, we need to try our best to mark block
3791                  * group RO, to prevent race between:
3792                  * - Write duplication
3793                  *   Contains latest data
3794                  * - Scrub copy
3795                  *   Contains data from commit tree
3796                  *
3797                  * If target block group is not marked RO, nocow writes can
3798                  * be overwritten by scrub copy, causing data corruption.
3799                  * So for dev-replace, it's not allowed to continue if a block
3800                  * group is not RO.
3801                  */
3802                 ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
3803                 if (!ret && sctx->is_dev_replace) {
3804                         ret = finish_extent_writes_for_zoned(root, cache);
3805                         if (ret) {
3806                                 btrfs_dec_block_group_ro(cache);
3807                                 scrub_pause_off(fs_info);
3808                                 btrfs_put_block_group(cache);
3809                                 break;
3810                         }
3811                 }
3812
3813                 if (ret == 0) {
3814                         ro_set = 1;
3815                 } else if (ret == -ENOSPC && !sctx->is_dev_replace) {
3816                         /*
3817                          * btrfs_inc_block_group_ro() returns -ENOSPC when it
3818                          * fails to create a new chunk for metadata.
3819                          * This is not a problem for scrub, because
3820                          * metadata is always COWed, and our scrub pauses
3821                          * transaction commits.
3822                          */
3823                         ro_set = 0;
3824                 } else if (ret == -ETXTBSY) {
3825                         btrfs_warn(fs_info,
3826                    "skipping scrub of block group %llu due to active swapfile",
3827                                    cache->start);
3828                         scrub_pause_off(fs_info);
3829                         ret = 0;
3830                         goto skip_unfreeze;
3831                 } else {
3832                         btrfs_warn(fs_info,
3833                                    "failed setting block group ro: %d", ret);
3834                         btrfs_unfreeze_block_group(cache);
3835                         btrfs_put_block_group(cache);
3836                         scrub_pause_off(fs_info);
3837                         break;
3838                 }
3839
3840                 /*
3841                  * Now the target block is marked RO, wait for nocow writes to
3842                  * finish before dev-replace.
3843                  * COW is fine, as COW never overwrites extents in commit tree.
3844                  */
3845                 if (sctx->is_dev_replace) {
3846                         btrfs_wait_nocow_writers(cache);
3847                         btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
3848                                         cache->length);
3849                 }
3850
3851                 scrub_pause_off(fs_info);
3852                 down_write(&dev_replace->rwsem);
3853                 dev_replace->cursor_right = found_key.offset + length;
3854                 dev_replace->cursor_left = found_key.offset;
3855                 dev_replace->item_needs_writeback = 1;
3856                 up_write(&dev_replace->rwsem);
3857
3858                 ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
3859                                   found_key.offset, cache);
3860
3861                 /*
3862                  * flush, submit all pending read and write bios, afterwards
3863                  * wait for them.
3864                  * Note that in the dev replace case, a read request causes
3865                  * write requests that are submitted in the read completion
3866                  * worker. Therefore in the current situation, it is required
3867                  * that all write requests are flushed, so that all read and
3868                  * write requests are really completed when bios_in_flight
3869                  * changes to 0.
3870                  */
3871                 sctx->flush_all_writes = true;
3872                 scrub_submit(sctx);
3873                 mutex_lock(&sctx->wr_lock);
3874                 scrub_wr_submit(sctx);
3875                 mutex_unlock(&sctx->wr_lock);
3876
3877                 wait_event(sctx->list_wait,
3878                            atomic_read(&sctx->bios_in_flight) == 0);
3879
3880                 scrub_pause_on(fs_info);
3881
3882                 /*
3883                  * This must be done before we decrease @scrub_paused.
3884                  * Make sure we don't block transaction commit while
3885                  * we are waiting for pending workers to finish.
3886                  */
3887                 wait_event(sctx->list_wait,
3888                            atomic_read(&sctx->workers_pending) == 0);
3889                 sctx->flush_all_writes = false;
3890
3891                 scrub_pause_off(fs_info);
3892
3893                 if (sctx->is_dev_replace &&
3894                     !btrfs_finish_block_group_to_copy(dev_replace->srcdev,
3895                                                       cache, found_key.offset))
3896                         ro_set = 0;
3897
3898                 down_write(&dev_replace->rwsem);
3899                 dev_replace->cursor_left = dev_replace->cursor_right;
3900                 dev_replace->item_needs_writeback = 1;
3901                 up_write(&dev_replace->rwsem);
3902
3903                 if (ro_set)
3904                         btrfs_dec_block_group_ro(cache);
3905
3906                 /*
3907                  * We might have prevented the cleaner kthread from deleting
3908                  * this block group if it was already unused because we raced
3909                  * and set it to RO mode first. So add it back to the unused
3910                  * list, otherwise it might not ever be deleted unless a manual
3911                  * balance is triggered or it becomes used and unused again.
3912                  */
3913                 spin_lock(&cache->lock);
3914                 if (!cache->removed && !cache->ro && cache->reserved == 0 &&
3915                     cache->used == 0) {
3916                         spin_unlock(&cache->lock);
3917                         if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
3918                                 btrfs_discard_queue_work(&fs_info->discard_ctl,
3919                                                          cache);
3920                         else
3921                                 btrfs_mark_bg_unused(cache);
3922                 } else {
3923                         spin_unlock(&cache->lock);
3924                 }
3925 skip_unfreeze:
3926                 btrfs_unfreeze_block_group(cache);
3927                 btrfs_put_block_group(cache);
3928                 if (ret)
3929                         break;
3930                 if (sctx->is_dev_replace &&
3931                     atomic64_read(&dev_replace->num_write_errors) > 0) {
3932                         ret = -EIO;
3933                         break;
3934                 }
3935                 if (sctx->stat.malloc_errors > 0) {
3936                         ret = -ENOMEM;
3937                         break;
3938                 }
3939 skip:
3940                 key.offset = found_key.offset + length;
3941                 btrfs_release_path(path);
3942         }
3943
3944         btrfs_free_path(path);
3945
3946         return ret;
3947 }
3948
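/*
 * Scrub all superblock mirrors that fit on the device.  Each copy is
 * submitted through the normal scrub_pages() path with
 * BTRFS_EXTENT_FLAG_SUPER and the expected generation.
 */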
3949 static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
3950                                            struct btrfs_device *scrub_dev)
3951 {
3952         int     i;
3953         u64     bytenr;
3954         u64     gen;
3955         int     ret;
3956         struct btrfs_fs_info *fs_info = sctx->fs_info;
3957
3958         if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
3959                 return -EROFS;
3960
3961         /* Seed devices of a new filesystem have their own generation. */
3962         if (scrub_dev->fs_devices != fs_info->fs_devices)
3963                 gen = scrub_dev->generation;
3964         else
3965                 gen = fs_info->last_trans_committed;
3966
3967         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
3968                 bytenr = btrfs_sb_offset(i);
3969                 if (bytenr + BTRFS_SUPER_INFO_SIZE >
3970                     scrub_dev->commit_total_bytes)
3971                         break;
3972                 if (!btrfs_check_super_location(scrub_dev, bytenr))
3973                         continue;
3974
3975                 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
3976                                   scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
3977                                   NULL, bytenr);
3978                 if (ret)
3979                         return ret;
3980         }
3981         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3982
3983         return 0;
3984 }
3985
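/*
 * Drop one reference on the scrub workqueues.  The last reference to go
 * away detaches the workqueues from fs_info under scrub_lock and destroys
 * them outside the lock.
 */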
3986 static void scrub_workers_put(struct btrfs_fs_info *fs_info)
3987 {
3988         if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
3989                                         &fs_info->scrub_lock)) {
3990                 struct btrfs_workqueue *scrub_workers = NULL;
3991                 struct btrfs_workqueue *scrub_wr_comp = NULL;
3992                 struct btrfs_workqueue *scrub_parity = NULL;
3993
3994                 scrub_workers = fs_info->scrub_workers;
3995                 scrub_wr_comp = fs_info->scrub_wr_completion_workers;
3996                 scrub_parity = fs_info->scrub_parity_workers;
3997
3998                 fs_info->scrub_workers = NULL;
3999                 fs_info->scrub_wr_completion_workers = NULL;
4000                 fs_info->scrub_parity_workers = NULL;
4001                 mutex_unlock(&fs_info->scrub_lock);
4002
4003                 btrfs_destroy_workqueue(scrub_workers);
4004                 btrfs_destroy_workqueue(scrub_wr_comp);
4005                 btrfs_destroy_workqueue(scrub_parity);
4006         }
4007 }
4008
4009 /*
4010  * Get a reference count on fs_info->scrub_workers. Start the workers if necessary.
4011  */
4012 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
4013                                                 int is_dev_replace)
4014 {
4015         struct btrfs_workqueue *scrub_workers = NULL;
4016         struct btrfs_workqueue *scrub_wr_comp = NULL;
4017         struct btrfs_workqueue *scrub_parity = NULL;
4018         unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
4019         int max_active = fs_info->thread_pool_size;
4020         int ret = -ENOMEM;
4021
4022         if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
4023                 return 0;
4024
4025         scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub", flags,
4026                                               is_dev_replace ? 1 : max_active, 4);
4027         if (!scrub_workers)
4028                 goto fail_scrub_workers;
4029
4030         scrub_wr_comp = btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
4031                                               max_active, 2);
4032         if (!scrub_wr_comp)
4033                 goto fail_scrub_wr_completion_workers;
4034
4035         scrub_parity = btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
4036                                              max_active, 2);
4037         if (!scrub_parity)
4038                 goto fail_scrub_parity_workers;
4039
4040         mutex_lock(&fs_info->scrub_lock);
4041         if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
4042                 ASSERT(fs_info->scrub_workers == NULL &&
4043                        fs_info->scrub_wr_completion_workers == NULL &&
4044                        fs_info->scrub_parity_workers == NULL);
4045                 fs_info->scrub_workers = scrub_workers;
4046                 fs_info->scrub_wr_completion_workers = scrub_wr_comp;
4047                 fs_info->scrub_parity_workers = scrub_parity;
4048                 refcount_set(&fs_info->scrub_workers_refcnt, 1);
4049                 mutex_unlock(&fs_info->scrub_lock);
4050                 return 0;
4051         }
4052         /* Another thread raced in and created the workers for us */
4053         refcount_inc(&fs_info->scrub_workers_refcnt);
4054         mutex_unlock(&fs_info->scrub_lock);
4055
4056         ret = 0;
4057         btrfs_destroy_workqueue(scrub_parity);
4058 fail_scrub_parity_workers:
4059         btrfs_destroy_workqueue(scrub_wr_comp);
4060 fail_scrub_wr_completion_workers:
4061         btrfs_destroy_workqueue(scrub_workers);
4062 fail_scrub_workers:
4063         return ret;
4064 }
4065
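/*
 * Scrub one device in the chunk range [@start, @end], either as a regular
 * scrub or on behalf of dev-replace. On return the accumulated statistics
 * are copied to @progress if it is provided.
 */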
4066 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
4067                     u64 end, struct btrfs_scrub_progress *progress,
4068                     int readonly, int is_dev_replace)
4069 {
4070         struct btrfs_dev_lookup_args args = { .devid = devid };
4071         struct scrub_ctx *sctx;
4072         int ret;
4073         struct btrfs_device *dev;
4074         unsigned int nofs_flag;
4075         bool need_commit = false;
4076
4077         if (btrfs_fs_closing(fs_info))
4078                 return -EAGAIN;
4079
4080         if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
4081                 /*
4082                  * In this case scrub is unable to calculate the checksum
4083                  * the way it is implemented. Do not handle this situation
4084                  * at all because it won't ever happen.
4085                  */
4086                 btrfs_err(fs_info,
4087                            "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
4088                        fs_info->nodesize,
4089                        BTRFS_STRIPE_LEN);
4090                 return -EINVAL;
4091         }
4092
4093         if (fs_info->nodesize >
4094             PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
4095             fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
4096                 /*
4097                  * Would exhaust the array bounds of the pagev member in
4098                  * struct scrub_block.
4099                  */
4100                 btrfs_err(fs_info,
4101                           "scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
4102                        fs_info->nodesize,
4103                        SCRUB_MAX_PAGES_PER_BLOCK,
4104                        fs_info->sectorsize,
4105                        SCRUB_MAX_PAGES_PER_BLOCK);
4106                 return -EINVAL;
4107         }
4108
4109         /* Allocate outside of device_list_mutex */
4110         sctx = scrub_setup_ctx(fs_info, is_dev_replace);
4111         if (IS_ERR(sctx))
4112                 return PTR_ERR(sctx);
4113
4114         ret = scrub_workers_get(fs_info, is_dev_replace);
4115         if (ret)
4116                 goto out_free_ctx;
4117
4118         mutex_lock(&fs_info->fs_devices->device_list_mutex);
4119         dev = btrfs_find_device(fs_info->fs_devices, &args);
4120         if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
4121                      !is_dev_replace)) {
4122                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4123                 ret = -ENODEV;
4124                 goto out;
4125         }
4126
4127         if (!is_dev_replace && !readonly &&
4128             !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
4129                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4130                 btrfs_err_in_rcu(fs_info,
4131                         "scrub on devid %llu: filesystem on %s is not writable",
4132                                  devid, rcu_str_deref(dev->name));
4133                 ret = -EROFS;
4134                 goto out;
4135         }
4136
4137         mutex_lock(&fs_info->scrub_lock);
4138         if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4139             test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
4140                 mutex_unlock(&fs_info->scrub_lock);
4141                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4142                 ret = -EIO;
4143                 goto out;
4144         }
4145
4146         down_read(&fs_info->dev_replace.rwsem);
4147         if (dev->scrub_ctx ||
4148             (!is_dev_replace &&
4149              btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
4150                 up_read(&fs_info->dev_replace.rwsem);
4151                 mutex_unlock(&fs_info->scrub_lock);
4152                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4153                 ret = -EINPROGRESS;
4154                 goto out;
4155         }
4156         up_read(&fs_info->dev_replace.rwsem);
4157
4158         sctx->readonly = readonly;
4159         dev->scrub_ctx = sctx;
4160         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4161
4162         /*
4163          * By checking @scrub_pause_req here, we can avoid a race
4164          * between committing a transaction and scrubbing.
4165          */
4166         __scrub_blocked_if_needed(fs_info);
4167         atomic_inc(&fs_info->scrubs_running);
4168         mutex_unlock(&fs_info->scrub_lock);
4169
4170         /*
4171          * In order to avoid deadlock with reclaim when there is a transaction
4172          * trying to pause scrub, make sure we use GFP_NOFS for all the
4173          * allocations done at scrub_pages() and scrub_pages_for_parity()
4174          * invoked by our callees. The pausing request is done when the
4175          * transaction commit starts, and it blocks the transaction until scrub
4176          * is paused (done at specific points in scrub_stripe() or right above,
4177          * before incrementing fs_info->scrubs_running).
4178          */
4179         nofs_flag = memalloc_nofs_save();
4180         if (!is_dev_replace) {
4181                 u64 old_super_errors;
4182
4183                 spin_lock(&sctx->stat_lock);
4184                 old_super_errors = sctx->stat.super_errors;
4185                 spin_unlock(&sctx->stat_lock);
4186
4187                 btrfs_info(fs_info, "scrub: started on devid %llu", devid);
4188                 /*
4189                  * By holding the device list mutex, we avoid racing with
4190                  * super block writes kicked off by a log tree sync.
4191                  */
4192                 mutex_lock(&fs_info->fs_devices->device_list_mutex);
4193                 ret = scrub_supers(sctx, dev);
4194                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4195
4196                 spin_lock(&sctx->stat_lock);
4197                 /*
4198                  * Super block errors were found, but we cannot commit a
4199                  * transaction in the current context, since btrfs_commit_transaction()
4200                  * needs to pause the currently running scrub (held by ourselves).
4201                  */
4202                 if (sctx->stat.super_errors > old_super_errors && !sctx->readonly)
4203                         need_commit = true;
4204                 spin_unlock(&sctx->stat_lock);
4205         }
4206
4207         if (!ret)
4208                 ret = scrub_enumerate_chunks(sctx, dev, start, end);
4209         memalloc_nofs_restore(nofs_flag);
4210
4211         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
4212         atomic_dec(&fs_info->scrubs_running);
4213         wake_up(&fs_info->scrub_pause_wait);
4214
4215         wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
4216
4217         if (progress)
4218                 memcpy(progress, &sctx->stat, sizeof(*progress));
4219
4220         if (!is_dev_replace)
4221                 btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
4222                         ret ? "not finished" : "finished", devid, ret);
4223
4224         mutex_lock(&fs_info->scrub_lock);
4225         dev->scrub_ctx = NULL;
4226         mutex_unlock(&fs_info->scrub_lock);
4227
4228         scrub_workers_put(fs_info);
4229         scrub_put_ctx(sctx);
4230
4231         /*
4232          * We found some super block errors before; now that scrub has
4233          * finished, try to force a transaction commit.
4234          */
4235         if (need_commit) {
4236                 struct btrfs_trans_handle *trans;
4237
4238                 trans = btrfs_start_transaction(fs_info->tree_root, 0);
4239                 if (IS_ERR(trans)) {
4240                         ret = PTR_ERR(trans);
4241                         btrfs_err(fs_info,
4242         "scrub: failed to start transaction to fix super block errors: %d", ret);
4243                         return ret;
4244                 }
4245                 ret = btrfs_commit_transaction(trans);
4246                 if (ret < 0)
4247                         btrfs_err(fs_info,
4248         "scrub: failed to commit transaction to fix super block errors: %d", ret);
4249         }
4250         return ret;
4251 out:
4252         scrub_workers_put(fs_info);
4253 out_free_ctx:
4254         scrub_free_ctx(sctx);
4255
4256         return ret;
4257 }
4258
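/*
 * Ask all running scrubs to pause and wait until every one of them has
 * reached its pause point.
 */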
4259 void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
4260 {
4261         mutex_lock(&fs_info->scrub_lock);
4262         atomic_inc(&fs_info->scrub_pause_req);
4263         while (atomic_read(&fs_info->scrubs_paused) !=
4264                atomic_read(&fs_info->scrubs_running)) {
4265                 mutex_unlock(&fs_info->scrub_lock);
4266                 wait_event(fs_info->scrub_pause_wait,
4267                            atomic_read(&fs_info->scrubs_paused) ==
4268                            atomic_read(&fs_info->scrubs_running));
4269                 mutex_lock(&fs_info->scrub_lock);
4270         }
4271         mutex_unlock(&fs_info->scrub_lock);
4272 }
4273
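/* Allow previously paused scrubs to continue. */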
4274 void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
4275 {
4276         atomic_dec(&fs_info->scrub_pause_req);
4277         wake_up(&fs_info->scrub_pause_wait);
4278 }
4279
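/*
 * Cancel all running scrubs and wait until they have stopped. Returns
 * -ENOTCONN if no scrub was running.
 */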
4280 int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
4281 {
4282         mutex_lock(&fs_info->scrub_lock);
4283         if (!atomic_read(&fs_info->scrubs_running)) {
4284                 mutex_unlock(&fs_info->scrub_lock);
4285                 return -ENOTCONN;
4286         }
4287
4288         atomic_inc(&fs_info->scrub_cancel_req);
4289         while (atomic_read(&fs_info->scrubs_running)) {
4290                 mutex_unlock(&fs_info->scrub_lock);
4291                 wait_event(fs_info->scrub_pause_wait,
4292                            atomic_read(&fs_info->scrubs_running) == 0);
4293                 mutex_lock(&fs_info->scrub_lock);
4294         }
4295         atomic_dec(&fs_info->scrub_cancel_req);
4296         mutex_unlock(&fs_info->scrub_lock);
4297
4298         return 0;
4299 }
4300
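/*
 * Cancel the scrub running on @dev and wait until its context is torn down.
 * Returns -ENOTCONN if no scrub is running on that device.
 */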
4301 int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
4302 {
4303         struct btrfs_fs_info *fs_info = dev->fs_info;
4304         struct scrub_ctx *sctx;
4305
4306         mutex_lock(&fs_info->scrub_lock);
4307         sctx = dev->scrub_ctx;
4308         if (!sctx) {
4309                 mutex_unlock(&fs_info->scrub_lock);
4310                 return -ENOTCONN;
4311         }
4312         atomic_inc(&sctx->cancel_req);
4313         while (dev->scrub_ctx) {
4314                 mutex_unlock(&fs_info->scrub_lock);
4315                 wait_event(fs_info->scrub_pause_wait,
4316                            dev->scrub_ctx == NULL);
4317                 mutex_lock(&fs_info->scrub_lock);
4318         }
4319         mutex_unlock(&fs_info->scrub_lock);
4320
4321         return 0;
4322 }
4323
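/*
 * Copy the current scrub statistics for @devid into @progress. Returns
 * -ENODEV if the device cannot be found and -ENOTCONN if no scrub is running
 * on it.
 */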
4324 int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
4325                          struct btrfs_scrub_progress *progress)
4326 {
4327         struct btrfs_dev_lookup_args args = { .devid = devid };
4328         struct btrfs_device *dev;
4329         struct scrub_ctx *sctx = NULL;
4330
4331         mutex_lock(&fs_info->fs_devices->device_list_mutex);
4332         dev = btrfs_find_device(fs_info->fs_devices, &args);
4333         if (dev)
4334                 sctx = dev->scrub_ctx;
4335         if (sctx)
4336                 memcpy(progress, &sctx->stat, sizeof(*progress));
4337         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4338
4339         return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
4340 }
4341
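/*
 * Map @extent_logical to its physical address, device and mirror number using
 * the first stripe returned by btrfs_map_block(). On any mapping failure the
 * output parameters are left untouched.
 */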
4342 static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
4343                                u64 extent_logical, u32 extent_len,
4344                                u64 *extent_physical,
4345                                struct btrfs_device **extent_dev,
4346                                int *extent_mirror_num)
4347 {
4348         u64 mapped_length;
4349         struct btrfs_io_context *bioc = NULL;
4350         int ret;
4351
4352         mapped_length = extent_len;
4353         ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
4354                               &mapped_length, &bioc, 0);
4355         if (ret || !bioc || mapped_length < extent_len ||
4356             !bioc->stripes[0].dev->bdev) {
4357                 btrfs_put_bioc(bioc);
4358                 return;
4359         }
4360
4361         *extent_physical = bioc->stripes[0].physical;
4362         *extent_mirror_num = bioc->mirror_num;
4363         *extent_dev = bioc->stripes[0].dev;
4364         btrfs_put_bioc(bioc);
4365 }