// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2012 Fusion-io  All rights reserved.
 * Copyright (C) 2012 Intel Corp. All rights reserved.
 */
#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/raid/pq.h>
#include <linux/hash.h>
#include <linux/list_sort.h>
#include <linux/raid/xor.h>
#include <linux/mm.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
/* set when additional merges to this rbio are not allowed */
#define RBIO_RMW_LOCKED_BIT	1

/*
 * set when this rbio is sitting in the hash, but it is just a cache
 * of past RMW
 */
#define RBIO_CACHE_BIT		2

/*
 * set when it is safe to trust the stripe_pages for caching
 */
#define RBIO_CACHE_READY_BIT	3

#define RBIO_CACHE_SIZE 1024

#define BTRFS_STRIPE_HASH_TABLE_BITS	11
/* Used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash {
	struct list_head hash_list;
	spinlock_t lock;
};

/*
 * Used by the raid56 code to lock stripes for read/modify/write.  Also
 * holds the LRU list and size counter for the stripe cache.
 */
struct btrfs_stripe_hash_table {
	struct list_head stripe_cache;
	spinlock_t cache_lock;
	int cache_size;
	struct btrfs_stripe_hash table[];
};
/*
 * A bvec like structure to represent a sector inside a page.
 *
 * Unlike bvec we don't need bvlen, as it's fixed to sectorsize.
 */
struct sector_ptr {
	struct page *page;
	unsigned int pgoff:24;
	unsigned int uptodate:8;
};
static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
static void rmw_work(struct work_struct *work);
static void read_rebuild_work(struct work_struct *work);
static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
static void __free_raid_bio(struct btrfs_raid_bio *rbio);
static void index_rbio_pages(struct btrfs_raid_bio *rbio);
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);

static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
					 int need_check);
static void scrub_parity_work(struct work_struct *work);
static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func)
{
	INIT_WORK(&rbio->work, work_func);
	queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
}
/*
 * the stripe hash table is used for locking, and to collect
 * bios in hopes of making a full stripe
 */
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash_table *x;
	struct btrfs_stripe_hash *cur;
	struct btrfs_stripe_hash *h;
	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
	int i;

	if (info->stripe_hash_table)
		return 0;

	/*
	 * The table is large, starting with order 4 and can go as high as
	 * order 7 in case lock debugging is turned on.
	 *
	 * Try harder to allocate and fallback to vmalloc to lower the chance
	 * of a failing mount.
	 */
	table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
	if (!table)
		return -ENOMEM;

	spin_lock_init(&table->cache_lock);
	INIT_LIST_HEAD(&table->stripe_cache);

	h = table->table;

	for (i = 0; i < num_entries; i++) {
		cur = h + i;
		INIT_LIST_HEAD(&cur->hash_list);
		spin_lock_init(&cur->lock);
	}

	x = cmpxchg(&info->stripe_hash_table, NULL, table);
	kvfree(x);
	return 0;
}
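
/*
 * Size note: with BTRFS_STRIPE_HASH_TABLE_BITS == 11 the table above has
 * 2048 buckets, each a list head plus a spinlock, which is where the
 * order 4 (or order 7 with lock debugging) allocation mentioned in the
 * comment comes from.
 */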
/*
 * caching an rbio means to copy anything from the
 * bio_sectors array into the stripe_pages array.  We
 * use the page uptodate bit in the stripe cache array
 * to indicate if it has valid data
 *
 * once the caching is done, we set the cache ready
 * bit.
 */
static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	int ret;

	ret = alloc_rbio_pages(rbio);
	if (ret)
		return;

	for (i = 0; i < rbio->nr_sectors; i++) {
		/* Some range not covered by bio (partial write), skip it */
		if (!rbio->bio_sectors[i].page)
			continue;

		ASSERT(rbio->stripe_sectors[i].page);
		memcpy_page(rbio->stripe_sectors[i].page,
			    rbio->stripe_sectors[i].pgoff,
			    rbio->bio_sectors[i].page,
			    rbio->bio_sectors[i].pgoff,
			    rbio->bioc->fs_info->sectorsize);
		rbio->stripe_sectors[i].uptodate = 1;
	}
	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
}
/*
 * we hash on the first logical address of the stripe
 */
static int rbio_bucket(struct btrfs_raid_bio *rbio)
{
	u64 num = rbio->bioc->raid_map[0];

	/*
	 * we shift down quite a bit.  We're using byte
	 * addressing, and most of the lower bits are zeros.
	 * This tends to upset hash_64, and it consistently
	 * returns just one or two different values.
	 *
	 * shifting off the lower bits fixes things.
	 */
	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
}
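
/*
 * Example: full stripes start nr_data * BTRFS_STRIPE_LEN bytes apart, so
 * after the >> 16 shift (dropping the low 64K of zeros) consecutive full
 * stripes feed hash_64() values that differ by nr_data instead of all
 * sharing the same low bits.
 */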
static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio,
				       unsigned int page_nr)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	const u32 sectors_per_page = PAGE_SIZE / sectorsize;
	int i;

	ASSERT(page_nr < rbio->nr_pages);

	for (i = sectors_per_page * page_nr;
	     i < sectors_per_page * page_nr + sectors_per_page;
	     i++) {
		if (!rbio->stripe_sectors[i].uptodate)
			return false;
	}
	return true;
}
/*
 * Update the stripe_sectors[] array to use correct page and pgoff
 *
 * Should be called every time any page pointer in stripe_pages[] got modified.
 */
static void index_stripe_sectors(struct btrfs_raid_bio *rbio)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	u32 offset;
	int i;

	for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) {
		int page_index = offset >> PAGE_SHIFT;

		ASSERT(page_index < rbio->nr_pages);
		rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index];
		rbio->stripe_sectors[i].pgoff = offset_in_page(offset);
	}
}
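
/*
 * Mapping example (hypothetical 2K sectorsize on 4K pages, the subpage
 * case): sectors 0 and 1 both point into stripe_pages[0] at pgoff 0 and
 * 2048, sector 2 starts stripe_pages[1] at pgoff 0, and so on.  With
 * sectorsize == PAGE_SIZE this degenerates to sector i using page i at
 * pgoff 0.
 */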
static void steal_rbio_page(struct btrfs_raid_bio *src,
			    struct btrfs_raid_bio *dest, int page_nr)
{
	const u32 sectorsize = src->bioc->fs_info->sectorsize;
	const u32 sectors_per_page = PAGE_SIZE / sectorsize;
	int i;

	if (dest->stripe_pages[page_nr])
		__free_page(dest->stripe_pages[page_nr]);
	dest->stripe_pages[page_nr] = src->stripe_pages[page_nr];
	src->stripe_pages[page_nr] = NULL;

	/* Also update the sector->uptodate bits. */
	for (i = sectors_per_page * page_nr;
	     i < sectors_per_page * page_nr + sectors_per_page; i++)
		dest->stripe_sectors[i].uptodate = true;
}
/*
 * Stealing an rbio means taking all the uptodate pages from the stripe array
 * in the source rbio and putting them into the destination rbio.
 *
 * This will also update the involved stripe_sectors[] which are referring to
 * the old pages.
 */
static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
{
	int i;
	struct page *s;

	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
		return;

	for (i = 0; i < dest->nr_pages; i++) {
		s = src->stripe_pages[i];
		if (!s || !full_page_sectors_uptodate(src, i))
			continue;

		steal_rbio_page(src, dest, i);
	}
	index_stripe_sectors(dest);
	index_stripe_sectors(src);
}
/*
 * merging means we take the bio_list from the victim and
 * splice it into the destination.  The victim should
 * be discarded afterwards.
 *
 * must be called with dest->bio_list_lock held
 */
static void merge_rbio(struct btrfs_raid_bio *dest,
		       struct btrfs_raid_bio *victim)
{
	bio_list_merge(&dest->bio_list, &victim->bio_list);
	dest->bio_list_bytes += victim->bio_list_bytes;
	/* Also inherit the bitmaps from @victim. */
	bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap,
		  dest->stripe_nsectors);
	dest->generic_bio_cnt += victim->generic_bio_cnt;
	bio_list_init(&victim->bio_list);
}
/*
 * used to prune items that are in the cache.  The caller
 * must hold the hash table lock.
 */
static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	int bucket = rbio_bucket(rbio);
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash *h;
	int freeit = 0;

	/*
	 * check the bit again under the hash table lock.
	 */
	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->bioc->fs_info->stripe_hash_table;
	h = table->table + bucket;

	/* hold the lock for the bucket because we may be
	 * removing it from the hash table
	 */
	spin_lock(&h->lock);

	/*
	 * hold the lock for the bio list because we need
	 * to make sure the bio list is empty
	 */
	spin_lock(&rbio->bio_list_lock);

	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
		list_del_init(&rbio->stripe_cache);
		table->cache_size -= 1;
		freeit = 1;

		/* if the bio list isn't empty, this rbio is
		 * still involved in an IO.  We take it out
		 * of the cache list, and drop the ref that
		 * was held for the list.
		 *
		 * If the bio_list was empty, we also remove
		 * the rbio from the hash_table, and drop
		 * the corresponding ref
		 */
		if (bio_list_empty(&rbio->bio_list)) {
			if (!list_empty(&rbio->hash_list)) {
				list_del_init(&rbio->hash_list);
				refcount_dec(&rbio->refs);
				BUG_ON(!list_empty(&rbio->plug_list));
			}
		}
	}

	spin_unlock(&rbio->bio_list_lock);
	spin_unlock(&h->lock);

	if (freeit)
		__free_raid_bio(rbio);
}
/*
 * prune a given rbio from the cache
 */
static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash_table *table;
	unsigned long flags;

	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->bioc->fs_info->stripe_hash_table;

	spin_lock_irqsave(&table->cache_lock, flags);
	__remove_rbio_from_cache(rbio);
	spin_unlock_irqrestore(&table->cache_lock, flags);
}
/*
 * remove everything in the cache
 */
static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	unsigned long flags;
	struct btrfs_raid_bio *rbio;

	table = info->stripe_hash_table;

	spin_lock_irqsave(&table->cache_lock, flags);
	while (!list_empty(&table->stripe_cache)) {
		rbio = list_entry(table->stripe_cache.next,
				  struct btrfs_raid_bio,
				  stripe_cache);
		__remove_rbio_from_cache(rbio);
	}
	spin_unlock_irqrestore(&table->cache_lock, flags);
}
/*
 * remove all cached entries and free the hash table
 * used by unmount
 */
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
{
	if (!info->stripe_hash_table)
		return;
	btrfs_clear_rbio_cache(info);
	kvfree(info->stripe_hash_table);
	info->stripe_hash_table = NULL;
}
/*
 * insert an rbio into the stripe cache.  It
 * must have already been prepared by calling
 * cache_rbio_pages
 *
 * If this rbio was already cached, it gets
 * moved to the front of the lru.
 *
 * If the size of the rbio cache is too big, we
 * prune an item.
 */
static void cache_rbio(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash_table *table;
	unsigned long flags;

	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
		return;

	table = rbio->bioc->fs_info->stripe_hash_table;

	spin_lock_irqsave(&table->cache_lock, flags);
	spin_lock(&rbio->bio_list_lock);

	/* bump our ref if we were not in the list before */
	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
		refcount_inc(&rbio->refs);

	if (!list_empty(&rbio->stripe_cache)) {
		list_move(&rbio->stripe_cache, &table->stripe_cache);
	} else {
		list_add(&rbio->stripe_cache, &table->stripe_cache);
		table->cache_size += 1;
	}

	spin_unlock(&rbio->bio_list_lock);

	if (table->cache_size > RBIO_CACHE_SIZE) {
		struct btrfs_raid_bio *found;

		found = list_entry(table->stripe_cache.prev,
				   struct btrfs_raid_bio,
				   stripe_cache);

		if (found != rbio)
			__remove_rbio_from_cache(found);
	}

	spin_unlock_irqrestore(&table->cache_lock, flags);
}
/*
 * helper function to run the xor_blocks api.  It is only
 * able to do MAX_XOR_BLOCKS at a time, so we need to
 * loop through.
 */
static void run_xor(void **pages, int src_cnt, ssize_t len)
{
	int src_off = 0;
	int xor_src_cnt = 0;
	void *dest = pages[src_cnt];

	while (src_cnt > 0) {
		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
		xor_blocks(xor_src_cnt, len, dest, pages + src_off);

		src_cnt -= xor_src_cnt;
		src_off += xor_src_cnt;
	}
}
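
/*
 * Example: if MAX_XOR_BLOCKS is 4 and src_cnt == 5, the first
 * xor_blocks() call folds pages[0..3] into dest, the second folds the
 * remaining pages[4], leaving dest ^= p0 ^ p1 ^ p2 ^ p3 ^ p4.
 */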
/*
 * Returns true if the bio list inside this rbio covers an entire stripe (no
 * modifications required).
 */
static int rbio_is_full(struct btrfs_raid_bio *rbio)
{
	unsigned long flags;
	unsigned long size = rbio->bio_list_bytes;
	int ret = 1;

	spin_lock_irqsave(&rbio->bio_list_lock, flags);
	if (size != rbio->nr_data * BTRFS_STRIPE_LEN)
		ret = 0;
	BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN);
	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);

	return ret;
}
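
/*
 * Example: on a 3 data stripe layout a full stripe means
 * 3 * BTRFS_STRIPE_LEN == 192K of bio bytes queued; anything less takes
 * the read/modify/write path.
 */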
/*
 * returns 1 if it is safe to merge two rbios together.
 * The merging is safe if the two rbios correspond to
 * the same stripe and if they are both going in the same
 * direction (read vs write), and if neither one is
 * locked for final IO
 *
 * The caller is responsible for locking such that
 * rmw_locked is safe to test
 */
static int rbio_can_merge(struct btrfs_raid_bio *last,
			  struct btrfs_raid_bio *cur)
{
	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
		return 0;

	/*
	 * we can't merge with cached rbios, since the
	 * idea is that when we merge the destination
	 * rbio is going to run our IO for us.  We can
	 * steal from cached rbios though, other functions
	 * handle that.
	 */
	if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
	    test_bit(RBIO_CACHE_BIT, &cur->flags))
		return 0;

	if (last->bioc->raid_map[0] != cur->bioc->raid_map[0])
		return 0;

	/* we can't merge with different operations */
	if (last->operation != cur->operation)
		return 0;
	/*
	 * Parity scrub needs to read the full stripe from the drive,
	 * check and repair the parity, then write out the new results.
	 *
	 * We're not allowed to add any new bios to the
	 * bio list here, anyone else that wants to
	 * change this stripe needs to do their own rmw.
	 */
	if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
		return 0;

	if (last->operation == BTRFS_RBIO_REBUILD_MISSING)
		return 0;

	if (last->operation == BTRFS_RBIO_READ_REBUILD) {
		int fa = last->faila;
		int fb = last->failb;
		int cur_fa = cur->faila;
		int cur_fb = cur->failb;

		if (last->faila >= last->failb) {
			fa = last->failb;
			fb = last->faila;
		}

		if (cur->faila >= cur->failb) {
			cur_fa = cur->failb;
			cur_fb = cur->faila;
		}

		if (fa != cur_fa || fb != cur_fb)
			return 0;
	}
	return 1;
}
static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio,
					     unsigned int stripe_nr,
					     unsigned int sector_nr)
{
	ASSERT(stripe_nr < rbio->real_stripes);
	ASSERT(sector_nr < rbio->stripe_nsectors);

	return stripe_nr * rbio->stripe_nsectors + sector_nr;
}
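
/*
 * The sector arrays are laid out stripe after stripe, e.g. with
 * stripe_nsectors == 16 the P stripe (stripe_nr == nr_data) starts at
 * index nr_data * 16.
 */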
/* Return a sector from rbio->stripe_sectors, not from the bio list */
static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio,
					     unsigned int stripe_nr,
					     unsigned int sector_nr)
{
	return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr,
							      sector_nr)];
}

/* Grab a sector inside P stripe */
static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio,
					      unsigned int sector_nr)
{
	return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr);
}

/* Grab a sector inside Q stripe, return NULL if not RAID6 */
static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio,
					      unsigned int sector_nr)
{
	if (rbio->nr_data + 1 == rbio->real_stripes)
		return NULL;
	return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr);
}
/*
 * The first stripe in the table for a logical address
 * has the lock.  rbios are added in one of three ways:
 *
 * 1) Nobody has the stripe locked yet.  The rbio is given
 * the lock and 0 is returned.  The caller must start the IO
 * themselves.
 *
 * 2) Someone has the stripe locked, but we're able to merge
 * with the lock owner.  The rbio is freed and the IO will
 * start automatically along with the existing rbio.  1 is returned.
 *
 * 3) Someone has the stripe locked, but we're not able to merge.
 * The rbio is added to the lock owner's plug list, or merged into
 * an rbio already on the plug list.  When the lock owner unlocks,
 * the next rbio on the list is run and the IO is started automatically.
 * 1 is returned.
 *
 * If we return 0, the caller still owns the rbio and must continue with
 * IO submission.  If we return 1, the caller must assume the rbio has
 * already been freed.
 */
static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash *h;
	struct btrfs_raid_bio *cur;
	struct btrfs_raid_bio *pending;
	unsigned long flags;
	struct btrfs_raid_bio *freeit = NULL;
	struct btrfs_raid_bio *cache_drop = NULL;
	int ret = 0;

	h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);

	spin_lock_irqsave(&h->lock, flags);
	list_for_each_entry(cur, &h->hash_list, hash_list) {
		if (cur->bioc->raid_map[0] != rbio->bioc->raid_map[0])
			continue;

		spin_lock(&cur->bio_list_lock);

		/* Can we steal this cached rbio's pages? */
		if (bio_list_empty(&cur->bio_list) &&
		    list_empty(&cur->plug_list) &&
		    test_bit(RBIO_CACHE_BIT, &cur->flags) &&
		    !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
			list_del_init(&cur->hash_list);
			refcount_dec(&cur->refs);

			steal_rbio(cur, rbio);
			cache_drop = cur;
			spin_unlock(&cur->bio_list_lock);

			goto lockit;
		}

		/* Can we merge into the lock owner? */
		if (rbio_can_merge(cur, rbio)) {
			merge_rbio(cur, rbio);
			spin_unlock(&cur->bio_list_lock);
			freeit = rbio;
			ret = 1;
			goto out;
		}

		/*
		 * We couldn't merge with the running rbio, see if we can merge
		 * with the pending ones.  We don't have to check for rmw_locked
		 * because there is no way they are inside finish_rmw right now
		 */
		list_for_each_entry(pending, &cur->plug_list, plug_list) {
			if (rbio_can_merge(pending, rbio)) {
				merge_rbio(pending, rbio);
				spin_unlock(&cur->bio_list_lock);
				freeit = rbio;
				ret = 1;
				goto out;
			}
		}

		/*
		 * No merging, put us on the tail of the plug list, our rbio
		 * will be started when the currently running rbio unlocks
		 */
		list_add_tail(&rbio->plug_list, &cur->plug_list);
		spin_unlock(&cur->bio_list_lock);
		ret = 1;
		goto out;
	}
lockit:
	refcount_inc(&rbio->refs);
	list_add(&rbio->hash_list, &h->hash_list);
out:
	spin_unlock_irqrestore(&h->lock, flags);
	if (cache_drop)
		remove_rbio_from_cache(cache_drop);
	if (freeit)
		__free_raid_bio(freeit);
	return ret;
}
/*
 * called as rmw or parity rebuild is completed.  If the plug list has more
 * rbios waiting for this stripe, the next one on the list will be started
 */
static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
{
	int bucket;
	struct btrfs_stripe_hash *h;
	unsigned long flags;
	int keep_cache = 0;

	bucket = rbio_bucket(rbio);
	h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;

	if (list_empty(&rbio->plug_list))
		cache_rbio(rbio);

	spin_lock_irqsave(&h->lock, flags);
	spin_lock(&rbio->bio_list_lock);

	if (!list_empty(&rbio->hash_list)) {
		/*
		 * if we're still cached and there is no other IO
		 * to perform, just leave this rbio here for others
		 * to steal from later
		 */
		if (list_empty(&rbio->plug_list) &&
		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
			keep_cache = 1;
			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
			BUG_ON(!bio_list_empty(&rbio->bio_list));
			goto done;
		}

		list_del_init(&rbio->hash_list);
		refcount_dec(&rbio->refs);

		/*
		 * we use the plug list to hold all the rbios
		 * waiting for the chance to lock this stripe.
		 * hand the lock over to one of them.
		 */
		if (!list_empty(&rbio->plug_list)) {
			struct btrfs_raid_bio *next;
			struct list_head *head = rbio->plug_list.next;

			next = list_entry(head, struct btrfs_raid_bio,
					  plug_list);

			list_del_init(&rbio->plug_list);

			list_add(&next->hash_list, &h->hash_list);
			refcount_inc(&next->refs);
			spin_unlock(&rbio->bio_list_lock);
			spin_unlock_irqrestore(&h->lock, flags);

			if (next->operation == BTRFS_RBIO_READ_REBUILD)
				start_async_work(next, read_rebuild_work);
			else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
				steal_rbio(rbio, next);
				start_async_work(next, read_rebuild_work);
			} else if (next->operation == BTRFS_RBIO_WRITE) {
				steal_rbio(rbio, next);
				start_async_work(next, rmw_work);
			} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
				steal_rbio(rbio, next);
				start_async_work(next, scrub_parity_work);
			}

			goto done_nolock;
		}
	}
done:
	spin_unlock(&rbio->bio_list_lock);
	spin_unlock_irqrestore(&h->lock, flags);

done_nolock:
	if (!keep_cache)
		remove_rbio_from_cache(rbio);
}
static void __free_raid_bio(struct btrfs_raid_bio *rbio)
{
	int i;

	if (!refcount_dec_and_test(&rbio->refs))
		return;

	WARN_ON(!list_empty(&rbio->stripe_cache));
	WARN_ON(!list_empty(&rbio->hash_list));
	WARN_ON(!bio_list_empty(&rbio->bio_list));

	for (i = 0; i < rbio->nr_pages; i++) {
		if (rbio->stripe_pages[i]) {
			__free_page(rbio->stripe_pages[i]);
			rbio->stripe_pages[i] = NULL;
		}
	}

	btrfs_put_bioc(rbio->bioc);
	kfree(rbio);
}
static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
{
	struct bio *next;

	while (cur) {
		next = cur->bi_next;
		cur->bi_next = NULL;
		cur->bi_status = err;
		bio_endio(cur);
		cur = next;
	}
}
/*
 * this frees the rbio and runs through all the bios in the
 * bio_list and calls end_io on them
 */
static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
{
	struct bio *cur = bio_list_get(&rbio->bio_list);
	struct bio *extra;

	if (rbio->generic_bio_cnt)
		btrfs_bio_counter_sub(rbio->bioc->fs_info, rbio->generic_bio_cnt);
	/*
	 * Clear the data bitmap, as the rbio may be cached for later usage.
	 * Do this before unlock_stripe() so there will be no new bio
	 * for this bio.
	 */
	bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors);

	/*
	 * At this moment, rbio->bio_list is empty, however since rbio does not
	 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
	 * hash list, rbio may be merged with others so that rbio->bio_list
	 * becomes non-empty.
	 * Once unlock_stripe() is done, rbio->bio_list will not be updated any
	 * more and we can call bio_endio() on all queued bios.
	 */
	unlock_stripe(rbio);
	extra = bio_list_get(&rbio->bio_list);
	__free_raid_bio(rbio);

	rbio_endio_bio_list(cur, err);
	if (extra)
		rbio_endio_bio_list(extra, err);
}
/*
 * end io function used by finish_rmw.  When we finally
 * get here, we've written a full stripe
 */
static void raid_write_end_io(struct bio *bio)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;
	blk_status_t err = bio->bi_status;
	int max_errors;

	if (err)
		fail_bio_stripe(rbio, bio);

	bio_put(bio);

	if (!atomic_dec_and_test(&rbio->stripes_pending))
		return;

	err = BLK_STS_OK;

	/* OK, we have written all the stripes we need to. */
	max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
		     0 : rbio->bioc->max_errors;
	if (atomic_read(&rbio->error) > max_errors)
		err = BLK_STS_IOERR;

	rbio_orig_end_io(rbio, err);
}
/*
 * Get a sector pointer specified by its @stripe_nr and @sector_nr.
 *
 * @rbio:               The raid bio
 * @stripe_nr:          Stripe number, valid range [0, real_stripe)
 * @sector_nr:		Sector number inside the stripe,
 *			valid range [0, stripe_nsectors)
 * @bio_list_only:      Whether to use sectors inside the bio list only.
 *
 * The read/modify/write code wants to reuse the original bio page as much
 * as possible, and only use stripe_sectors as fallback.
 */
static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
					 int stripe_nr, int sector_nr,
					 bool bio_list_only)
{
	struct sector_ptr *sector;
	int index;

	ASSERT(stripe_nr >= 0 && stripe_nr < rbio->real_stripes);
	ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);

	index = stripe_nr * rbio->stripe_nsectors + sector_nr;
	ASSERT(index >= 0 && index < rbio->nr_sectors);

	spin_lock_irq(&rbio->bio_list_lock);
	sector = &rbio->bio_sectors[index];
	if (sector->page || bio_list_only) {
		/* Don't return sector without a valid page pointer */
		if (!sector->page)
			sector = NULL;
		spin_unlock_irq(&rbio->bio_list_lock);
		return sector;
	}
	spin_unlock_irq(&rbio->bio_list_lock);

	return &rbio->stripe_sectors[index];
}
/*
 * allocation and initial setup for the btrfs_raid_bio.  Note that
 * this does not allocate any pages for rbio->stripe_pages.
 */
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
					 struct btrfs_io_context *bioc)
{
	const unsigned int real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
	const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT;
	const unsigned int num_pages = stripe_npages * real_stripes;
	const unsigned int stripe_nsectors =
		BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
	const unsigned int num_sectors = stripe_nsectors * real_stripes;
	struct btrfs_raid_bio *rbio;
	void *p;

	/* PAGE_SIZE must also be aligned to sectorsize for subpage support */
	ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize));
	/*
	 * Our current stripe len should be fixed to 64k thus stripe_nsectors
	 * (at most 16) should be no larger than BITS_PER_LONG.
	 */
	ASSERT(stripe_nsectors <= BITS_PER_LONG);

	rbio = kzalloc(sizeof(*rbio) +
		       sizeof(*rbio->stripe_pages) * num_pages +
		       sizeof(*rbio->bio_sectors) * num_sectors +
		       sizeof(*rbio->stripe_sectors) * num_sectors +
		       sizeof(*rbio->finish_pointers) * real_stripes,
		       GFP_NOFS);
	if (!rbio)
		return ERR_PTR(-ENOMEM);

	bio_list_init(&rbio->bio_list);
	INIT_LIST_HEAD(&rbio->plug_list);
	spin_lock_init(&rbio->bio_list_lock);
	INIT_LIST_HEAD(&rbio->stripe_cache);
	INIT_LIST_HEAD(&rbio->hash_list);
	rbio->bioc = bioc;
	rbio->nr_pages = num_pages;
	rbio->nr_sectors = num_sectors;
	rbio->real_stripes = real_stripes;
	rbio->stripe_npages = stripe_npages;
	rbio->stripe_nsectors = stripe_nsectors;
	rbio->faila = -1;
	rbio->failb = -1;
	refcount_set(&rbio->refs, 1);
	atomic_set(&rbio->error, 0);
	atomic_set(&rbio->stripes_pending, 0);

	/*
	 * The stripe_pages, bio_sectors, etc arrays point to the extra memory
	 * we allocated past the end of the rbio.
	 */
	p = rbio + 1;
#define CONSUME_ALLOC(ptr, count)	do {				\
		ptr = p;						\
		p = (unsigned char *)p + sizeof(*(ptr)) * (count);	\
	} while (0)
	CONSUME_ALLOC(rbio->stripe_pages, num_pages);
	CONSUME_ALLOC(rbio->bio_sectors, num_sectors);
	CONSUME_ALLOC(rbio->stripe_sectors, num_sectors);
	CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
#undef  CONSUME_ALLOC

	ASSERT(btrfs_nr_parity_stripes(bioc->map_type));
	rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type);

	return rbio;
}
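
/*
 * Sizing example (assuming 4K pages and 4K sectorsize): a 4 disk RAID6
 * full stripe (2 data + P + Q) has stripe_npages == 16, so the trailing
 * arrays hold 64 page pointers, 2 * 64 sector_ptrs and 4 finish pointers,
 * all carved out of the single kzalloc() above.
 */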
/* allocate pages for all the stripes in the bio, including parity */
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int ret;

	ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages);
	if (ret < 0)
		return ret;
	/* Mapping all sectors */
	index_stripe_sectors(rbio);
	return 0;
}

/* only allocate pages for p/q stripes */
static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
{
	const int data_pages = rbio->nr_data * rbio->stripe_npages;
	int ret;

	ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
				     rbio->stripe_pages + data_pages);
	if (ret < 0)
		return ret;

	index_stripe_sectors(rbio);
	return 0;
}
/*
 * Add a single sector @sector into our list of bios for IO.
 *
 * Return 0 if everything went well.
 * Return <0 for error.
 */
static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
			      struct bio_list *bio_list,
			      struct sector_ptr *sector,
			      unsigned int stripe_nr,
			      unsigned int sector_nr,
			      enum req_op op)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	struct bio *last = bio_list->tail;
	int ret;
	struct bio *bio;
	struct btrfs_io_stripe *stripe;
	u64 disk_start;

	/*
	 * Note: here stripe_nr has taken device replace into consideration,
	 * thus it can be larger than rbio->real_stripes.
	 * So here we check against bioc->num_stripes, not rbio->real_stripes.
	 */
	ASSERT(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes);
	ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
	ASSERT(sector->page);

	stripe = &rbio->bioc->stripes[stripe_nr];
	disk_start = stripe->physical + sector_nr * sectorsize;

	/* if the device is missing, just fail this stripe */
	if (!stripe->dev->bdev)
		return fail_rbio_index(rbio, stripe_nr);

	/* see if we can add this page onto our existing bio */
	if (last) {
		u64 last_end = last->bi_iter.bi_sector << 9;
		last_end += last->bi_iter.bi_size;

		/*
		 * we can't merge these if they are from different
		 * devices or if they are not contiguous
		 */
		if (last_end == disk_start && !last->bi_status &&
		    last->bi_bdev == stripe->dev->bdev) {
			ret = bio_add_page(last, sector->page, sectorsize,
					   sector->pgoff);
			if (ret == sectorsize)
				return 0;
		}
	}

	/* put a new bio on the list */
	bio = bio_alloc(stripe->dev->bdev,
			max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1),
			op, GFP_NOFS);
	bio->bi_iter.bi_sector = disk_start >> 9;
	bio->bi_private = rbio;

	bio_add_page(bio, sector->page, sectorsize, sector->pgoff);
	bio_list_add(bio_list, bio);
	return 0;
}
/*
 * while we're doing the read/modify/write cycle, we could
 * have errors in reading pages off the disk.  This checks
 * for errors and if we're not able to read the page it'll
 * trigger parity reconstruction.  The rmw will be finished
 * after we've reconstructed the failed stripes
 */
static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
{
	if (rbio->faila >= 0 || rbio->failb >= 0) {
		BUG_ON(rbio->faila == rbio->real_stripes - 1);
		__raid56_parity_recover(rbio);
	} else {
		finish_rmw(rbio);
	}
}
static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	struct bio_vec bvec;
	struct bvec_iter iter;
	u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
		     rbio->bioc->raid_map[0];

	bio_for_each_segment(bvec, bio, iter) {
		u32 bvec_offset;

		for (bvec_offset = 0; bvec_offset < bvec.bv_len;
		     bvec_offset += sectorsize, offset += sectorsize) {
			int index = offset / sectorsize;
			struct sector_ptr *sector = &rbio->bio_sectors[index];

			sector->page = bvec.bv_page;
			sector->pgoff = bvec.bv_offset + bvec_offset;
			ASSERT(sector->pgoff < PAGE_SIZE);
		}
	}
}
/*
 * helper function to walk our bio list and populate the bio_sectors array
 * with the result.  This seems expensive, but it is faster than constantly
 * searching through the bio list as we setup the IO in finish_rmw or stripe
 * reconstruction.
 *
 * This must be called before you trust the answers from sector_in_rbio
 */
static void index_rbio_pages(struct btrfs_raid_bio *rbio)
{
	struct bio *bio;

	spin_lock_irq(&rbio->bio_list_lock);
	bio_list_for_each(bio, &rbio->bio_list)
		index_one_bio(rbio, bio);

	spin_unlock_irq(&rbio->bio_list_lock);
}
static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio,
			       struct raid56_bio_trace_info *trace_info)
{
	const struct btrfs_io_context *bioc = rbio->bioc;
	int i;

	ASSERT(bioc);

	/* We rely on bio->bi_bdev to find the stripe number. */
	if (!bio->bi_bdev)
		goto not_found;

	for (i = 0; i < bioc->num_stripes; i++) {
		if (bio->bi_bdev != bioc->stripes[i].dev->bdev)
			continue;
		trace_info->stripe_nr = i;
		trace_info->devid = bioc->stripes[i].dev->devid;
		trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
				     bioc->stripes[i].physical;
		return;
	}

not_found:
	trace_info->devid = -1;
	trace_info->offset = -1;
	trace_info->stripe_nr = -1;
}
/*
 * this is called from one of two situations.  We either
 * have a full stripe from the higher layers, or we've read all
 * the missing bits off disk.
 *
 * This will calculate the parity and then send down any
 * changed blocks.
 */
static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
{
	struct btrfs_io_context *bioc = rbio->bioc;
	const u32 sectorsize = bioc->fs_info->sectorsize;
	void **pointers = rbio->finish_pointers;
	int nr_data = rbio->nr_data;
	/* The total sector number inside the full stripe. */
	int total_sector_nr;
	int stripe;
	/* Sector number inside a stripe. */
	int sectornr;
	bool has_qstripe;
	struct bio_list bio_list;
	struct bio *bio;
	int ret;

	bio_list_init(&bio_list);

	if (rbio->real_stripes - rbio->nr_data == 1)
		has_qstripe = false;
	else if (rbio->real_stripes - rbio->nr_data == 2)
		has_qstripe = true;
	else
		BUG();

	/* We should have at least one data sector. */
	ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors));

	/* at this point we either have a full stripe,
	 * or we've read the full stripe from the drive.
	 * recalculate the parity and write the new results.
	 *
	 * We're not allowed to add any new bios to the
	 * bio list here, anyone else that wants to
	 * change this stripe needs to do their own rmw.
	 */
	spin_lock_irq(&rbio->bio_list_lock);
	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
	spin_unlock_irq(&rbio->bio_list_lock);

	atomic_set(&rbio->error, 0);

	/*
	 * now that we've set rmw_locked, run through the
	 * bio list one last time and map the page pointers
	 *
	 * We don't cache full rbios because we're assuming
	 * the higher layers are unlikely to use this area of
	 * the disk again soon.  If they do use it again,
	 * hopefully they will send another full bio.
	 */
	index_rbio_pages(rbio);
	if (!rbio_is_full(rbio))
		cache_rbio_pages(rbio);
	else
		clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);

	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
		struct sector_ptr *sector;

		/* First collect one sector from each data stripe */
		for (stripe = 0; stripe < nr_data; stripe++) {
			sector = sector_in_rbio(rbio, stripe, sectornr, 0);
			pointers[stripe] = kmap_local_page(sector->page) +
					   sector->pgoff;
		}

		/* Then add the parity stripe */
		sector = rbio_pstripe_sector(rbio, sectornr);
		sector->uptodate = 1;
		pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff;

		if (has_qstripe) {
			/*
			 * RAID6, add the qstripe and call the library function
			 * to fill in our p/q
			 */
			sector = rbio_qstripe_sector(rbio, sectornr);
			sector->uptodate = 1;
			pointers[stripe++] = kmap_local_page(sector->page) +
					     sector->pgoff;

			raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
						pointers);
		} else {
			/* raid5 */
			memcpy(pointers[nr_data], pointers[0], sectorsize);
			run_xor(pointers + 1, nr_data - 1, sectorsize);
		}
		for (stripe = stripe - 1; stripe >= 0; stripe--)
			kunmap_local(pointers[stripe]);
	}

	/*
	 * Start writing.  Make bios for everything from the higher layers (the
	 * bio_list in our rbio) and our P/Q.  Ignore everything else.
	 */
	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
	     total_sector_nr++) {
		struct sector_ptr *sector;

		stripe = total_sector_nr / rbio->stripe_nsectors;
		sectornr = total_sector_nr % rbio->stripe_nsectors;

		/* This vertical stripe has no data, skip it. */
		if (!test_bit(sectornr, &rbio->dbitmap))
			continue;

		if (stripe < rbio->nr_data) {
			sector = sector_in_rbio(rbio, stripe, sectornr, 1);
			if (!sector)
				continue;
		} else {
			sector = rbio_stripe_sector(rbio, stripe, sectornr);
		}

		ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
					 sectornr, REQ_OP_WRITE);
		if (ret)
			goto cleanup;
	}

	if (likely(!bioc->num_tgtdevs))
		goto write_data;

	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
	     total_sector_nr++) {
		struct sector_ptr *sector;

		stripe = total_sector_nr / rbio->stripe_nsectors;
		sectornr = total_sector_nr % rbio->stripe_nsectors;

		if (!bioc->tgtdev_map[stripe]) {
			/*
			 * We can skip the whole stripe completely, note
			 * total_sector_nr will be increased by one anyway.
			 */
			ASSERT(sectornr == 0);
			total_sector_nr += rbio->stripe_nsectors - 1;
			continue;
		}

		/* This vertical stripe has no data, skip it. */
		if (!test_bit(sectornr, &rbio->dbitmap))
			continue;

		if (stripe < rbio->nr_data) {
			sector = sector_in_rbio(rbio, stripe, sectornr, 1);
			if (!sector)
				continue;
		} else {
			sector = rbio_stripe_sector(rbio, stripe, sectornr);
		}

		ret = rbio_add_io_sector(rbio, &bio_list, sector,
					 rbio->bioc->tgtdev_map[stripe],
					 sectornr, REQ_OP_WRITE);
		if (ret)
			goto cleanup;
	}

write_data:
	atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
	BUG_ON(atomic_read(&rbio->stripes_pending) == 0);

	while ((bio = bio_list_pop(&bio_list))) {
		bio->bi_end_io = raid_write_end_io;

		if (trace_raid56_write_stripe_enabled()) {
			struct raid56_bio_trace_info trace_info = { 0 };

			bio_get_trace_info(rbio, bio, &trace_info);
			trace_raid56_write_stripe(rbio, bio, &trace_info);
		}
		submit_bio(bio);
	}
	return;

cleanup:
	rbio_orig_end_io(rbio, BLK_STS_IOERR);

	while ((bio = bio_list_pop(&bio_list)))
		bio_put(bio);
}
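
/*
 * Parity math used above, per vertical stripe of sectorsize bytes:
 * RAID5 computes P = D0 ^ D1 ^ ... ^ Dn-1 (the memcpy + run_xor pair),
 * while RAID6 lets raid6_call.gen_syndrome() produce both P and the
 * Galois field syndrome Q = g^0*D0 ^ g^1*D1 ^ ... ^ g^(n-1)*Dn-1.
 */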
/*
 * helper to find the stripe number for a given bio.  Used to figure out which
 * stripe has failed.  This expects the bio to correspond to a physical disk,
 * so it looks up based on physical sector numbers.
 */
static int find_bio_stripe(struct btrfs_raid_bio *rbio,
			   struct bio *bio)
{
	u64 physical = bio->bi_iter.bi_sector;
	int i;
	struct btrfs_io_stripe *stripe;

	physical <<= 9;

	for (i = 0; i < rbio->bioc->num_stripes; i++) {
		stripe = &rbio->bioc->stripes[i];
		if (in_range(physical, stripe->physical, BTRFS_STRIPE_LEN) &&
		    stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) {
			return i;
		}
	}
	return -1;
}

/*
 * helper to find the stripe number for a given
 * bio (before mapping).  Used to figure out which stripe has
 * failed.  This looks up based on logical block numbers.
 */
static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
				   struct bio *bio)
{
	u64 logical = bio->bi_iter.bi_sector << 9;
	int i;

	for (i = 0; i < rbio->nr_data; i++) {
		u64 stripe_start = rbio->bioc->raid_map[i];

		if (in_range(logical, stripe_start, BTRFS_STRIPE_LEN))
			return i;
	}
	return -1;
}
/*
 * returns -EIO if we had too many failures
 */
static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
{
	unsigned long flags;
	int ret = 0;

	spin_lock_irqsave(&rbio->bio_list_lock, flags);

	/* we already know this stripe is bad, move on */
	if (rbio->faila == failed || rbio->failb == failed)
		goto out;

	if (rbio->faila == -1) {
		/* first failure on this rbio */
		rbio->faila = failed;
		atomic_inc(&rbio->error);
	} else if (rbio->failb == -1) {
		/* second failure on this rbio */
		rbio->failb = failed;
		atomic_inc(&rbio->error);
	} else {
		ret = -EIO;
	}
out:
	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);

	return ret;
}

/*
 * helper to fail a stripe based on a physical disk
 * bio.
 */
static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
			   struct bio *bio)
{
	int failed = find_bio_stripe(rbio, bio);

	if (failed < 0)
		return -EIO;

	return fail_rbio_index(rbio, failed);
}
/*
 * For subpage case, we can no longer set page Uptodate directly for
 * stripe_pages[], thus we need to locate the sector.
 */
static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio,
					     struct page *page,
					     unsigned int pgoff)
{
	int i;

	for (i = 0; i < rbio->nr_sectors; i++) {
		struct sector_ptr *sector = &rbio->stripe_sectors[i];

		if (sector->page == page && sector->pgoff == pgoff)
			return sector;
	}
	return NULL;
}

/*
 * this sets each sector in the bio uptodate.  It should only be used on
 * private rbio pages, nothing that comes in from the higher layers
 */
static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	struct bio_vec *bvec;
	struct bvec_iter_all iter_all;

	ASSERT(!bio_flagged(bio, BIO_CLONED));

	bio_for_each_segment_all(bvec, bio, iter_all) {
		struct sector_ptr *sector;
		int pgoff;

		for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len;
		     pgoff += sectorsize) {
			sector = find_stripe_sector(rbio, bvec->bv_page, pgoff);
			ASSERT(sector);
			if (sector)
				sector->uptodate = 1;
		}
	}
}
static void raid56_bio_end_io(struct bio *bio)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;

	if (bio->bi_status)
		fail_bio_stripe(rbio, bio);
	else
		set_bio_pages_uptodate(rbio, bio);

	bio_put(bio);

	if (atomic_dec_and_test(&rbio->stripes_pending))
		queue_work(rbio->bioc->fs_info->endio_raid56_workers,
			   &rbio->end_io_work);
}
/*
 * End io handler for the read phase of the RMW cycle.  All the bios here are
 * physical stripe bios we've read from the disk so we can recalculate the
 * parity of the stripe.
 *
 * This will usually kick off finish_rmw once all the bios are read in, but it
 * may trigger parity reconstruction if we had any errors along the way
 */
static void raid56_rmw_end_io_work(struct work_struct *work)
{
	struct btrfs_raid_bio *rbio =
		container_of(work, struct btrfs_raid_bio, end_io_work);

	if (atomic_read(&rbio->error) > rbio->bioc->max_errors) {
		rbio_orig_end_io(rbio, BLK_STS_IOERR);
		return;
	}

	/*
	 * This will normally call finish_rmw to start our write but if there
	 * are any failed stripes we'll reconstruct from parity first.
	 */
	validate_rbio_for_rmw(rbio);
}
/*
 * the stripe must be locked by the caller.  It will
 * unlock after all the writes are done
 */
static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
{
	int bios_to_read = 0;
	struct bio_list bio_list;
	const int nr_data_sectors = rbio->stripe_nsectors * rbio->nr_data;
	int ret;
	int total_sector_nr;
	struct bio *bio;

	bio_list_init(&bio_list);

	ret = alloc_rbio_pages(rbio);
	if (ret)
		goto cleanup;

	index_rbio_pages(rbio);

	atomic_set(&rbio->error, 0);
	/* Build a list of bios to read all the missing data sectors. */
	for (total_sector_nr = 0; total_sector_nr < nr_data_sectors;
	     total_sector_nr++) {
		struct sector_ptr *sector;
		int stripe = total_sector_nr / rbio->stripe_nsectors;
		int sectornr = total_sector_nr % rbio->stripe_nsectors;

		/*
		 * We want to find all the sectors missing from the rbio and
		 * read them from the disk.  If sector_in_rbio() finds a page
		 * in the bio list we don't need to read it off the stripe.
		 */
		sector = sector_in_rbio(rbio, stripe, sectornr, 1);
		if (sector)
			continue;

		sector = rbio_stripe_sector(rbio, stripe, sectornr);
		/*
		 * The bio cache may have handed us an uptodate page.  If so,
		 * use it.
		 */
		if (sector->uptodate)
			continue;

		ret = rbio_add_io_sector(rbio, &bio_list, sector,
					 stripe, sectornr, REQ_OP_READ);
		if (ret)
			goto cleanup;
	}

	bios_to_read = bio_list_size(&bio_list);
	if (!bios_to_read) {
		/*
		 * this can happen if others have merged with
		 * us, it means there is nothing left to read.
		 * But if there are missing devices it may not be
		 * safe to do the full stripe write yet.
		 */
		goto finish;
	}

	/*
	 * The bioc may be freed once we submit the last bio.  Make sure not to
	 * touch it after that.
	 */
	atomic_set(&rbio->stripes_pending, bios_to_read);
	INIT_WORK(&rbio->end_io_work, raid56_rmw_end_io_work);
	while ((bio = bio_list_pop(&bio_list))) {
		bio->bi_end_io = raid56_bio_end_io;

		if (trace_raid56_read_partial_enabled()) {
			struct raid56_bio_trace_info trace_info = { 0 };

			bio_get_trace_info(rbio, bio, &trace_info);
			trace_raid56_read_partial(rbio, bio, &trace_info);
		}
		submit_bio(bio);
	}
	/* the actual write will happen once the reads are done */
	return 0;

cleanup:
	rbio_orig_end_io(rbio, BLK_STS_IOERR);

	while ((bio = bio_list_pop(&bio_list)))
		bio_put(bio);

	return -EIO;

finish:
	validate_rbio_for_rmw(rbio);

	return 0;
}
/*
 * if the upper layers pass in a full stripe, we thank them by only allocating
 * enough pages to hold the parity, and sending it all down quickly.
 */
static int full_stripe_write(struct btrfs_raid_bio *rbio)
{
	int ret;

	ret = alloc_rbio_parity_pages(rbio);
	if (ret) {
		__free_raid_bio(rbio);
		return ret;
	}

	ret = lock_stripe_add(rbio);
	if (ret == 0)
		finish_rmw(rbio);
	return 0;
}

/*
 * partial stripe writes get handed over to async helpers.
 * We're really hoping to merge a few more writes into this
 * rbio before calculating new parity
 */
static int partial_stripe_write(struct btrfs_raid_bio *rbio)
{
	int ret;

	ret = lock_stripe_add(rbio);
	if (ret == 0)
		start_async_work(rbio, rmw_work);
	return 0;
}

/*
 * sometimes while we were reading from the drive to
 * recalculate parity, enough new bios come in to create
 * a full stripe.  So we do a check here to see if we can
 * go directly to finish_rmw
 */
static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
{
	/* head off into rmw land if we don't have a full stripe */
	if (!rbio_is_full(rbio))
		return partial_stripe_write(rbio);
	return full_stripe_write(rbio);
}
/*
 * We use plugging call backs to collect full stripes.
 * Any time we get a partial stripe write while plugged
 * we collect it into a list.  When the unplug comes down,
 * we sort the list by logical block number and merge
 * everything we can into the same rbios
 */
struct btrfs_plug_cb {
	struct blk_plug_cb cb;
	struct btrfs_fs_info *info;
	struct list_head rbio_list;
	struct work_struct work;
};
/*
 * rbios on the plug list are sorted for easier merging.
 */
static int plug_cmp(void *priv, const struct list_head *a,
		    const struct list_head *b)
{
	const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
						       plug_list);
	const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
						       plug_list);
	u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
	u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;

	if (a_sector < b_sector)
		return -1;
	if (a_sector > b_sector)
		return 1;
	return 0;
}
static void run_plug(struct btrfs_plug_cb *plug)
{
	struct btrfs_raid_bio *cur;
	struct btrfs_raid_bio *last = NULL;

	/*
	 * sort our plug list then try to merge
	 * everything we can in hopes of creating full
	 * stripes.
	 */
	list_sort(NULL, &plug->rbio_list, plug_cmp);
	while (!list_empty(&plug->rbio_list)) {
		cur = list_entry(plug->rbio_list.next,
				 struct btrfs_raid_bio, plug_list);
		list_del_init(&cur->plug_list);

		if (rbio_is_full(cur)) {
			int ret;

			/* we have a full stripe, send it down */
			ret = full_stripe_write(cur);
			BUG_ON(ret);
			continue;
		}
		if (last) {
			if (rbio_can_merge(last, cur)) {
				merge_rbio(last, cur);
				__free_raid_bio(cur);
				continue;
			}
			__raid56_parity_write(last);
		}
		last = cur;
	}
	if (last)
		__raid56_parity_write(last);
	kfree(plug);
}
/*
 * if the unplug comes from schedule, we have to push the
 * work off to a helper thread
 */
static void unplug_work(struct work_struct *work)
{
	struct btrfs_plug_cb *plug;
	plug = container_of(work, struct btrfs_plug_cb, work);
	run_plug(plug);
}

static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
{
	struct btrfs_plug_cb *plug;
	plug = container_of(cb, struct btrfs_plug_cb, cb);

	if (from_schedule) {
		INIT_WORK(&plug->work, unplug_work);
		queue_work(plug->info->rmw_workers, &plug->work);
		return;
	}
	run_plug(plug);
}
/* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
{
	const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
	const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT;
	const u64 full_stripe_start = rbio->bioc->raid_map[0];
	const u32 orig_len = orig_bio->bi_iter.bi_size;
	const u32 sectorsize = fs_info->sectorsize;
	u64 cur_logical;

	ASSERT(orig_logical >= full_stripe_start &&
	       orig_logical + orig_len <= full_stripe_start +
	       rbio->nr_data * BTRFS_STRIPE_LEN);

	bio_list_add(&rbio->bio_list, orig_bio);
	rbio->bio_list_bytes += orig_bio->bi_iter.bi_size;

	/* Update the dbitmap. */
	for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len;
	     cur_logical += sectorsize) {
		int bit = ((u32)(cur_logical - full_stripe_start) >>
			   fs_info->sectorsize_bits) % rbio->stripe_nsectors;

		set_bit(bit, &rbio->dbitmap);
	}
}
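
/*
 * dbitmap example (assuming 4K sectorsize and 64K BTRFS_STRIPE_LEN, so 16
 * sectors per stripe): a 4K write at full_stripe_start + 68K maps to bit
 * (68K >> 12) % 16 == 1, i.e. the second vertical stripe, no matter which
 * data stripe the write lands on.
 */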
/*
 * our main entry point for writes from the rest of the FS.
 */
void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
{
	struct btrfs_fs_info *fs_info = bioc->fs_info;
	struct btrfs_raid_bio *rbio;
	struct btrfs_plug_cb *plug = NULL;
	struct blk_plug_cb *cb;
	int ret = 0;

	rbio = alloc_rbio(fs_info, bioc);
	if (IS_ERR(rbio)) {
		btrfs_put_bioc(bioc);
		ret = PTR_ERR(rbio);
		goto out_dec_counter;
	}
	rbio->operation = BTRFS_RBIO_WRITE;
	rbio_add_bio(rbio, bio);

	rbio->generic_bio_cnt = 1;

	/*
	 * don't plug on full rbios, just get them out the door
	 * as quickly as we can
	 */
	if (rbio_is_full(rbio)) {
		ret = full_stripe_write(rbio);
		if (ret)
			goto out_dec_counter;
		return;
	}

	cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug));
	if (cb) {
		plug = container_of(cb, struct btrfs_plug_cb, cb);
		if (!plug->info) {
			plug->info = fs_info;
			INIT_LIST_HEAD(&plug->rbio_list);
		}
		list_add_tail(&rbio->plug_list, &plug->rbio_list);
	} else {
		ret = __raid56_parity_write(rbio);
		if (ret)
			goto out_dec_counter;
	}

	return;

out_dec_counter:
	btrfs_bio_counter_dec(fs_info);
	bio->bi_status = errno_to_blk_status(ret);
	bio_endio(bio);
}
/*
 * all parity reconstruction happens here.  We've read in everything
 * we can find from the drives and this does the heavy lifting of
 * sorting the good from the bad.
 */
static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	int sectornr, stripe;
	void **pointers;
	void **unmap_array;
	int faila = -1, failb = -1;
	blk_status_t err;
	int i;

	/*
	 * This array stores the pointer for each sector, thus it has the extra
	 * pgoff value added from each sector
	 */
	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
	if (!pointers) {
		err = BLK_STS_RESOURCE;
		goto cleanup_io;
	}

	/*
	 * Store copy of pointers that does not get reordered during
	 * reconstruction so that kunmap_local works.
	 */
	unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
	if (!unmap_array) {
		err = BLK_STS_RESOURCE;
		goto cleanup_pointers;
	}

	faila = rbio->faila;
	failb = rbio->failb;

	if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
	    rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
		spin_lock_irq(&rbio->bio_list_lock);
		set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
		spin_unlock_irq(&rbio->bio_list_lock);
	}

	index_rbio_pages(rbio);

	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
		struct sector_ptr *sector;

		/*
		 * Now we just use bitmap to mark the horizontal stripes in
		 * which we have data when doing parity scrub.
		 */
		if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
		    !test_bit(sectornr, &rbio->dbitmap))
			continue;

		/*
		 * Setup our array of pointers with sectors from each stripe
		 *
		 * NOTE: store a duplicate array of pointers to preserve the
		 * pointer order
		 */
		for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
			/*
			 * If we're rebuilding a read, we have to use
			 * pages from the bio list
			 */
			if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
			     rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
			    (stripe == faila || stripe == failb)) {
				sector = sector_in_rbio(rbio, stripe, sectornr, 0);
			} else {
				sector = rbio_stripe_sector(rbio, stripe, sectornr);
			}
			ASSERT(sector->page);
			pointers[stripe] = kmap_local_page(sector->page) +
					   sector->pgoff;
			unmap_array[stripe] = pointers[stripe];
		}

		/* All raid6 handling here */
		if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
			/* Single failure, rebuild from parity raid5 style */
			if (failb < 0) {
				if (faila == rbio->nr_data) {
					/*
					 * Just the P stripe has failed, without
					 * a bad data or Q stripe.
					 * TODO, we should redo the xor here.
					 */
					err = BLK_STS_IOERR;
					goto cleanup;
				}
				/*
				 * a single failure in raid6 is rebuilt
				 * in the pstripe code below
				 */
				goto pstripe;
			}

			/* make sure our ps and qs are in order */
			if (faila > failb)
				swap(faila, failb);

			/* if the q stripe is failed, do a pstripe reconstruction
			 * from the xors.
			 * If both the q stripe and the P stripe are failed, we're
			 * here due to a crc mismatch and we can't give them the
			 * data they want
			 */
			if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) {
				if (rbio->bioc->raid_map[faila] ==
				    RAID5_P_STRIPE) {
					err = BLK_STS_IOERR;
					goto cleanup;
				}
				/*
				 * otherwise we have one bad data stripe and
				 * a good P stripe.  raid5!
				 */
				goto pstripe;
			}

			if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) {
				raid6_datap_recov(rbio->real_stripes,
						  sectorsize, faila, pointers);
			} else {
				raid6_2data_recov(rbio->real_stripes,
						  sectorsize, faila, failb,
						  pointers);
			}
		} else {
			void *p;

			/* rebuild from P stripe here (raid5 or raid6) */
			BUG_ON(failb != -1);
pstripe:
			/* Copy parity block into failed block to start with */
			memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize);

			/* rearrange the pointer array */
			p = pointers[faila];
			for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
				pointers[stripe] = pointers[stripe + 1];
			pointers[rbio->nr_data - 1] = p;

			/* xor in the rest */
			run_xor(pointers, rbio->nr_data - 1, sectorsize);
		}

		/* if we're doing this rebuild as part of an rmw, go through
		 * and set all of our private rbio pages in the
		 * failed stripes as uptodate.  This way finish_rmw will
		 * know they can be trusted.  If this was a read reconstruction,
		 * other endio functions will fiddle the uptodate bits
		 */
		if (rbio->operation == BTRFS_RBIO_WRITE) {
			for (i = 0; i < rbio->stripe_nsectors; i++) {
				if (faila != -1) {
					sector = rbio_stripe_sector(rbio, faila, i);
					sector->uptodate = 1;
				}
				if (failb != -1) {
					sector = rbio_stripe_sector(rbio, failb, i);
					sector->uptodate = 1;
				}
			}
		}
		for (stripe = rbio->real_stripes - 1; stripe >= 0; stripe--)
			kunmap_local(unmap_array[stripe]);
	}

	err = BLK_STS_OK;
cleanup:
	kfree(unmap_array);
cleanup_pointers:
	kfree(pointers);

cleanup_io:
	/*
	 * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a
	 * valid rbio which is consistent with ondisk content, thus such a
	 * valid rbio can be cached to avoid further disk reads.
	 */
	if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
	    rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
		/*
		 * - In case of two failures, where rbio->failb != -1:
		 *
		 *   Do not cache this rbio since the above read reconstruction
		 *   (raid6_datap_recov() or raid6_2data_recov()) may have
		 *   changed some content of stripes which are not identical to
		 *   on-disk content any more, otherwise, a later write/recover
		 *   may steal stripe_pages from this rbio and end up with
		 *   corruptions or rebuild failures.
		 *
		 * - In case of single failure, where rbio->failb == -1:
		 *
		 *   Cache this rbio iff the above read reconstruction is
		 *   executed without problems.
		 */
		if (err == BLK_STS_OK && rbio->failb < 0)
			cache_rbio_pages(rbio);
		else
			clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);

		rbio_orig_end_io(rbio, err);
	} else if (err == BLK_STS_OK) {
		rbio->faila = -1;
		rbio->failb = -1;

		if (rbio->operation == BTRFS_RBIO_WRITE)
			finish_rmw(rbio);
		else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
			finish_parity_scrub(rbio, 0);
		else
			BUG();
	} else {
		rbio_orig_end_io(rbio, err);
	}
}
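
/*
 * Summary of the RAID6 decisions above (after faila/failb are sorted):
 *   - single data stripe bad:	xor rebuild via the pstripe path
 *   - data stripe + Q bad:	raid5 style xor rebuild of the data; Q is
 *				only regenerated if this rbio goes on to
 *				write (finish_rmw)
 *   - data stripe + P bad:	raid6_datap_recov()
 *   - two data stripes bad:	raid6_2data_recov()
 *   - P + Q both bad:		nothing to rebuild the data from, IO error
 */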
/*
 * This is called only for stripes we've read from disk to reconstruct the
 * parity.
 */
static void raid_recover_end_io_work(struct work_struct *work)
{
	struct btrfs_raid_bio *rbio =
		container_of(work, struct btrfs_raid_bio, end_io_work);

	if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
		rbio_orig_end_io(rbio, BLK_STS_IOERR);
	else
		__raid_recover_end_io(rbio);
}
/*
 * reads everything we need off the disk to reconstruct
 * the parity. endio handlers trigger final reconstruction
 * when the IO is done.
 *
 * This is used both for reads from the higher layers and for
 * parity construction required to finish a rmw cycle.
 */
static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
{
	int bios_to_read = 0;
	struct bio_list bio_list;
	int ret;
	int total_sector_nr;
	struct bio *bio;

	bio_list_init(&bio_list);

	ret = alloc_rbio_pages(rbio);
	if (ret)
		goto cleanup;

	atomic_set(&rbio->error, 0);

	/*
	 * Read everything that hasn't failed.  However this time we will
	 * not trust any cached sector.
	 * A cached sector can contain stale data from parts the higher
	 * layer is not going to read back and verify.
	 *
	 * So here we always re-read everything in recovery path.
	 */
	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
	     total_sector_nr++) {
		int stripe = total_sector_nr / rbio->stripe_nsectors;
		int sectornr = total_sector_nr % rbio->stripe_nsectors;
		struct sector_ptr *sector;

		if (rbio->faila == stripe || rbio->failb == stripe) {
			atomic_inc(&rbio->error);
			/* Skip the current stripe. */
			ASSERT(sectornr == 0);
			total_sector_nr += rbio->stripe_nsectors - 1;
			continue;
		}
		sector = rbio_stripe_sector(rbio, stripe, sectornr);
		ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
					 sectornr, REQ_OP_READ);
		if (ret < 0)
			goto cleanup;
	}

	bios_to_read = bio_list_size(&bio_list);
	if (!bios_to_read) {
		/*
		 * we might have no bios to read just because the pages
		 * were up to date, or we might have no bios to read because
		 * the devices were gone.
		 */
		if (atomic_read(&rbio->error) <= rbio->bioc->max_errors) {
			__raid_recover_end_io(rbio);
			return 0;
		} else {
			goto cleanup;
		}
	}

	/*
	 * The bioc may be freed once we submit the last bio.  Make sure not to
	 * touch it after that.
	 */
	atomic_set(&rbio->stripes_pending, bios_to_read);
	INIT_WORK(&rbio->end_io_work, raid_recover_end_io_work);
	while ((bio = bio_list_pop(&bio_list))) {
		bio->bi_end_io = raid56_bio_end_io;

		if (trace_raid56_scrub_read_recover_enabled()) {
			struct raid56_bio_trace_info trace_info = { 0 };

			bio_get_trace_info(rbio, bio, &trace_info);
			trace_raid56_scrub_read_recover(rbio, bio, &trace_info);
		}
		submit_bio(bio);
	}

	return 0;

cleanup:
	if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
	    rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
		rbio_orig_end_io(rbio, BLK_STS_IOERR);

	while ((bio = bio_list_pop(&bio_list)))
		bio_put(bio);

	return -EIO;
}
/*
 * the main entry point for reads from the higher layers.  This
 * is really only called when the normal read path had a failure,
 * so we assume the bio they send down corresponds to a failed part
 * of the drive.
 */
void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
			   int mirror_num, bool generic_io)
{
	struct btrfs_fs_info *fs_info = bioc->fs_info;
	struct btrfs_raid_bio *rbio;

	if (generic_io) {
		ASSERT(bioc->mirror_num == mirror_num);
		btrfs_bio(bio)->mirror_num = mirror_num;
	} else {
		btrfs_get_bioc(bioc);
	}

	rbio = alloc_rbio(fs_info, bioc);
	if (IS_ERR(rbio)) {
		bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
		goto out_end_bio;
	}

	rbio->operation = BTRFS_RBIO_READ_REBUILD;
	rbio_add_bio(rbio, bio);

	rbio->faila = find_logical_bio_stripe(rbio, bio);
	if (rbio->faila == -1) {
		btrfs_warn(fs_info,
"%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)",
			   __func__, bio->bi_iter.bi_sector << 9,
			   (u64)bio->bi_iter.bi_size, bioc->map_type);
		__free_raid_bio(rbio);
		bio->bi_status = BLK_STS_IOERR;
		goto out_end_bio;
	}

	if (generic_io)
		rbio->generic_bio_cnt = 1;

	/*
	 * Loop retry:
	 * for 'mirror == 2', reconstruct from all other stripes.
	 * for 'mirror_num > 2', select a stripe to fail on every retry.
	 */
	if (mirror_num > 2) {
		/*
		 * 'mirror == 3' is to fail the p stripe and
		 * reconstruct from the q stripe.  'mirror > 3' is to
		 * fail a data stripe and reconstruct from p+q stripe.
		 */
		rbio->failb = rbio->real_stripes - (mirror_num - 1);
		ASSERT(rbio->failb > 0);
		if (rbio->failb <= rbio->faila)
			rbio->failb--;
	}

	/*
	 * If lock_stripe_add() returns 1, our rbio was added to the plug
	 * list of the current lock owner and will be handled after the
	 * owner is done.
	 */
	if (lock_stripe_add(rbio))
		return;

	__raid56_parity_recover(rbio);
	return;

out_end_bio:
	btrfs_bio_counter_dec(fs_info);
	btrfs_put_bioc(bioc);
	bio_endio(bio);
}
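
/*
 * Retry example: on RAID6 with 4 data stripes (real_stripes == 6),
 * mirror_num == 3 gives failb = 6 - 2 == 4, which is the P stripe, so the
 * rebuild must come from Q; mirror_num == 4 fails a data stripe instead
 * and rebuilds from P + Q.
 */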
static void rmw_work(struct work_struct *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	raid56_rmw_stripe(rbio);
}

static void read_rebuild_work(struct work_struct *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	__raid56_parity_recover(rbio);
}
/*
 * The following code is used to scrub/replace the parity stripe.
 *
 * Caller must have already increased bio_counter for getting @bioc.
 *
 * Note: We need to make sure all the pages added to the scrub/replace
 * raid bio are correct and will not be changed during the scrub/replace;
 * that is, those pages hold only metadata or file data with checksums.
 */
struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
				struct btrfs_io_context *bioc,
				struct btrfs_device *scrub_dev,
				unsigned long *dbitmap, int stripe_nsectors)
{
	struct btrfs_fs_info *fs_info = bioc->fs_info;
	struct btrfs_raid_bio *rbio;
	int i;

	rbio = alloc_rbio(fs_info, bioc);
	if (IS_ERR(rbio))
		return NULL;
	bio_list_add(&rbio->bio_list, bio);
	/*
	 * This is a special bio which is used to hold the completion handler
	 * and make the scrub rbio similar to the other types.
	 */
	ASSERT(!bio->bi_iter.bi_size);
	rbio->operation = BTRFS_RBIO_PARITY_SCRUB;

	/*
	 * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted
	 * to the end position, so this search can start from the first parity
	 * stripe.
	 */
	for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
		if (bioc->stripes[i].dev == scrub_dev) {
			rbio->scrubp = i;
			break;
		}
	}
	ASSERT(i < rbio->real_stripes);

	bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors);

	/*
	 * We have already increased bio_counter when getting bioc, record it
	 * so we can free it at rbio_orig_end_io().
	 */
	rbio->generic_bio_cnt = 1;

	return rbio;
}
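/*
 * Illustrative layout note for the parity search above (this restates the
 * existing mapping, no new behaviour): after BTRFS_MAP_WRITE the stripes
 * array is ordered data0 .. data(nr_data - 1), then P, then Q for RAID6.
 * With nr_data == 2 and real_stripes == 4, scrubbing the Q device sets
 * rbio->scrubp = 3.
 */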
/* Used for both parity scrub and missing. */
void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
			    unsigned int pgoff, u64 logical)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	int stripe_offset;
	int index;

	ASSERT(logical >= rbio->bioc->raid_map[0]);
	ASSERT(logical + sectorsize <= rbio->bioc->raid_map[0] +
	       BTRFS_STRIPE_LEN * rbio->nr_data);
	stripe_offset = (int)(logical - rbio->bioc->raid_map[0]);
	index = stripe_offset / sectorsize;
	rbio->bio_sectors[index].page = page;
	rbio->bio_sectors[index].pgoff = pgoff;
}
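/*
 * A minimal sketch of the index arithmetic above; the helper name is
 * hypothetical and it is not used by the driver.  raid_map[0] is the
 * logical start of the full stripe, so with a 4K sectorsize a @logical
 * 64K into the stripe maps to bio_sectors[16].
 */
static inline int scrub_sector_index_example(u64 logical,
					     u64 full_stripe_start,
					     u32 sectorsize)
{
	/* Byte offset inside the full stripe, then convert to sectors. */
	int stripe_offset = (int)(logical - full_stripe_start);

	return stripe_offset / (int)sectorsize;
}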
/*
 * We only scrub the parity for vertical stripes where we hold correct data,
 * so we don't need to allocate pages for every stripe.
 */
static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	int total_sector_nr;

	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
	     total_sector_nr++) {
		struct page *page;
		int sectornr = total_sector_nr % rbio->stripe_nsectors;
		int index = (total_sector_nr * sectorsize) >> PAGE_SHIFT;

		if (!test_bit(sectornr, &rbio->dbitmap))
			continue;
		if (rbio->stripe_pages[index])
			continue;
		page = alloc_page(GFP_NOFS);
		if (!page)
			return -ENOMEM;
		rbio->stripe_pages[index] = page;
	}
	index_stripe_sectors(rbio);
	return 0;
}
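/*
 * Worked example for the page indexing above (values assumed for
 * illustration): with sectorsize == 4K and PAGE_SIZE == 4K, each sector
 * maps 1:1 to a page, so total_sector_nr 16 lands in stripe_pages[16].
 * With 64K pages, sixteen consecutive 4K sectors share stripe_pages[0].
 */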
static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
					 int need_check)
{
	struct btrfs_io_context *bioc = rbio->bioc;
	const u32 sectorsize = bioc->fs_info->sectorsize;
	void **pointers = rbio->finish_pointers;
	unsigned long *pbitmap = &rbio->finish_pbitmap;
	int nr_data = rbio->nr_data;
	int stripe;
	int sectornr;
	bool has_qstripe;
	struct sector_ptr p_sector = { 0 };
	struct sector_ptr q_sector = { 0 };
	struct bio_list bio_list;
	struct bio *bio;
	int is_replace = 0;
	int ret;

	bio_list_init(&bio_list);

	if (rbio->real_stripes - rbio->nr_data == 1)
		has_qstripe = false;
	else if (rbio->real_stripes - rbio->nr_data == 2)
		has_qstripe = true;
	else
		BUG();

	if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) {
		is_replace = 1;
		bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors);
	}

	/*
	 * The higher layers (scrub) are unlikely to use this area of the
	 * disk again soon, so don't cache it.
	 */
	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);

	if (!need_check)
		goto writeback;

	p_sector.page = alloc_page(GFP_NOFS);
	if (!p_sector.page)
		goto cleanup;
	p_sector.pgoff = 0;
	p_sector.uptodate = 1;

	if (has_qstripe) {
		/* RAID6, allocate and map temp space for the Q stripe */
		q_sector.page = alloc_page(GFP_NOFS);
		if (!q_sector.page) {
			__free_page(p_sector.page);
			p_sector.page = NULL;
			goto cleanup;
		}
		q_sector.pgoff = 0;
		q_sector.uptodate = 1;
		pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page);
	}

	atomic_set(&rbio->error, 0);

	/* Map the parity stripe just once */
	pointers[nr_data] = kmap_local_page(p_sector.page);

	for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
		struct sector_ptr *sector;
		void *parity;

		/* First collect one sector from each data stripe */
		for (stripe = 0; stripe < nr_data; stripe++) {
			sector = sector_in_rbio(rbio, stripe, sectornr, 0);
			pointers[stripe] = kmap_local_page(sector->page) +
					   sector->pgoff;
		}

		if (has_qstripe) {
			/* RAID6, call the library function to fill in our P/Q */
			raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
						pointers);
		} else {
			/* RAID5 */
			memcpy(pointers[nr_data], pointers[0], sectorsize);
			run_xor(pointers + 1, nr_data - 1, sectorsize);
		}

		/* Check the scrubbed parity and repair it */
		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
		parity = kmap_local_page(sector->page) + sector->pgoff;
		if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0)
			memcpy(parity, pointers[rbio->scrubp], sectorsize);
		else
			/* Parity is right, no need to write it back */
			bitmap_clear(&rbio->dbitmap, sectornr, 1);
		kunmap_local(parity);

		for (stripe = nr_data - 1; stripe >= 0; stripe--)
			kunmap_local(pointers[stripe]);
	}

	kunmap_local(pointers[nr_data]);
	__free_page(p_sector.page);
	p_sector.page = NULL;
	if (q_sector.page) {
		kunmap_local(pointers[rbio->real_stripes - 1]);
		__free_page(q_sector.page);
		q_sector.page = NULL;
	}

writeback:
	/*
	 * Time to start writing.  Make bios for everything from the
	 * higher layers (the bio_list in our rbio) and our P/Q.  Ignore
	 * everything else.
	 */
	for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
		struct sector_ptr *sector;

		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
		ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp,
					 sectornr, REQ_OP_WRITE);
		if (ret)
			goto cleanup;
	}

	if (!is_replace)
		goto submit_write;

	for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) {
		struct sector_ptr *sector;

		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
		ret = rbio_add_io_sector(rbio, &bio_list, sector,
					 bioc->tgtdev_map[rbio->scrubp],
					 sectornr, REQ_OP_WRITE);
		if (ret)
			goto cleanup;
	}

submit_write:
	nr_data = bio_list_size(&bio_list);
	if (!nr_data) {
		/* Every parity is right */
		rbio_orig_end_io(rbio, BLK_STS_OK);
		return;
	}

	atomic_set(&rbio->stripes_pending, nr_data);

	while ((bio = bio_list_pop(&bio_list))) {
		bio->bi_end_io = raid_write_end_io;

		if (trace_raid56_scrub_write_stripe_enabled()) {
			struct raid56_bio_trace_info trace_info = { 0 };

			bio_get_trace_info(rbio, bio, &trace_info);
			trace_raid56_scrub_write_stripe(rbio, bio, &trace_info);
		}
		submit_bio(bio);
	}
	return;

cleanup:
	rbio_orig_end_io(rbio, BLK_STS_IOERR);

	while ((bio = bio_list_pop(&bio_list)))
		bio_put(bio);
}
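/*
 * A minimal sketch (not used by the driver) of what the RAID5 branch in
 * finish_parity_scrub() computes: P is the byte-wise XOR of all data
 * stripes.  The code above seeds P from the first data buffer (memcpy)
 * and XORs in the remaining ones (run_xor); this hypothetical helper
 * open-codes the same result.
 */
static void __maybe_unused raid5_gen_parity_example(void **data, int nr_data,
						    void *parity, u32 size)
{
	u8 *p = parity;
	int d;
	u32 i;

	/* Seed the parity buffer with the first data stripe. */
	memcpy(p, data[0], size);
	/* XOR in every remaining data stripe, byte by byte. */
	for (d = 1; d < nr_data; d++) {
		const u8 *src = data[d];

		for (i = 0; i < size; i++)
			p[i] ^= src[i];
	}
}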
static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
{
	if (stripe >= 0 && stripe < rbio->nr_data)
		return 1;
	return 0;
}
/*
 * While we're doing the parity check and repair, we could have errors
 * in reading pages off the disk.  This checks for errors and if we're
 * not able to read a page it'll trigger parity reconstruction.  The
 * parity scrub will be finished after we've reconstructed the failed
 * stripes.
 */
static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
{
	if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
		goto cleanup;

	if (rbio->faila >= 0 || rbio->failb >= 0) {
		int dfail = 0, failp = -1;

		if (is_data_stripe(rbio, rbio->faila))
			dfail++;
		else if (is_parity_stripe(rbio->faila))
			failp = rbio->faila;

		if (is_data_stripe(rbio, rbio->failb))
			dfail++;
		else if (is_parity_stripe(rbio->failb))
			failp = rbio->failb;

		/*
		 * We cannot use the parity stripe that is being scrubbed to
		 * repair data, so our repair capability is reduced by one
		 * (for RAID5 that means we cannot repair anything).
		 */
		if (dfail > rbio->bioc->max_errors - 1)
			goto cleanup;

		/*
		 * If all data is good and only the parity is wrong, just
		 * repair the parity.
		 */
		if (dfail == 0) {
			finish_parity_scrub(rbio, 0);
			return;
		}

		/*
		 * Here we have one corrupted data stripe and one corrupted
		 * parity on RAID6.  If the corrupted parity is the one being
		 * scrubbed, we can use the other parity to repair the data;
		 * otherwise the data stripe cannot be repaired.
		 */
		if (failp != rbio->scrubp)
			goto cleanup;

		__raid_recover_end_io(rbio);
	} else {
		finish_parity_scrub(rbio, 1);
	}
	return;

cleanup:
	rbio_orig_end_io(rbio, BLK_STS_IOERR);
}
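/*
 * Worked example of the decisions above, assuming a RAID6 full stripe
 * (max_errors == 2) whose P stripe is being scrubbed (scrubp == P):
 *
 *   only P failed:           dfail == 0, recompute parity via
 *                            finish_parity_scrub()
 *   one data stripe alone:   failp stays -1 != scrubp, give up (IO error)
 *   one data stripe and P:   failp == scrubp, rebuild the data from the
 *                            remaining data plus Q via __raid_recover_end_io()
 *   one data stripe and Q:   failp != scrubp, give up (IO error)
 *   two data stripes:        dfail == 2 > max_errors - 1, give up
 *
 * On RAID5 (max_errors == 1) any failed data stripe already exceeds
 * max_errors - 1, so only pure parity damage is repairable here.
 */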
/*
 * End io for the read phase of the scrub.  All the bios here are physical
 * stripe bios we've read from the disk so we can recalculate the parity of
 * the stripe.
 *
 * This will usually kick off finish_parity_scrub() once all the bios are
 * read in, but it may trigger parity reconstruction if we had any errors
 * along the way.
 */
static void raid56_parity_scrub_end_io_work(struct work_struct *work)
{
	struct btrfs_raid_bio *rbio =
		container_of(work, struct btrfs_raid_bio, end_io_work);

	/*
	 * This will normally call finish_parity_scrub() to start our write,
	 * but if there are any failed stripes we'll reconstruct from parity
	 * first.
	 */
	validate_rbio_for_parity_scrub(rbio);
}
static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
{
	int bios_to_read = 0;
	struct bio_list bio_list;
	int ret;
	int total_sector_nr;
	struct bio *bio;

	bio_list_init(&bio_list);

	ret = alloc_rbio_essential_pages(rbio);
	if (ret)
		goto cleanup;

	atomic_set(&rbio->error, 0);
	/* Build a list of bios to read all the missing parts. */
	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
	     total_sector_nr++) {
		int sectornr = total_sector_nr % rbio->stripe_nsectors;
		int stripe = total_sector_nr / rbio->stripe_nsectors;
		struct sector_ptr *sector;

		/* No data in the vertical stripe, no need to read. */
		if (!test_bit(sectornr, &rbio->dbitmap))
			continue;

		/*
		 * We want to find all the sectors missing from the rbio and
		 * read them from the disk.  If sector_in_rbio() finds a
		 * sector in the bio list we don't need to read it off the
		 * stripe.
		 */
		sector = sector_in_rbio(rbio, stripe, sectornr, 1);
		if (sector)
			continue;

		sector = rbio_stripe_sector(rbio, stripe, sectornr);
		/*
		 * The bio cache may have handed us an uptodate sector.  If
		 * so, use it.
		 */
		if (sector->uptodate)
			continue;

		ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
					 sectornr, REQ_OP_READ);
		if (ret)
			goto cleanup;
	}

	bios_to_read = bio_list_size(&bio_list);
	if (!bios_to_read) {
		/*
		 * This can happen if others have merged with us; it means
		 * there is nothing left to read.  But if there are missing
		 * devices it may not be safe to do the full stripe write
		 * yet.
		 */
		goto finish;
	}

	/*
	 * The bioc may be freed once we submit the last bio. Make sure not to
	 * touch it after that.
	 */
	atomic_set(&rbio->stripes_pending, bios_to_read);
	INIT_WORK(&rbio->end_io_work, raid56_parity_scrub_end_io_work);
	while ((bio = bio_list_pop(&bio_list))) {
		bio->bi_end_io = raid56_bio_end_io;

		if (trace_raid56_scrub_read_enabled()) {
			struct raid56_bio_trace_info trace_info = { 0 };

			bio_get_trace_info(rbio, bio, &trace_info);
			trace_raid56_scrub_read(rbio, bio, &trace_info);
		}
		submit_bio(bio);
	}
	/* The actual write will happen once the reads are done */
	return;

cleanup:
	rbio_orig_end_io(rbio, BLK_STS_IOERR);

	while ((bio = bio_list_pop(&bio_list)))
		bio_put(bio);

	return;

finish:
	validate_rbio_for_parity_scrub(rbio);
}
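/*
 * Illustrative summary of the read-collection rules above (no behaviour
 * change): a sector is read from disk only if all of the following hold:
 *   1) its bit is set in dbitmap, i.e. the vertical stripe is being
 *      scrubbed;
 *   2) it is not already supplied by a bio on our bio_list; and
 *   3) the stripe cache has not already marked it uptodate.
 */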
static void scrub_parity_work(struct work_struct *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	raid56_parity_scrub_stripe(rbio);
}

void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
{
	/*
	 * If lock_stripe_add() returns 0 we own the stripe lock and can
	 * start scrubbing right away; otherwise the rbio was queued behind
	 * the current lock holder and will be run when the lock is released.
	 */
	if (!lock_stripe_add(rbio))
		start_async_work(rbio, scrub_parity_work);
}
/* The following code is used for dev replace of a missing RAID 5/6 device. */

struct btrfs_raid_bio *
raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc)
{
	struct btrfs_fs_info *fs_info = bioc->fs_info;
	struct btrfs_raid_bio *rbio;

	rbio = alloc_rbio(fs_info, bioc);
	if (IS_ERR(rbio))
		return NULL;

	rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
	bio_list_add(&rbio->bio_list, bio);
	/*
	 * This is a special bio which is used to hold the completion handler
	 * and make the scrub rbio similar to the other types.
	 */
	ASSERT(!bio->bi_iter.bi_size);

	rbio->faila = find_logical_bio_stripe(rbio, bio);
	if (rbio->faila == -1) {
		BUG();
		kfree(rbio);
		return NULL;
	}

	/*
	 * When we get bioc, we have already increased bio_counter, record it
	 * so we can free it at rbio_orig_end_io().
	 */
	rbio->generic_bio_cnt = 1;

	return rbio;
}

void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
{
	if (!lock_stripe_add(rbio))
		start_async_work(rbio, read_rebuild_work);
}