// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2012 Fusion-io  All rights reserved.
 * Copyright (C) 2012 Intel Corp. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/raid/pq.h>
#include <linux/hash.h>
#include <linux/list_sort.h>
#include <linux/raid/xor.h>
#include <linux/mm.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
/* set when additional merges to this rbio are not allowed */
#define RBIO_RMW_LOCKED_BIT	1

/*
 * set when this rbio is sitting in the hash, but it is just a cache
 * of past RMW
 */
#define RBIO_CACHE_BIT		2

/*
 * set when it is safe to trust the stripe_pages for caching
 */
#define RBIO_CACHE_READY_BIT	3

#define RBIO_CACHE_SIZE 1024

#define BTRFS_STRIPE_HASH_TABLE_BITS	11
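/*
 * With BTRFS_STRIPE_HASH_TABLE_BITS == 11 the hash table below has
 * 1 << 11 == 2048 buckets; RBIO_CACHE_SIZE caps the number of rbios
 * kept on the stripe cache LRU at 1024.
 */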
/* Used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash {
	struct list_head hash_list;
	spinlock_t lock;
};

/* Used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash_table {
	struct list_head stripe_cache;
	spinlock_t cache_lock;
	int cache_size;
	struct btrfs_stripe_hash table[];
};
enum btrfs_rbio_ops {
	BTRFS_RBIO_WRITE,
	BTRFS_RBIO_READ_REBUILD,
	BTRFS_RBIO_PARITY_SCRUB,
	BTRFS_RBIO_REBUILD_MISSING,
};
struct btrfs_raid_bio {
	struct btrfs_io_context *bioc;

	/*
	 * while we're doing rmw on a stripe we put it into a hash table so
	 * we can lock the stripe and merge more rbios into it.
	 */
	struct list_head hash_list;

	/* LRU list for the stripe cache */
	struct list_head stripe_cache;

	/* for scheduling work in the helper threads */
	struct btrfs_work work;

	/*
	 * bio list and bio_list_lock are used to add more bios into the
	 * stripe in hopes of avoiding the full rmw
	 */
	struct bio_list bio_list;
	spinlock_t bio_list_lock;

	/*
	 * also protected by the bio_list_lock, the plug list is used by the
	 * plugging code to collect partial bios while plugged.  The stripe
	 * locking code also uses it to hand off the stripe lock to the next
	 * pending IO
	 */
	struct list_head plug_list;

	/* flags that tell us if it is safe to merge with this bio */
	unsigned long flags;

	/* size of each individual stripe on disk */
	int stripe_len;

	/* number of data stripes (no p/q) */
	int nr_data;

	/* number of stripes in the chunk, not counting replace targets */
	int real_stripes;

	/* number of pages per stripe */
	int stripe_npages;

	/*
	 * set if we're doing a parity rebuild for a read from higher up,
	 * which is handled differently from a parity rebuild as part of rmw
	 */
	enum btrfs_rbio_ops operation;

	/* first bad stripe */
	int faila;

	/* second bad stripe (for raid6 use) */
	int failb;

	/* stripe number that we're scrubbing */
	int scrubp;

	/* number of pages needed to represent the full stripe */
	int nr_pages;

	/*
	 * size of all the bios in the bio_list.  This helps us decide if the
	 * rbio maps to a full stripe or not
	 */
	int bio_list_bytes;

	int generic_bio_cnt;

	refcount_t refs;

	atomic_t stripes_pending;

	atomic_t error;

	/*
	 * these are two arrays of pointers.  We allocate the rbio big enough
	 * to hold them both and setup their locations when the rbio is
	 * allocated
	 */

	/*
	 * pointers to pages that we allocated for reading/writing stripes
	 * directly from the disk (including P/Q)
	 */
	struct page **stripe_pages;

	/* pointers to the pages in the bio_list, stored here for faster lookup */
	struct page **bio_pages;

	/* bitmap to record which horizontal stripe has data */
	unsigned long *dbitmap;

	/* allocated with real_stripes-many pointers for finish_*() calls */
	void **finish_pointers;

	/* allocated with stripe_npages-many bits for finish_*() calls */
	unsigned long *finish_pbitmap;
};
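/*
 * Note: the pointer arrays at the end of struct btrfs_raid_bio
 * (stripe_pages, bio_pages, finish_pointers, dbitmap, finish_pbitmap)
 * are not separate allocations; alloc_rbio() carves them out of the
 * trailing memory of a single kzalloc() via CONSUME_ALLOC().
 */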
static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
static void rmw_work(struct btrfs_work *work);
static void read_rebuild_work(struct btrfs_work *work);
static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
static void __free_raid_bio(struct btrfs_raid_bio *rbio);
static void index_rbio_pages(struct btrfs_raid_bio *rbio);
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);

static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
					 int need_check);
static void scrub_parity_work(struct btrfs_work *work);
static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func)
{
	btrfs_init_work(&rbio->work, work_func, NULL, NULL);
	btrfs_queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
}
/*
 * the stripe hash table is used for locking, and to collect
 * bios in hopes of making a full stripe
 */
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash_table *x;
	struct btrfs_stripe_hash *cur;
	struct btrfs_stripe_hash *h;
	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
	int i;

	if (info->stripe_hash_table)
		return 0;

	/*
	 * The table is large, starting with order 4 and can go as high as
	 * order 7 in case lock debugging is turned on.
	 *
	 * Try harder to allocate and fallback to vmalloc to lower the chance
	 * of a failing mount.
	 */
	table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
	if (!table)
		return -ENOMEM;

	spin_lock_init(&table->cache_lock);
	INIT_LIST_HEAD(&table->stripe_cache);

	h = table->table;
	for (i = 0; i < num_entries; i++) {
		cur = h + i;
		INIT_LIST_HEAD(&cur->hash_list);
		spin_lock_init(&cur->lock);
	}

	x = cmpxchg(&info->stripe_hash_table, NULL, table);
	kvfree(x);
	return 0;
}
/*
 * caching an rbio means to copy anything from the bio_pages array into
 * the stripe_pages array.  We use the page uptodate bit in the stripe
 * cache array to indicate if it has valid data
 *
 * once the caching is done, we set the cache ready bit.
 */
static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	int ret;

	ret = alloc_rbio_pages(rbio);
	if (ret)
		return;

	for (i = 0; i < rbio->nr_pages; i++) {
		if (!rbio->bio_pages[i])
			continue;

		copy_highpage(rbio->stripe_pages[i], rbio->bio_pages[i]);
		SetPageUptodate(rbio->stripe_pages[i]);
	}
	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
}
/*
 * we hash on the first logical address of the stripe
 */
static int rbio_bucket(struct btrfs_raid_bio *rbio)
{
	u64 num = rbio->bioc->raid_map[0];

	/*
	 * we shift down quite a bit.  We're using byte addressing, and
	 * most of the lower bits are zeros.  This tends to upset hash_64,
	 * and it consistently returns just one or two different values.
	 *
	 * shifting off the lower bits fixes things.
	 */
	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
}
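/*
 * Worked example (illustrative address only): a full stripe whose first
 * logical byte is 0x40010000 hashes as
 * hash_64(0x40010000 >> 16, BTRFS_STRIPE_HASH_TABLE_BITS), i.e.
 * hash_64(0x4001, 11), so every rbio covering that stripe lands in the
 * same bucket while the low 16 bits never influence the choice.
 */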
/*
 * stealing an rbio means taking all the uptodate pages from the stripe
 * array in the source rbio and putting them into the destination rbio
 */
static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
{
	int i;
	struct page *s;
	struct page *d;

	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
		return;

	for (i = 0; i < dest->nr_pages; i++) {
		s = src->stripe_pages[i];
		if (!s || !PageUptodate(s)) {
			continue;
		}

		d = dest->stripe_pages[i];
		if (d)
			__free_page(d);

		dest->stripe_pages[i] = s;
		src->stripe_pages[i] = NULL;
	}
}
/*
 * merging means we take the bio_list from the victim and
 * splice it into the destination.  The victim should
 * be discarded afterwards.
 *
 * must be called with dest->rbio_list_lock held
 */
static void merge_rbio(struct btrfs_raid_bio *dest,
		       struct btrfs_raid_bio *victim)
{
	bio_list_merge(&dest->bio_list, &victim->bio_list);
	dest->bio_list_bytes += victim->bio_list_bytes;
	dest->generic_bio_cnt += victim->generic_bio_cnt;
	bio_list_init(&victim->bio_list);
}
331 * used to prune items that are in the cache. The caller
332 * must hold the hash table lock.
334 static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
336 int bucket = rbio_bucket(rbio);
337 struct btrfs_stripe_hash_table *table;
338 struct btrfs_stripe_hash *h;
342 * check the bit again under the hash table lock.
344 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
347 table = rbio->bioc->fs_info->stripe_hash_table;
348 h = table->table + bucket;
350 /* hold the lock for the bucket because we may be
351 * removing it from the hash table
356 * hold the lock for the bio list because we need
357 * to make sure the bio list is empty
359 spin_lock(&rbio->bio_list_lock);
361 if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
362 list_del_init(&rbio->stripe_cache);
363 table->cache_size -= 1;
366 /* if the bio list isn't empty, this rbio is
367 * still involved in an IO. We take it out
368 * of the cache list, and drop the ref that
369 * was held for the list.
371 * If the bio_list was empty, we also remove
372 * the rbio from the hash_table, and drop
373 * the corresponding ref
375 if (bio_list_empty(&rbio->bio_list)) {
376 if (!list_empty(&rbio->hash_list)) {
377 list_del_init(&rbio->hash_list);
378 refcount_dec(&rbio->refs);
379 BUG_ON(!list_empty(&rbio->plug_list));
384 spin_unlock(&rbio->bio_list_lock);
385 spin_unlock(&h->lock);
388 __free_raid_bio(rbio);
392 * prune a given rbio from the cache
394 static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
396 struct btrfs_stripe_hash_table *table;
399 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
402 table = rbio->bioc->fs_info->stripe_hash_table;
404 spin_lock_irqsave(&table->cache_lock, flags);
405 __remove_rbio_from_cache(rbio);
406 spin_unlock_irqrestore(&table->cache_lock, flags);
410 * remove everything in the cache
412 static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
414 struct btrfs_stripe_hash_table *table;
416 struct btrfs_raid_bio *rbio;
418 table = info->stripe_hash_table;
420 spin_lock_irqsave(&table->cache_lock, flags);
421 while (!list_empty(&table->stripe_cache)) {
422 rbio = list_entry(table->stripe_cache.next,
423 struct btrfs_raid_bio,
425 __remove_rbio_from_cache(rbio);
427 spin_unlock_irqrestore(&table->cache_lock, flags);
431 * remove all cached entries and free the hash table
434 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
436 if (!info->stripe_hash_table)
438 btrfs_clear_rbio_cache(info);
439 kvfree(info->stripe_hash_table);
440 info->stripe_hash_table = NULL;
444 * insert an rbio into the stripe cache. It
445 * must have already been prepared by calling
448 * If this rbio was already cached, it gets
449 * moved to the front of the lru.
451 * If the size of the rbio cache is too big, we
454 static void cache_rbio(struct btrfs_raid_bio *rbio)
456 struct btrfs_stripe_hash_table *table;
459 if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
462 table = rbio->bioc->fs_info->stripe_hash_table;
464 spin_lock_irqsave(&table->cache_lock, flags);
465 spin_lock(&rbio->bio_list_lock);
467 /* bump our ref if we were not in the list before */
468 if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
469 refcount_inc(&rbio->refs);
471 if (!list_empty(&rbio->stripe_cache)){
472 list_move(&rbio->stripe_cache, &table->stripe_cache);
474 list_add(&rbio->stripe_cache, &table->stripe_cache);
475 table->cache_size += 1;
478 spin_unlock(&rbio->bio_list_lock);
480 if (table->cache_size > RBIO_CACHE_SIZE) {
481 struct btrfs_raid_bio *found;
483 found = list_entry(table->stripe_cache.prev,
484 struct btrfs_raid_bio,
488 __remove_rbio_from_cache(found);
491 spin_unlock_irqrestore(&table->cache_lock, flags);
/*
 * helper function to run the xor_blocks api.  It is only able to do
 * MAX_XOR_BLOCKS at a time, so we need to loop through.
 */
static void run_xor(void **pages, int src_cnt, ssize_t len)
{
	int src_off = 0;
	int xor_src_cnt = 0;
	void *dest = pages[src_cnt];

	while (src_cnt > 0) {
		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
		xor_blocks(xor_src_cnt, len, dest, pages + src_off);

		src_cnt -= xor_src_cnt;
		src_off += xor_src_cnt;
	}
}
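/*
 * Example (illustrative numbers; MAX_XOR_BLOCKS comes from
 * <linux/raid/xor.h>): if MAX_XOR_BLOCKS were 4 and src_cnt were 6,
 * run_xor() would call xor_blocks() twice, first for sources 0-3 and
 * then for sources 4-5, accumulating both times into the same dest.
 */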
/*
 * Returns true if the bio list inside this rbio covers an entire stripe
 * (no rmw required).
 */
static int rbio_is_full(struct btrfs_raid_bio *rbio)
{
	unsigned long flags;
	unsigned long size = rbio->bio_list_bytes;
	int ret = 1;

	spin_lock_irqsave(&rbio->bio_list_lock, flags);
	if (size != rbio->nr_data * rbio->stripe_len)
		ret = 0;
	BUG_ON(size > rbio->nr_data * rbio->stripe_len);
	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);

	return ret;
}
534 * returns 1 if it is safe to merge two rbios together.
535 * The merging is safe if the two rbios correspond to
536 * the same stripe and if they are both going in the same
537 * direction (read vs write), and if neither one is
538 * locked for final IO
540 * The caller is responsible for locking such that
541 * rmw_locked is safe to test
543 static int rbio_can_merge(struct btrfs_raid_bio *last,
544 struct btrfs_raid_bio *cur)
546 if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
547 test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
551 * we can't merge with cached rbios, since the
552 * idea is that when we merge the destination
553 * rbio is going to run our IO for us. We can
554 * steal from cached rbios though, other functions
557 if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
558 test_bit(RBIO_CACHE_BIT, &cur->flags))
561 if (last->bioc->raid_map[0] != cur->bioc->raid_map[0])
564 /* we can't merge with different operations */
565 if (last->operation != cur->operation)
	/*
	 * We need to read the full stripe from the drive, then check
	 * and repair the parity and write the new results.
	 *
	 * We're not allowed to add any new bios to the
	 * bio list here, anyone else that wants to
	 * change this stripe needs to do their own rmw.
	 */
575 if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
578 if (last->operation == BTRFS_RBIO_REBUILD_MISSING)
581 if (last->operation == BTRFS_RBIO_READ_REBUILD) {
582 int fa = last->faila;
583 int fb = last->failb;
584 int cur_fa = cur->faila;
585 int cur_fb = cur->failb;
587 if (last->faila >= last->failb) {
592 if (cur->faila >= cur->failb) {
597 if (fa != cur_fa || fb != cur_fb)
static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe,
				  int index)
{
	return stripe * rbio->stripe_npages + index;
}

/*
 * these are just the pages from the rbio array, not from anything
 * the FS sent down to us
 */
static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe,
				     int index)
{
	return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)];
}

/* helper to index into the pstripe */
static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
{
	return rbio_stripe_page(rbio, rbio->nr_data, index);
}

/* helper to index into the qstripe, returns null if there is no qstripe */
static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
{
	if (rbio->nr_data + 1 == rbio->real_stripes)
		return NULL;
	return rbio_stripe_page(rbio, rbio->nr_data + 1, index);
}
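/*
 * Indexing example (illustrative values only): with stripe_npages == 16,
 * rbio_stripe_page(rbio, 2, 3) returns stripe_pages[2 * 16 + 3], i.e.
 * stripe_pages[35]; rbio_pstripe_page() is simply the same lookup with
 * stripe == nr_data.
 */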
/*
 * The first stripe in the table for a logical address
 * has the lock.  rbios are added in one of three ways:
 *
 * 1) Nobody has the stripe locked yet.  The rbio is given
 * the lock and 0 is returned.  The caller must start the IO
 * themselves.
 *
 * 2) Someone has the stripe locked, but we're able to merge
 * with the lock owner.  The rbio is freed and the IO will
 * start automatically along with the existing rbio.  1 is returned.
 *
 * 3) Someone has the stripe locked, but we're not able to merge.
 * The rbio is added to the lock owner's plug list, or merged into
 * an rbio already on the plug list.  When the lock owner unlocks,
 * the next rbio on the list is run and the IO is started automatically.
 * 1 is returned.
 *
 * If we return 0, the caller still owns the rbio and must continue with
 * IO submission.  If we return 1, the caller must assume the rbio has
 * already been freed.
 */
660 static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
662 struct btrfs_stripe_hash *h;
663 struct btrfs_raid_bio *cur;
664 struct btrfs_raid_bio *pending;
666 struct btrfs_raid_bio *freeit = NULL;
667 struct btrfs_raid_bio *cache_drop = NULL;
670 h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
672 spin_lock_irqsave(&h->lock, flags);
673 list_for_each_entry(cur, &h->hash_list, hash_list) {
674 if (cur->bioc->raid_map[0] != rbio->bioc->raid_map[0])
677 spin_lock(&cur->bio_list_lock);
679 /* Can we steal this cached rbio's pages? */
680 if (bio_list_empty(&cur->bio_list) &&
681 list_empty(&cur->plug_list) &&
682 test_bit(RBIO_CACHE_BIT, &cur->flags) &&
683 !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
684 list_del_init(&cur->hash_list);
685 refcount_dec(&cur->refs);
687 steal_rbio(cur, rbio);
689 spin_unlock(&cur->bio_list_lock);
694 /* Can we merge into the lock owner? */
695 if (rbio_can_merge(cur, rbio)) {
696 merge_rbio(cur, rbio);
697 spin_unlock(&cur->bio_list_lock);
705 * We couldn't merge with the running rbio, see if we can merge
706 * with the pending ones. We don't have to check for rmw_locked
707 * because there is no way they are inside finish_rmw right now
709 list_for_each_entry(pending, &cur->plug_list, plug_list) {
710 if (rbio_can_merge(pending, rbio)) {
711 merge_rbio(pending, rbio);
712 spin_unlock(&cur->bio_list_lock);
720 * No merging, put us on the tail of the plug list, our rbio
721 * will be started with the currently running rbio unlocks
723 list_add_tail(&rbio->plug_list, &cur->plug_list);
724 spin_unlock(&cur->bio_list_lock);
729 refcount_inc(&rbio->refs);
730 list_add(&rbio->hash_list, &h->hash_list);
732 spin_unlock_irqrestore(&h->lock, flags);
734 remove_rbio_from_cache(cache_drop);
736 __free_raid_bio(freeit);
741 * called as rmw or parity rebuild is completed. If the plug list has more
742 * rbios waiting for this stripe, the next one on the list will be started
744 static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
747 struct btrfs_stripe_hash *h;
751 bucket = rbio_bucket(rbio);
752 h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;
754 if (list_empty(&rbio->plug_list))
757 spin_lock_irqsave(&h->lock, flags);
758 spin_lock(&rbio->bio_list_lock);
760 if (!list_empty(&rbio->hash_list)) {
762 * if we're still cached and there is no other IO
763 * to perform, just leave this rbio here for others
764 * to steal from later
766 if (list_empty(&rbio->plug_list) &&
767 test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
769 clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
770 BUG_ON(!bio_list_empty(&rbio->bio_list));
774 list_del_init(&rbio->hash_list);
775 refcount_dec(&rbio->refs);
778 * we use the plug list to hold all the rbios
779 * waiting for the chance to lock this stripe.
780 * hand the lock over to one of them.
782 if (!list_empty(&rbio->plug_list)) {
783 struct btrfs_raid_bio *next;
784 struct list_head *head = rbio->plug_list.next;
786 next = list_entry(head, struct btrfs_raid_bio,
789 list_del_init(&rbio->plug_list);
791 list_add(&next->hash_list, &h->hash_list);
792 refcount_inc(&next->refs);
793 spin_unlock(&rbio->bio_list_lock);
794 spin_unlock_irqrestore(&h->lock, flags);
796 if (next->operation == BTRFS_RBIO_READ_REBUILD)
797 start_async_work(next, read_rebuild_work);
798 else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
799 steal_rbio(rbio, next);
800 start_async_work(next, read_rebuild_work);
801 } else if (next->operation == BTRFS_RBIO_WRITE) {
802 steal_rbio(rbio, next);
803 start_async_work(next, rmw_work);
804 } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
805 steal_rbio(rbio, next);
806 start_async_work(next, scrub_parity_work);
813 spin_unlock(&rbio->bio_list_lock);
814 spin_unlock_irqrestore(&h->lock, flags);
818 remove_rbio_from_cache(rbio);
821 static void __free_raid_bio(struct btrfs_raid_bio *rbio)
825 if (!refcount_dec_and_test(&rbio->refs))
828 WARN_ON(!list_empty(&rbio->stripe_cache));
829 WARN_ON(!list_empty(&rbio->hash_list));
830 WARN_ON(!bio_list_empty(&rbio->bio_list));
832 for (i = 0; i < rbio->nr_pages; i++) {
833 if (rbio->stripe_pages[i]) {
834 __free_page(rbio->stripe_pages[i]);
835 rbio->stripe_pages[i] = NULL;
839 btrfs_put_bioc(rbio->bioc);
843 static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
850 cur->bi_status = err;
857 * this frees the rbio and runs through all the bios in the
858 * bio_list and calls end_io on them
860 static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
862 struct bio *cur = bio_list_get(&rbio->bio_list);
865 if (rbio->generic_bio_cnt)
866 btrfs_bio_counter_sub(rbio->bioc->fs_info, rbio->generic_bio_cnt);
869 * At this moment, rbio->bio_list is empty, however since rbio does not
870 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
871 * hash list, rbio may be merged with others so that rbio->bio_list
873 * Once unlock_stripe() is done, rbio->bio_list will not be updated any
874 * more and we can call bio_endio() on all queued bios.
877 extra = bio_list_get(&rbio->bio_list);
878 __free_raid_bio(rbio);
880 rbio_endio_bio_list(cur, err);
882 rbio_endio_bio_list(extra, err);
886 * end io function used by finish_rmw. When we finally
887 * get here, we've written a full stripe
889 static void raid_write_end_io(struct bio *bio)
891 struct btrfs_raid_bio *rbio = bio->bi_private;
892 blk_status_t err = bio->bi_status;
896 fail_bio_stripe(rbio, bio);
900 if (!atomic_dec_and_test(&rbio->stripes_pending))
905 /* OK, we have read all the stripes we need to. */
906 max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
907 0 : rbio->bioc->max_errors;
908 if (atomic_read(&rbio->error) > max_errors)
911 rbio_orig_end_io(rbio, err);
915 * the read/modify/write code wants to use the original bio for
916 * any pages it included, and then use the rbio for everything
917 * else. This function decides if a given index (stripe number)
918 * and page number in that stripe fall inside the original bio
921 * if you set bio_list_only, you'll get a NULL back for any ranges
922 * that are outside the bio_list
924 * This doesn't take any refs on anything, you get a bare page pointer
925 * and the caller must bump refs as required.
927 * You must call index_rbio_pages once before you can trust
928 * the answers from this function.
930 static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
931 int index, int pagenr, int bio_list_only)
934 struct page *p = NULL;
936 chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
938 spin_lock_irq(&rbio->bio_list_lock);
939 p = rbio->bio_pages[chunk_page];
940 spin_unlock_irq(&rbio->bio_list_lock);
942 if (p || bio_list_only)
945 return rbio->stripe_pages[chunk_page];
/* number of pages we need for the entire stripe across all the drives */
static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
{
	return DIV_ROUND_UP(stripe_len, PAGE_SIZE) * nr_stripes;
}
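/*
 * Example (illustrative only): with a 64K stripe_len, 4K pages and a
 * 3-device RAID5 chunk (nr_stripes == 3) this is
 * DIV_ROUND_UP(65536, 4096) * 3 == 16 * 3 == 48 pages.
 */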
/*
 * allocation and initial setup for the btrfs_raid_bio.  Note that this
 * does not allocate any pages for rbio->pages.
 */
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
					 struct btrfs_io_context *bioc,
					 u64 stripe_len)
{
	struct btrfs_raid_bio *rbio;
	int nr_data = 0;
	int real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
	int num_pages = rbio_nr_pages(stripe_len, real_stripes);
	int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
	void *p;

	rbio = kzalloc(sizeof(*rbio) +
		       sizeof(*rbio->stripe_pages) * num_pages +
		       sizeof(*rbio->bio_pages) * num_pages +
		       sizeof(*rbio->finish_pointers) * real_stripes +
		       sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_npages) +
		       sizeof(*rbio->finish_pbitmap) *
				BITS_TO_LONGS(stripe_npages),
		       GFP_NOFS);
	if (!rbio)
		return ERR_PTR(-ENOMEM);
	bio_list_init(&rbio->bio_list);
	INIT_LIST_HEAD(&rbio->plug_list);
	spin_lock_init(&rbio->bio_list_lock);
	INIT_LIST_HEAD(&rbio->stripe_cache);
	INIT_LIST_HEAD(&rbio->hash_list);
	rbio->bioc = bioc;
	rbio->stripe_len = stripe_len;
	rbio->nr_pages = num_pages;
	rbio->real_stripes = real_stripes;
	rbio->stripe_npages = stripe_npages;
	rbio->faila = -1;
	rbio->failb = -1;
	refcount_set(&rbio->refs, 1);
	atomic_set(&rbio->error, 0);
	atomic_set(&rbio->stripes_pending, 0);

	/*
	 * the stripe_pages, bio_pages, etc arrays point to the extra
	 * memory we allocated past the end of the rbio
	 */
	p = rbio + 1;
#define CONSUME_ALLOC(ptr, count)	do {				\
		ptr = p;						\
		p = (unsigned char *)p + sizeof(*(ptr)) * (count);	\
	} while (0)
	CONSUME_ALLOC(rbio->stripe_pages, num_pages);
	CONSUME_ALLOC(rbio->bio_pages, num_pages);
	CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
	CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_npages));
	CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages));
#undef CONSUME_ALLOC
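	/*
	 * Resulting layout of the single allocation (sketch):
	 *
	 *   [struct btrfs_raid_bio][stripe_pages][bio_pages]
	 *   [finish_pointers][dbitmap][finish_pbitmap]
	 *
	 * so freeing the rbio releases all of the arrays as well.
	 */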
	if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
		nr_data = real_stripes - 1;
	else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
		nr_data = real_stripes - 2;
	else
		BUG();

	rbio->nr_data = nr_data;
	return rbio;
}
1026 /* allocate pages for all the stripes in the bio, including parity */
1027 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
1032 for (i = 0; i < rbio->nr_pages; i++) {
1033 if (rbio->stripe_pages[i])
1035 page = alloc_page(GFP_NOFS);
1038 rbio->stripe_pages[i] = page;
1043 /* only allocate pages for p/q stripes */
1044 static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
1049 i = rbio_stripe_page_index(rbio, rbio->nr_data, 0);
1051 for (; i < rbio->nr_pages; i++) {
1052 if (rbio->stripe_pages[i])
1054 page = alloc_page(GFP_NOFS);
1057 rbio->stripe_pages[i] = page;
1063 * add a single page from a specific stripe into our list of bios for IO
1064 * this will try to merge into existing bios if possible, and returns
1065 * zero if all went well.
1067 static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
1068 struct bio_list *bio_list,
1071 unsigned long page_index,
1072 unsigned long bio_max_len)
1074 struct bio *last = bio_list->tail;
1077 struct btrfs_io_stripe *stripe;
1080 stripe = &rbio->bioc->stripes[stripe_nr];
1081 disk_start = stripe->physical + (page_index << PAGE_SHIFT);
1083 /* if the device is missing, just fail this stripe */
1084 if (!stripe->dev->bdev)
1085 return fail_rbio_index(rbio, stripe_nr);
1087 /* see if we can add this page onto our existing bio */
1089 u64 last_end = last->bi_iter.bi_sector << 9;
1090 last_end += last->bi_iter.bi_size;
1093 * we can't merge these if they are from different
1094 * devices or if they are not contiguous
1096 if (last_end == disk_start && !last->bi_status &&
1097 last->bi_bdev == stripe->dev->bdev) {
1098 ret = bio_add_page(last, page, PAGE_SIZE, 0);
1099 if (ret == PAGE_SIZE)
1104 /* put a new bio on the list */
1105 bio = btrfs_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
1106 btrfs_bio(bio)->device = stripe->dev;
1107 bio->bi_iter.bi_size = 0;
1108 bio_set_dev(bio, stripe->dev->bdev);
1109 bio->bi_iter.bi_sector = disk_start >> 9;
1111 bio_add_page(bio, page, PAGE_SIZE, 0);
1112 bio_list_add(bio_list, bio);
/*
 * while we're doing the read/modify/write cycle, we could
 * have errors in reading pages off the disk.  This checks
 * for errors and if we're not able to read the page it'll
 * trigger parity reconstruction.  The rmw will be finished
 * after we've reconstructed the failed stripes
 */
static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
{
	if (rbio->faila >= 0 || rbio->failb >= 0) {
		BUG_ON(rbio->faila == rbio->real_stripes - 1);
		__raid56_parity_recover(rbio);
	} else {
		finish_rmw(rbio);
	}
}
1134 * helper function to walk our bio list and populate the bio_pages array with
1135 * the result. This seems expensive, but it is faster than constantly
1136 * searching through the bio list as we setup the IO in finish_rmw or stripe
1139 * This must be called before you trust the answers from page_in_rbio
1141 static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1145 unsigned long stripe_offset;
1146 unsigned long page_index;
1148 spin_lock_irq(&rbio->bio_list_lock);
1149 bio_list_for_each(bio, &rbio->bio_list) {
1150 struct bio_vec bvec;
1151 struct bvec_iter iter;
1154 start = bio->bi_iter.bi_sector << 9;
1155 stripe_offset = start - rbio->bioc->raid_map[0];
1156 page_index = stripe_offset >> PAGE_SHIFT;
1158 if (bio_flagged(bio, BIO_CLONED))
1159 bio->bi_iter = btrfs_bio(bio)->iter;
1161 bio_for_each_segment(bvec, bio, iter) {
1162 rbio->bio_pages[page_index + i] = bvec.bv_page;
1166 spin_unlock_irq(&rbio->bio_list_lock);
1170 * this is called from one of two situations. We either
1171 * have a full stripe from the higher layers, or we've read all
1172 * the missing bits off disk.
1174 * This will calculate the parity and then send down any
1177 static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1179 struct btrfs_io_context *bioc = rbio->bioc;
1180 void **pointers = rbio->finish_pointers;
1181 int nr_data = rbio->nr_data;
1185 struct bio_list bio_list;
1189 bio_list_init(&bio_list);
1191 if (rbio->real_stripes - rbio->nr_data == 1)
1192 has_qstripe = false;
1193 else if (rbio->real_stripes - rbio->nr_data == 2)
1198 /* at this point we either have a full stripe,
1199 * or we've read the full stripe from the drive.
1200 * recalculate the parity and write the new results.
1202 * We're not allowed to add any new bios to the
1203 * bio list here, anyone else that wants to
1204 * change this stripe needs to do their own rmw.
1206 spin_lock_irq(&rbio->bio_list_lock);
1207 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1208 spin_unlock_irq(&rbio->bio_list_lock);
1210 atomic_set(&rbio->error, 0);
	/*
	 * now that we've set rmw_locked, run through the
	 * bio list one last time and map the page pointers
	 *
	 * We don't cache full rbios because we're assuming
	 * the higher layers are unlikely to use this area of
	 * the disk again soon.  If they do use it again,
	 * hopefully they will send another full bio.
	 */
	index_rbio_pages(rbio);
	if (!rbio_is_full(rbio))
		cache_rbio_pages(rbio);
	else
		clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
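	/*
	 * For each page of the stripe, the loop below maps one page from
	 * every data stripe plus the P page (and the Q page on raid6),
	 * generates the parity for that horizontal slice and unmaps the
	 * pages again.
	 */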
1227 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
1229 /* first collect one page from each data stripe */
1230 for (stripe = 0; stripe < nr_data; stripe++) {
1231 p = page_in_rbio(rbio, stripe, pagenr, 0);
1232 pointers[stripe] = kmap_local_page(p);
1235 /* then add the parity stripe */
1236 p = rbio_pstripe_page(rbio, pagenr);
1238 pointers[stripe++] = kmap_local_page(p);
1243 * raid6, add the qstripe and call the
1244 * library function to fill in our p/q
1246 p = rbio_qstripe_page(rbio, pagenr);
1248 pointers[stripe++] = kmap_local_page(p);
1250 raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
1254 copy_page(pointers[nr_data], pointers[0]);
1255 run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
1257 for (stripe = stripe - 1; stripe >= 0; stripe--)
1258 kunmap_local(pointers[stripe]);
1262 * time to start writing. Make bios for everything from the
1263 * higher layers (the bio_list in our rbio) and our p/q. Ignore
1266 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1267 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
1269 if (stripe < rbio->nr_data) {
1270 page = page_in_rbio(rbio, stripe, pagenr, 1);
1274 page = rbio_stripe_page(rbio, stripe, pagenr);
1277 ret = rbio_add_io_page(rbio, &bio_list,
1278 page, stripe, pagenr, rbio->stripe_len);
1284 if (likely(!bioc->num_tgtdevs))
1287 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1288 if (!bioc->tgtdev_map[stripe])
1291 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
1293 if (stripe < rbio->nr_data) {
1294 page = page_in_rbio(rbio, stripe, pagenr, 1);
1298 page = rbio_stripe_page(rbio, stripe, pagenr);
1301 ret = rbio_add_io_page(rbio, &bio_list, page,
1302 rbio->bioc->tgtdev_map[stripe],
1303 pagenr, rbio->stripe_len);
1310 atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
1311 BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
1313 while ((bio = bio_list_pop(&bio_list))) {
1314 bio->bi_private = rbio;
1315 bio->bi_end_io = raid_write_end_io;
1316 bio->bi_opf = REQ_OP_WRITE;
1323 rbio_orig_end_io(rbio, BLK_STS_IOERR);
1325 while ((bio = bio_list_pop(&bio_list)))
1330 * helper to find the stripe number for a given bio. Used to figure out which
1331 * stripe has failed. This expects the bio to correspond to a physical disk,
1332 * so it looks up based on physical sector numbers.
1334 static int find_bio_stripe(struct btrfs_raid_bio *rbio,
1337 u64 physical = bio->bi_iter.bi_sector;
1339 struct btrfs_io_stripe *stripe;
1343 for (i = 0; i < rbio->bioc->num_stripes; i++) {
1344 stripe = &rbio->bioc->stripes[i];
1345 if (in_range(physical, stripe->physical, rbio->stripe_len) &&
1346 stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) {
1354 * helper to find the stripe number for a given
1355 * bio (before mapping). Used to figure out which stripe has
1356 * failed. This looks up based on logical block numbers.
1358 static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
1361 u64 logical = bio->bi_iter.bi_sector << 9;
1364 for (i = 0; i < rbio->nr_data; i++) {
1365 u64 stripe_start = rbio->bioc->raid_map[i];
1367 if (in_range(logical, stripe_start, rbio->stripe_len))
1374 * returns -EIO if we had too many failures
1376 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
1378 unsigned long flags;
1381 spin_lock_irqsave(&rbio->bio_list_lock, flags);
1383 /* we already know this stripe is bad, move on */
1384 if (rbio->faila == failed || rbio->failb == failed)
1387 if (rbio->faila == -1) {
1388 /* first failure on this rbio */
1389 rbio->faila = failed;
1390 atomic_inc(&rbio->error);
1391 } else if (rbio->failb == -1) {
1392 /* second failure on this rbio */
1393 rbio->failb = failed;
1394 atomic_inc(&rbio->error);
1399 spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
1405 * helper to fail a stripe based on a physical disk
1408 static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
1411 int failed = find_bio_stripe(rbio, bio);
1416 return fail_rbio_index(rbio, failed);
/*
 * this sets each page in the bio uptodate.  It should only be used on private
 * rbio pages, nothing that comes in from the higher layers
 */
static void set_bio_pages_uptodate(struct bio *bio)
{
	struct bio_vec *bvec;
	struct bvec_iter_all iter_all;

	ASSERT(!bio_flagged(bio, BIO_CLONED));

	bio_for_each_segment_all(bvec, bio, iter_all)
		SetPageUptodate(bvec->bv_page);
}
1435 * end io for the read phase of the rmw cycle. All the bios here are physical
1436 * stripe bios we've read from the disk so we can recalculate the parity of the
1439 * This will usually kick off finish_rmw once all the bios are read in, but it
1440 * may trigger parity reconstruction if we had any errors along the way
1442 static void raid_rmw_end_io(struct bio *bio)
1444 struct btrfs_raid_bio *rbio = bio->bi_private;
1447 fail_bio_stripe(rbio, bio);
1449 set_bio_pages_uptodate(bio);
1453 if (!atomic_dec_and_test(&rbio->stripes_pending))
1456 if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
1460 * this will normally call finish_rmw to start our write
1461 * but if there are any failed stripes we'll reconstruct
1464 validate_rbio_for_rmw(rbio);
1469 rbio_orig_end_io(rbio, BLK_STS_IOERR);
1473 * the stripe must be locked by the caller. It will
1474 * unlock after all the writes are done
1476 static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1478 int bios_to_read = 0;
1479 struct bio_list bio_list;
1485 bio_list_init(&bio_list);
1487 ret = alloc_rbio_pages(rbio);
1491 index_rbio_pages(rbio);
1493 atomic_set(&rbio->error, 0);
1495 * build a list of bios to read all the missing parts of this
1498 for (stripe = 0; stripe < rbio->nr_data; stripe++) {
1499 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
1502 * we want to find all the pages missing from
1503 * the rbio and read them from the disk. If
1504 * page_in_rbio finds a page in the bio list
1505 * we don't need to read it off the stripe.
1507 page = page_in_rbio(rbio, stripe, pagenr, 1);
1511 page = rbio_stripe_page(rbio, stripe, pagenr);
1513 * the bio cache may have handed us an uptodate
1514 * page. If so, be happy and use it
1516 if (PageUptodate(page))
1519 ret = rbio_add_io_page(rbio, &bio_list, page,
1520 stripe, pagenr, rbio->stripe_len);
1526 bios_to_read = bio_list_size(&bio_list);
1527 if (!bios_to_read) {
1529 * this can happen if others have merged with
1530 * us, it means there is nothing left to read.
1531 * But if there are missing devices it may not be
1532 * safe to do the full stripe write yet.
1538 * The bioc may be freed once we submit the last bio. Make sure not to
1539 * touch it after that.
1541 atomic_set(&rbio->stripes_pending, bios_to_read);
1542 while ((bio = bio_list_pop(&bio_list))) {
1543 bio->bi_private = rbio;
1544 bio->bi_end_io = raid_rmw_end_io;
1545 bio->bi_opf = REQ_OP_READ;
1547 btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
1551 /* the actual write will happen once the reads are done */
1555 rbio_orig_end_io(rbio, BLK_STS_IOERR);
1557 while ((bio = bio_list_pop(&bio_list)))
1563 validate_rbio_for_rmw(rbio);
1568 * if the upper layers pass in a full stripe, we thank them by only allocating
1569 * enough pages to hold the parity, and sending it all down quickly.
1571 static int full_stripe_write(struct btrfs_raid_bio *rbio)
1575 ret = alloc_rbio_parity_pages(rbio);
1577 __free_raid_bio(rbio);
1581 ret = lock_stripe_add(rbio);
1588 * partial stripe writes get handed over to async helpers.
1589 * We're really hoping to merge a few more writes into this
1590 * rbio before calculating new parity
1592 static int partial_stripe_write(struct btrfs_raid_bio *rbio)
1596 ret = lock_stripe_add(rbio);
1598 start_async_work(rbio, rmw_work);
1603 * sometimes while we were reading from the drive to
1604 * recalculate parity, enough new bios come into create
1605 * a full stripe. So we do a check here to see if we can
1606 * go directly to finish_rmw
1608 static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
1610 /* head off into rmw land if we don't have a full stripe */
1611 if (!rbio_is_full(rbio))
1612 return partial_stripe_write(rbio);
1613 return full_stripe_write(rbio);
/*
 * We use plugging call backs to collect full stripes.
 * Any time we get a partial stripe write while plugged
 * we collect it into a list.  When the unplug comes down,
 * we sort the list by logical block number and merge
 * everything we can into the same rbios
 */
struct btrfs_plug_cb {
	struct blk_plug_cb cb;
	struct btrfs_fs_info *info;
	struct list_head rbio_list;
	struct btrfs_work work;
};
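/*
 * Typical flow (summary of the code below): while a task holds a block
 * plug, partial stripe writes are parked on plug->rbio_list; when the
 * plug is released, run_plug() sorts the list by starting sector and
 * merges neighbouring rbios before submitting them.
 */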
/*
 * rbios on the plug list are sorted for easier merging.
 */
static int plug_cmp(void *priv, const struct list_head *a,
		    const struct list_head *b)
{
	const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
						       plug_list);
	const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
						       plug_list);
	u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
	u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;

	if (a_sector < b_sector)
		return -1;
	if (a_sector > b_sector)
		return 1;
	return 0;
}
1650 static void run_plug(struct btrfs_plug_cb *plug)
1652 struct btrfs_raid_bio *cur;
1653 struct btrfs_raid_bio *last = NULL;
1656 * sort our plug list then try to merge
1657 * everything we can in hopes of creating full
1660 list_sort(NULL, &plug->rbio_list, plug_cmp);
1661 while (!list_empty(&plug->rbio_list)) {
1662 cur = list_entry(plug->rbio_list.next,
1663 struct btrfs_raid_bio, plug_list);
1664 list_del_init(&cur->plug_list);
1666 if (rbio_is_full(cur)) {
1669 /* we have a full stripe, send it down */
1670 ret = full_stripe_write(cur);
1675 if (rbio_can_merge(last, cur)) {
1676 merge_rbio(last, cur);
1677 __free_raid_bio(cur);
1681 __raid56_parity_write(last);
1686 __raid56_parity_write(last);
1692 * if the unplug comes from schedule, we have to push the
1693 * work off to a helper thread
1695 static void unplug_work(struct btrfs_work *work)
1697 struct btrfs_plug_cb *plug;
1698 plug = container_of(work, struct btrfs_plug_cb, work);
1702 static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1704 struct btrfs_plug_cb *plug;
1705 plug = container_of(cb, struct btrfs_plug_cb, cb);
1707 if (from_schedule) {
1708 btrfs_init_work(&plug->work, unplug_work, NULL, NULL);
1709 btrfs_queue_work(plug->info->rmw_workers,
1717 * our main entry point for writes from the rest of the FS.
1719 int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc,
1722 struct btrfs_fs_info *fs_info = bioc->fs_info;
1723 struct btrfs_raid_bio *rbio;
1724 struct btrfs_plug_cb *plug = NULL;
1725 struct blk_plug_cb *cb;
1728 rbio = alloc_rbio(fs_info, bioc, stripe_len);
1730 btrfs_put_bioc(bioc);
1731 return PTR_ERR(rbio);
1733 bio_list_add(&rbio->bio_list, bio);
1734 rbio->bio_list_bytes = bio->bi_iter.bi_size;
1735 rbio->operation = BTRFS_RBIO_WRITE;
1737 btrfs_bio_counter_inc_noblocked(fs_info);
1738 rbio->generic_bio_cnt = 1;
1741 * don't plug on full rbios, just get them out the door
1742 * as quickly as we can
1744 if (rbio_is_full(rbio)) {
1745 ret = full_stripe_write(rbio);
1747 btrfs_bio_counter_dec(fs_info);
1751 cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug));
1753 plug = container_of(cb, struct btrfs_plug_cb, cb);
1755 plug->info = fs_info;
1756 INIT_LIST_HEAD(&plug->rbio_list);
1758 list_add_tail(&rbio->plug_list, &plug->rbio_list);
1761 ret = __raid56_parity_write(rbio);
1763 btrfs_bio_counter_dec(fs_info);
1769 * all parity reconstruction happens here. We've read in everything
1770 * we can find from the drives and this does the heavy lifting of
1771 * sorting the good from the bad.
1773 static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1778 int faila = -1, failb = -1;
1783 pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
1785 err = BLK_STS_RESOURCE;
1790 * Store copy of pointers that does not get reordered during
1791 * reconstruction so that kunmap_local works.
1793 unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
1795 err = BLK_STS_RESOURCE;
1796 goto cleanup_pointers;
1799 faila = rbio->faila;
1800 failb = rbio->failb;
1802 if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1803 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
1804 spin_lock_irq(&rbio->bio_list_lock);
1805 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1806 spin_unlock_irq(&rbio->bio_list_lock);
1809 index_rbio_pages(rbio);
1811 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
1813 * Now we just use bitmap to mark the horizontal stripes in
1814 * which we have data when doing parity scrub.
1816 if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
1817 !test_bit(pagenr, rbio->dbitmap))
1821 * Setup our array of pointers with pages from each stripe
1823 * NOTE: store a duplicate array of pointers to preserve the
1826 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1828 * if we're rebuilding a read, we have to use
1829 * pages from the bio list
1831 if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1832 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
1833 (stripe == faila || stripe == failb)) {
1834 page = page_in_rbio(rbio, stripe, pagenr, 0);
1836 page = rbio_stripe_page(rbio, stripe, pagenr);
1838 pointers[stripe] = kmap_local_page(page);
1839 unmap_array[stripe] = pointers[stripe];
1842 /* all raid6 handling here */
1843 if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
1845 * single failure, rebuild from parity raid5
1849 if (faila == rbio->nr_data) {
1851 * Just the P stripe has failed, without
1852 * a bad data or Q stripe.
1853 * TODO, we should redo the xor here.
1855 err = BLK_STS_IOERR;
1859 * a single failure in raid6 is rebuilt
1860 * in the pstripe code below
1865 /* make sure our ps and qs are in order */
1869 /* if the q stripe is failed, do a pstripe reconstruction
1871 * If both the q stripe and the P stripe are failed, we're
1872 * here due to a crc mismatch and we can't give them the
1875 if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) {
1876 if (rbio->bioc->raid_map[faila] ==
1878 err = BLK_STS_IOERR;
1882 * otherwise we have one bad data stripe and
1883 * a good P stripe. raid5!
1888 if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) {
1889 raid6_datap_recov(rbio->real_stripes,
1890 PAGE_SIZE, faila, pointers);
1892 raid6_2data_recov(rbio->real_stripes,
1893 PAGE_SIZE, faila, failb,
1899 /* rebuild from P stripe here (raid5 or raid6) */
1900 BUG_ON(failb != -1);
1902 /* Copy parity block into failed block to start with */
1903 copy_page(pointers[faila], pointers[rbio->nr_data]);
1905 /* rearrange the pointer array */
1906 p = pointers[faila];
1907 for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
1908 pointers[stripe] = pointers[stripe + 1];
1909 pointers[rbio->nr_data - 1] = p;
1911 /* xor in the rest */
1912 run_xor(pointers, rbio->nr_data - 1, PAGE_SIZE);
1914 /* if we're doing this rebuild as part of an rmw, go through
1915 * and set all of our private rbio pages in the
1916 * failed stripes as uptodate. This way finish_rmw will
1917 * know they can be trusted. If this was a read reconstruction,
1918 * other endio functions will fiddle the uptodate bits
1920 if (rbio->operation == BTRFS_RBIO_WRITE) {
1921 for (i = 0; i < rbio->stripe_npages; i++) {
1923 page = rbio_stripe_page(rbio, faila, i);
1924 SetPageUptodate(page);
1927 page = rbio_stripe_page(rbio, failb, i);
1928 SetPageUptodate(page);
1932 for (stripe = rbio->real_stripes - 1; stripe >= 0; stripe--)
1933 kunmap_local(unmap_array[stripe]);
1944 * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a
1945 * valid rbio which is consistent with ondisk content, thus such a
1946 * valid rbio can be cached to avoid further disk reads.
1948 if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1949 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
1951 * - In case of two failures, where rbio->failb != -1:
1953 * Do not cache this rbio since the above read reconstruction
1954 * (raid6_datap_recov() or raid6_2data_recov()) may have
1955 * changed some content of stripes which are not identical to
1956 * on-disk content any more, otherwise, a later write/recover
1957 * may steal stripe_pages from this rbio and end up with
1958 * corruptions or rebuild failures.
1960 * - In case of single failure, where rbio->failb == -1:
1962 * Cache this rbio iff the above read reconstruction is
1963 * executed without problems.
1965 if (err == BLK_STS_OK && rbio->failb < 0)
1966 cache_rbio_pages(rbio);
1968 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1970 rbio_orig_end_io(rbio, err);
1971 } else if (err == BLK_STS_OK) {
1975 if (rbio->operation == BTRFS_RBIO_WRITE)
1977 else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
1978 finish_parity_scrub(rbio, 0);
1982 rbio_orig_end_io(rbio, err);
1987 * This is called only for stripes we've read from disk to
1988 * reconstruct the parity.
1990 static void raid_recover_end_io(struct bio *bio)
1992 struct btrfs_raid_bio *rbio = bio->bi_private;
1995 * we only read stripe pages off the disk, set them
1996 * up to date if there were no errors
1999 fail_bio_stripe(rbio, bio);
2001 set_bio_pages_uptodate(bio);
2004 if (!atomic_dec_and_test(&rbio->stripes_pending))
2007 if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
2008 rbio_orig_end_io(rbio, BLK_STS_IOERR);
2010 __raid_recover_end_io(rbio);
2014 * reads everything we need off the disk to reconstruct
2015 * the parity. endio handlers trigger final reconstruction
2016 * when the IO is done.
2018 * This is used both for reads from the higher layers and for
2019 * parity construction required to finish a rmw cycle.
2021 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
2023 int bios_to_read = 0;
2024 struct bio_list bio_list;
2030 bio_list_init(&bio_list);
2032 ret = alloc_rbio_pages(rbio);
2036 atomic_set(&rbio->error, 0);
2039 * read everything that hasn't failed. Thanks to the
2040 * stripe cache, it is possible that some or all of these
2041 * pages are going to be uptodate.
2043 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
2044 if (rbio->faila == stripe || rbio->failb == stripe) {
2045 atomic_inc(&rbio->error);
2049 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
2053 * the rmw code may have already read this
2056 p = rbio_stripe_page(rbio, stripe, pagenr);
2057 if (PageUptodate(p))
2060 ret = rbio_add_io_page(rbio, &bio_list,
2061 rbio_stripe_page(rbio, stripe, pagenr),
2062 stripe, pagenr, rbio->stripe_len);
2068 bios_to_read = bio_list_size(&bio_list);
2069 if (!bios_to_read) {
2071 * we might have no bios to read just because the pages
2072 * were up to date, or we might have no bios to read because
2073 * the devices were gone.
2075 if (atomic_read(&rbio->error) <= rbio->bioc->max_errors) {
2076 __raid_recover_end_io(rbio);
2084 * The bioc may be freed once we submit the last bio. Make sure not to
2085 * touch it after that.
2087 atomic_set(&rbio->stripes_pending, bios_to_read);
2088 while ((bio = bio_list_pop(&bio_list))) {
2089 bio->bi_private = rbio;
2090 bio->bi_end_io = raid_recover_end_io;
2091 bio->bi_opf = REQ_OP_READ;
2093 btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
2101 if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
2102 rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
2103 rbio_orig_end_io(rbio, BLK_STS_IOERR);
2105 while ((bio = bio_list_pop(&bio_list)))
2112 * the main entry point for reads from the higher layers. This
2113 * is really only called when the normal read path had a failure,
2114 * so we assume the bio they send down corresponds to a failed part
2117 int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
2118 u64 stripe_len, int mirror_num, int generic_io)
2120 struct btrfs_fs_info *fs_info = bioc->fs_info;
2121 struct btrfs_raid_bio *rbio;
2125 ASSERT(bioc->mirror_num == mirror_num);
2126 btrfs_bio(bio)->mirror_num = mirror_num;
2129 rbio = alloc_rbio(fs_info, bioc, stripe_len);
2132 btrfs_put_bioc(bioc);
2133 return PTR_ERR(rbio);
2136 rbio->operation = BTRFS_RBIO_READ_REBUILD;
2137 bio_list_add(&rbio->bio_list, bio);
2138 rbio->bio_list_bytes = bio->bi_iter.bi_size;
2140 rbio->faila = find_logical_bio_stripe(rbio, bio);
2141 if (rbio->faila == -1) {
2143 "%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)",
2144 __func__, bio->bi_iter.bi_sector << 9,
2145 (u64)bio->bi_iter.bi_size, bioc->map_type);
2147 btrfs_put_bioc(bioc);
2153 btrfs_bio_counter_inc_noblocked(fs_info);
2154 rbio->generic_bio_cnt = 1;
2156 btrfs_get_bioc(bioc);
	/*
	 * Loop retry:
	 * for 'mirror == 2', reconstruct from all other stripes.
	 * for 'mirror_num > 2', select a stripe to fail on every retry.
	 */
	if (mirror_num > 2) {
		/*
		 * 'mirror == 3' is to fail the p stripe and
		 * reconstruct from the q stripe.  'mirror > 3' is to
		 * fail a data stripe and reconstruct from p+q stripe.
		 */
		rbio->failb = rbio->real_stripes - (mirror_num - 1);
		ASSERT(rbio->failb > 0);
		if (rbio->failb <= rbio->faila)
			rbio->failb--;
	}

	ret = lock_stripe_add(rbio);
2179 * __raid56_parity_recover will end the bio with
2180 * any errors it hits. We don't want to return
2181 * its error value up the stack because our caller
2182 * will end up calling bio_endio with any nonzero
2186 __raid56_parity_recover(rbio);
2188 * our rbio has been added to the list of
2189 * rbios that will be handled after the
2190 * currently lock owner is done
static void rmw_work(struct btrfs_work *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	raid56_rmw_stripe(rbio);
}

static void read_rebuild_work(struct btrfs_work *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	__raid56_parity_recover(rbio);
}
2213 * The following code is used to scrub/replace the parity stripe
2215 * Caller must have already increased bio_counter for getting @bioc.
2217 * Note: We need make sure all the pages that add into the scrub/replace
2218 * raid bio are correct and not be changed during the scrub/replace. That
2219 * is those pages just hold metadata or file data with checksum.
2222 struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
2223 struct btrfs_io_context *bioc,
2224 u64 stripe_len, struct btrfs_device *scrub_dev,
2225 unsigned long *dbitmap, int stripe_nsectors)
2227 struct btrfs_fs_info *fs_info = bioc->fs_info;
2228 struct btrfs_raid_bio *rbio;
2231 rbio = alloc_rbio(fs_info, bioc, stripe_len);
2234 bio_list_add(&rbio->bio_list, bio);
2236 * This is a special bio which is used to hold the completion handler
2237 * and make the scrub rbio is similar to the other types
2239 ASSERT(!bio->bi_iter.bi_size);
2240 rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
2243 * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted
2244 * to the end position, so this search can start from the first parity
2247 for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
2248 if (bioc->stripes[i].dev == scrub_dev) {
2253 ASSERT(i < rbio->real_stripes);
2255 /* Now we just support the sectorsize equals to page size */
2256 ASSERT(fs_info->sectorsize == PAGE_SIZE);
2257 ASSERT(rbio->stripe_npages == stripe_nsectors);
2258 bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
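	/*
	 * dbitmap has one bit per stripe sector (one bit per page while
	 * sectorsize == PAGE_SIZE) and marks the horizontal slices that
	 * contain data and therefore need their parity checked.
	 */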
2261 * We have already increased bio_counter when getting bioc, record it
2262 * so we can free it at rbio_orig_end_io().
2264 rbio->generic_bio_cnt = 1;
2269 /* Used for both parity scrub and missing. */
2270 void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
2276 ASSERT(logical >= rbio->bioc->raid_map[0]);
2277 ASSERT(logical + PAGE_SIZE <= rbio->bioc->raid_map[0] +
2278 rbio->stripe_len * rbio->nr_data);
2279 stripe_offset = (int)(logical - rbio->bioc->raid_map[0]);
2280 index = stripe_offset >> PAGE_SHIFT;
2281 rbio->bio_pages[index] = page;
2285 * We just scrub the parity that we have correct data on the same horizontal,
2286 * so we needn't allocate all pages for all the stripes.
2288 static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
2295 for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
2296 for (i = 0; i < rbio->real_stripes; i++) {
2297 index = i * rbio->stripe_npages + bit;
2298 if (rbio->stripe_pages[index])
2301 page = alloc_page(GFP_NOFS);
2304 rbio->stripe_pages[index] = page;
2310 static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
2313 struct btrfs_io_context *bioc = rbio->bioc;
2314 void **pointers = rbio->finish_pointers;
2315 unsigned long *pbitmap = rbio->finish_pbitmap;
2316 int nr_data = rbio->nr_data;
2320 struct page *p_page = NULL;
2321 struct page *q_page = NULL;
2322 struct bio_list bio_list;
2327 bio_list_init(&bio_list);
2329 if (rbio->real_stripes - rbio->nr_data == 1)
2330 has_qstripe = false;
2331 else if (rbio->real_stripes - rbio->nr_data == 2)
2336 if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) {
2338 bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
2342 * Because the higher layers(scrubber) are unlikely to
2343 * use this area of the disk again soon, so don't cache
2346 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2351 p_page = alloc_page(GFP_NOFS);
2354 SetPageUptodate(p_page);
2357 /* RAID6, allocate and map temp space for the Q stripe */
2358 q_page = alloc_page(GFP_NOFS);
2360 __free_page(p_page);
2363 SetPageUptodate(q_page);
2364 pointers[rbio->real_stripes - 1] = kmap_local_page(q_page);
2367 atomic_set(&rbio->error, 0);
2369 /* Map the parity stripe just once */
2370 pointers[nr_data] = kmap_local_page(p_page);
2372 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2375 /* first collect one page from each data stripe */
2376 for (stripe = 0; stripe < nr_data; stripe++) {
2377 p = page_in_rbio(rbio, stripe, pagenr, 0);
2378 pointers[stripe] = kmap_local_page(p);
2382 /* RAID6, call the library function to fill in our P/Q */
2383 raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
2387 copy_page(pointers[nr_data], pointers[0]);
2388 run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
2391 /* Check scrubbing parity and repair it */
2392 p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2393 parity = kmap_local_page(p);
2394 if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE))
2395 copy_page(parity, pointers[rbio->scrubp]);
2397 /* Parity is right, needn't writeback */
2398 bitmap_clear(rbio->dbitmap, pagenr, 1);
2399 kunmap_local(parity);
2401 for (stripe = nr_data - 1; stripe >= 0; stripe--)
2402 kunmap_local(pointers[stripe]);
2405 kunmap_local(pointers[nr_data]);
2406 __free_page(p_page);
2408 kunmap_local(pointers[rbio->real_stripes - 1]);
2409 __free_page(q_page);
2414 * time to start writing. Make bios for everything from the
2415 * higher layers (the bio_list in our rbio) and our p/q. Ignore
2418 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2421 page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2422 ret = rbio_add_io_page(rbio, &bio_list,
2423 page, rbio->scrubp, pagenr, rbio->stripe_len);
2431 for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
2434 page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2435 ret = rbio_add_io_page(rbio, &bio_list, page,
2436 bioc->tgtdev_map[rbio->scrubp],
2437 pagenr, rbio->stripe_len);
2443 nr_data = bio_list_size(&bio_list);
2445 /* Every parity is right */
2446 rbio_orig_end_io(rbio, BLK_STS_OK);
2450 atomic_set(&rbio->stripes_pending, nr_data);
2452 while ((bio = bio_list_pop(&bio_list))) {
2453 bio->bi_private = rbio;
2454 bio->bi_end_io = raid_write_end_io;
2455 bio->bi_opf = REQ_OP_WRITE;
2462 rbio_orig_end_io(rbio, BLK_STS_IOERR);
2464 while ((bio = bio_list_pop(&bio_list)))
static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
{
	if (stripe >= 0 && stripe < rbio->nr_data)
		return 1;
	return 0;
}
2476 * While we're doing the parity check and repair, we could have errors
2477 * in reading pages off the disk. This checks for errors and if we're
2478 * not able to read the page it'll trigger parity reconstruction. The
2479 * parity scrub will be finished after we've reconstructed the failed
2482 static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
2484 if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
2487 if (rbio->faila >= 0 || rbio->failb >= 0) {
2488 int dfail = 0, failp = -1;
2490 if (is_data_stripe(rbio, rbio->faila))
2492 else if (is_parity_stripe(rbio->faila))
2493 failp = rbio->faila;
2495 if (is_data_stripe(rbio, rbio->failb))
2497 else if (is_parity_stripe(rbio->failb))
2498 failp = rbio->failb;
2501 * Because we can not use a scrubbing parity to repair
2502 * the data, so the capability of the repair is declined.
2503 * (In the case of RAID5, we can not repair anything)
2505 if (dfail > rbio->bioc->max_errors - 1)
2509 * If all data is good, only parity is correctly, just
2510 * repair the parity.
2513 finish_parity_scrub(rbio, 0);
2518 * Here means we got one corrupted data stripe and one
2519 * corrupted parity on RAID6, if the corrupted parity
2520 * is scrubbing parity, luckily, use the other one to repair
2521 * the data, or we can not repair the data stripe.
2523 if (failp != rbio->scrubp)
2526 __raid_recover_end_io(rbio);
2528 finish_parity_scrub(rbio, 1);
2533 rbio_orig_end_io(rbio, BLK_STS_IOERR);
2537 * end io for the read phase of the rmw cycle. All the bios here are physical
2538 * stripe bios we've read from the disk so we can recalculate the parity of the
2541 * This will usually kick off finish_rmw once all the bios are read in, but it
2542 * may trigger parity reconstruction if we had any errors along the way
2544 static void raid56_parity_scrub_end_io(struct bio *bio)
2546 struct btrfs_raid_bio *rbio = bio->bi_private;
2549 fail_bio_stripe(rbio, bio);
2551 set_bio_pages_uptodate(bio);
2555 if (!atomic_dec_and_test(&rbio->stripes_pending))
2559 * this will normally call finish_rmw to start our write
2560 * but if there are any failed stripes we'll reconstruct
2563 validate_rbio_for_parity_scrub(rbio);
2566 static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
2568 int bios_to_read = 0;
2569 struct bio_list bio_list;
2575 bio_list_init(&bio_list);
2577 ret = alloc_rbio_essential_pages(rbio);
2581 atomic_set(&rbio->error, 0);
2583 * build a list of bios to read all the missing parts of this
2586 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
2587 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2590 * we want to find all the pages missing from
2591 * the rbio and read them from the disk. If
2592 * page_in_rbio finds a page in the bio list
2593 * we don't need to read it off the stripe.
2595 page = page_in_rbio(rbio, stripe, pagenr, 1);
2599 page = rbio_stripe_page(rbio, stripe, pagenr);
2601 * the bio cache may have handed us an uptodate
2602 * page. If so, be happy and use it
2604 if (PageUptodate(page))
2607 ret = rbio_add_io_page(rbio, &bio_list, page,
2608 stripe, pagenr, rbio->stripe_len);
2614 bios_to_read = bio_list_size(&bio_list);
2615 if (!bios_to_read) {
2617 * this can happen if others have merged with
2618 * us, it means there is nothing left to read.
2619 * But if there are missing devices it may not be
2620 * safe to do the full stripe write yet.
2626 * The bioc may be freed once we submit the last bio. Make sure not to
2627 * touch it after that.
2629 atomic_set(&rbio->stripes_pending, bios_to_read);
2630 while ((bio = bio_list_pop(&bio_list))) {
2631 bio->bi_private = rbio;
2632 bio->bi_end_io = raid56_parity_scrub_end_io;
2633 bio->bi_opf = REQ_OP_READ;
2635 btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
2639 /* the actual write will happen once the reads are done */
2643 rbio_orig_end_io(rbio, BLK_STS_IOERR);
2645 while ((bio = bio_list_pop(&bio_list)))
2651 validate_rbio_for_parity_scrub(rbio);
static void scrub_parity_work(struct btrfs_work *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	raid56_parity_scrub_stripe(rbio);
}

void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
{
	if (!lock_stripe_add(rbio))
		start_async_work(rbio, scrub_parity_work);
}
2668 /* The following code is used for dev replace of a missing RAID 5/6 device. */
2670 struct btrfs_raid_bio *
2671 raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc,
2674 struct btrfs_fs_info *fs_info = bioc->fs_info;
2675 struct btrfs_raid_bio *rbio;
2677 rbio = alloc_rbio(fs_info, bioc, length);
2681 rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
2682 bio_list_add(&rbio->bio_list, bio);
2684 * This is a special bio which is used to hold the completion handler
2685 * and make the scrub rbio is similar to the other types
2687 ASSERT(!bio->bi_iter.bi_size);
2689 rbio->faila = find_logical_bio_stripe(rbio, bio);
2690 if (rbio->faila == -1) {
2697 * When we get bioc, we have already increased bio_counter, record it
2698 * so we can free it at rbio_orig_end_io()
2700 rbio->generic_bio_cnt = 1;
void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
{
	if (!lock_stripe_add(rbio))
		start_async_work(rbio, read_rebuild_work);
}