// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2012 Fusion-io  All rights reserved.
 * Copyright (C) 2012 Intel Corp. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/raid/pq.h>
#include <linux/hash.h>
#include <linux/list_sort.h>
#include <linux/raid/xor.h>
#include <linux/mm.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
/* set when additional merges to this rbio are not allowed */
#define RBIO_RMW_LOCKED_BIT	1

/*
 * set when this rbio is sitting in the hash, but it is just a cache
 * of past RMW
 */
#define RBIO_CACHE_BIT		2

/*
 * set when it is safe to trust the stripe_pages for caching
 */
#define RBIO_CACHE_READY_BIT	3

#define RBIO_CACHE_SIZE 1024

#define BTRFS_STRIPE_HASH_TABLE_BITS	11
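/*
 * With BTRFS_STRIPE_HASH_TABLE_BITS == 11 the hash table below has
 * 1 << 11 == 2048 buckets; RBIO_CACHE_SIZE caps the number of rbios
 * kept on the stripe cache LRU at 1024.
 */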
/* Used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash {
	struct list_head hash_list;
	spinlock_t lock;
};

/* Used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash_table {
	struct list_head stripe_cache;
	spinlock_t cache_lock;
	int cache_size;
	struct btrfs_stripe_hash table[];
};
enum btrfs_rbio_ops {
	BTRFS_RBIO_WRITE,
	BTRFS_RBIO_READ_REBUILD,
	BTRFS_RBIO_PARITY_SCRUB,
	BTRFS_RBIO_REBUILD_MISSING,
};
struct btrfs_raid_bio {
	struct btrfs_io_context *bioc;

	/*
	 * while we're doing rmw on a stripe we put it into a hash table so
	 * we can lock the stripe and merge more rbios into it.
	 */
	struct list_head hash_list;

	/* LRU list for the stripe cache */
	struct list_head stripe_cache;

	/* for scheduling work in the helper threads */
	struct btrfs_work work;

	/*
	 * bio list and bio_list_lock are used to add more bios into the
	 * stripe in hopes of avoiding the full rmw
	 */
	struct bio_list bio_list;
	spinlock_t bio_list_lock;

	/*
	 * also protected by the bio_list_lock, the plug list is used by the
	 * plugging code to collect partial bios while plugged.  The stripe
	 * locking code also uses it to hand off the stripe lock to the next
	 * pending IO
	 */
	struct list_head plug_list;

	/* flags that tell us if it is safe to merge with this bio */
	unsigned long flags;

	/* size of each individual stripe on disk */
	int stripe_len;

	/* number of data stripes (no p/q) */
	int nr_data;

	/* number of stripes in the chunk, not counting replace targets */
	int real_stripes;

	/* number of pages per stripe */
	int stripe_npages;

	/*
	 * set if we're doing a parity rebuild for a read from higher up,
	 * which is handled differently from a parity rebuild as part of rmw
	 */
	enum btrfs_rbio_ops operation;

	/* first bad stripe */
	int faila;

	/* second bad stripe (for raid6 use) */
	int failb;

	/* stripe number that we're scrubbing */
	int scrubp;

	/* number of pages needed to represent the full stripe */
	int nr_pages;

	/*
	 * size of all the bios in the bio_list.  This helps us decide if the
	 * rbio maps to a full stripe or not
	 */
	int bio_list_bytes;

	int generic_bio_cnt;

	refcount_t refs;

	atomic_t stripes_pending;

	atomic_t error;

	/*
	 * these are two arrays of pointers.  We allocate the rbio big enough
	 * to hold them both and setup their locations when the rbio is
	 * allocated
	 */

	/*
	 * pointers to pages that we allocated for reading/writing stripes
	 * directly from the disk (including P/Q)
	 */
	struct page **stripe_pages;

	/* pointers to the pages in the bio_list, stored here for faster lookup */
	struct page **bio_pages;

	/* bitmap to record which horizontal stripe has data */
	unsigned long *dbitmap;

	/* allocated with real_stripes-many pointers for finish_*() calls */
	void **finish_pointers;

	/* allocated with stripe_npages-many bits for finish_*() calls */
	unsigned long *finish_pbitmap;
};
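/*
 * Note: the pointer arrays at the end of struct btrfs_raid_bio
 * (stripe_pages, bio_pages, finish_pointers, dbitmap, finish_pbitmap)
 * are not separate allocations; alloc_rbio() carves them out of the
 * trailing memory of a single kzalloc() via CONSUME_ALLOC().
 */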
static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
static void rmw_work(struct btrfs_work *work);
static void read_rebuild_work(struct btrfs_work *work);
static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
static void __free_raid_bio(struct btrfs_raid_bio *rbio);
static void index_rbio_pages(struct btrfs_raid_bio *rbio);
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);

static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
					 int need_check);
static void scrub_parity_work(struct btrfs_work *work);
static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func)
{
	btrfs_init_work(&rbio->work, work_func, NULL, NULL);
	btrfs_queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
}
/*
 * the stripe hash table is used for locking, and to collect
 * bios in hopes of making a full stripe
 */
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash_table *x;
	struct btrfs_stripe_hash *cur;
	struct btrfs_stripe_hash *h;
	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
	int i;

	if (info->stripe_hash_table)
		return 0;

	/*
	 * The table is large, starting with order 4 and can go as high as
	 * order 7 in case lock debugging is turned on.
	 *
	 * Try harder to allocate and fallback to vmalloc to lower the chance
	 * of a failing mount.
	 */
	table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
	if (!table)
		return -ENOMEM;

	spin_lock_init(&table->cache_lock);
	INIT_LIST_HEAD(&table->stripe_cache);

	h = table->table;
	for (i = 0; i < num_entries; i++) {
		cur = h + i;
		INIT_LIST_HEAD(&cur->hash_list);
		spin_lock_init(&cur->lock);
	}

	x = cmpxchg(&info->stripe_hash_table, NULL, table);
	kvfree(x);
	return 0;
}
/*
 * caching an rbio means to copy anything from the bio_pages array into
 * the stripe_pages array.  We use the page uptodate bit in the stripe
 * cache array to indicate if it has valid data
 *
 * once the caching is done, we set the cache ready bit.
 */
static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	int ret;

	ret = alloc_rbio_pages(rbio);
	if (ret)
		return;

	for (i = 0; i < rbio->nr_pages; i++) {
		if (!rbio->bio_pages[i])
			continue;

		copy_highpage(rbio->stripe_pages[i], rbio->bio_pages[i]);
		SetPageUptodate(rbio->stripe_pages[i]);
	}
	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
}
/*
 * we hash on the first logical address of the stripe
 */
static int rbio_bucket(struct btrfs_raid_bio *rbio)
{
	u64 num = rbio->bioc->raid_map[0];

	/*
	 * we shift down quite a bit.  We're using byte addressing, and
	 * most of the lower bits are zeros.  This tends to upset hash_64,
	 * and it consistently returns just one or two different values.
	 *
	 * shifting off the lower bits fixes things.
	 */
	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
}
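/*
 * Worked example (illustrative address only): a full stripe whose first
 * logical byte is 0x40010000 hashes as
 * hash_64(0x40010000 >> 16, BTRFS_STRIPE_HASH_TABLE_BITS), i.e.
 * hash_64(0x4001, 11), so every rbio covering that stripe lands in the
 * same bucket while the low 16 bits never influence the choice.
 */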
/*
 * stealing an rbio means taking all the uptodate pages from the stripe
 * array in the source rbio and putting them into the destination rbio
 */
static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
{
	int i;
	struct page *s;
	struct page *d;

	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
		return;

	for (i = 0; i < dest->nr_pages; i++) {
		s = src->stripe_pages[i];
		if (!s || !PageUptodate(s)) {
			continue;
		}

		d = dest->stripe_pages[i];
		if (d)
			__free_page(d);

		dest->stripe_pages[i] = s;
		src->stripe_pages[i] = NULL;
	}
}
/*
 * merging means we take the bio_list from the victim and
 * splice it into the destination.  The victim should
 * be discarded afterwards.
 *
 * must be called with dest->rbio_list_lock held
 */
static void merge_rbio(struct btrfs_raid_bio *dest,
		       struct btrfs_raid_bio *victim)
{
	bio_list_merge(&dest->bio_list, &victim->bio_list);
	dest->bio_list_bytes += victim->bio_list_bytes;
	dest->generic_bio_cnt += victim->generic_bio_cnt;
	bio_list_init(&victim->bio_list);
}
331 * used to prune items that are in the cache. The caller
332 * must hold the hash table lock.
334 static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
336 int bucket = rbio_bucket(rbio);
337 struct btrfs_stripe_hash_table *table;
338 struct btrfs_stripe_hash *h;
342 * check the bit again under the hash table lock.
344 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
347 table = rbio->bioc->fs_info->stripe_hash_table;
348 h = table->table + bucket;
350 /* hold the lock for the bucket because we may be
351 * removing it from the hash table
356 * hold the lock for the bio list because we need
357 * to make sure the bio list is empty
359 spin_lock(&rbio->bio_list_lock);
361 if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
362 list_del_init(&rbio->stripe_cache);
363 table->cache_size -= 1;
366 /* if the bio list isn't empty, this rbio is
367 * still involved in an IO. We take it out
368 * of the cache list, and drop the ref that
369 * was held for the list.
371 * If the bio_list was empty, we also remove
372 * the rbio from the hash_table, and drop
373 * the corresponding ref
375 if (bio_list_empty(&rbio->bio_list)) {
376 if (!list_empty(&rbio->hash_list)) {
377 list_del_init(&rbio->hash_list);
378 refcount_dec(&rbio->refs);
379 BUG_ON(!list_empty(&rbio->plug_list));
384 spin_unlock(&rbio->bio_list_lock);
385 spin_unlock(&h->lock);
388 __free_raid_bio(rbio);
392 * prune a given rbio from the cache
394 static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
396 struct btrfs_stripe_hash_table *table;
399 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
402 table = rbio->bioc->fs_info->stripe_hash_table;
404 spin_lock_irqsave(&table->cache_lock, flags);
405 __remove_rbio_from_cache(rbio);
406 spin_unlock_irqrestore(&table->cache_lock, flags);
410 * remove everything in the cache
412 static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
414 struct btrfs_stripe_hash_table *table;
416 struct btrfs_raid_bio *rbio;
418 table = info->stripe_hash_table;
420 spin_lock_irqsave(&table->cache_lock, flags);
421 while (!list_empty(&table->stripe_cache)) {
422 rbio = list_entry(table->stripe_cache.next,
423 struct btrfs_raid_bio,
425 __remove_rbio_from_cache(rbio);
427 spin_unlock_irqrestore(&table->cache_lock, flags);
431 * remove all cached entries and free the hash table
434 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
436 if (!info->stripe_hash_table)
438 btrfs_clear_rbio_cache(info);
439 kvfree(info->stripe_hash_table);
440 info->stripe_hash_table = NULL;
444 * insert an rbio into the stripe cache. It
445 * must have already been prepared by calling
448 * If this rbio was already cached, it gets
449 * moved to the front of the lru.
451 * If the size of the rbio cache is too big, we
454 static void cache_rbio(struct btrfs_raid_bio *rbio)
456 struct btrfs_stripe_hash_table *table;
459 if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
462 table = rbio->bioc->fs_info->stripe_hash_table;
464 spin_lock_irqsave(&table->cache_lock, flags);
465 spin_lock(&rbio->bio_list_lock);
467 /* bump our ref if we were not in the list before */
468 if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
469 refcount_inc(&rbio->refs);
471 if (!list_empty(&rbio->stripe_cache)){
472 list_move(&rbio->stripe_cache, &table->stripe_cache);
474 list_add(&rbio->stripe_cache, &table->stripe_cache);
475 table->cache_size += 1;
478 spin_unlock(&rbio->bio_list_lock);
480 if (table->cache_size > RBIO_CACHE_SIZE) {
481 struct btrfs_raid_bio *found;
483 found = list_entry(table->stripe_cache.prev,
484 struct btrfs_raid_bio,
488 __remove_rbio_from_cache(found);
491 spin_unlock_irqrestore(&table->cache_lock, flags);
/*
 * helper function to run the xor_blocks api.  It is only able to do
 * MAX_XOR_BLOCKS at a time, so we need to loop through.
 */
static void run_xor(void **pages, int src_cnt, ssize_t len)
{
	int src_off = 0;
	int xor_src_cnt = 0;
	void *dest = pages[src_cnt];

	while (src_cnt > 0) {
		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
		xor_blocks(xor_src_cnt, len, dest, pages + src_off);

		src_cnt -= xor_src_cnt;
		src_off += xor_src_cnt;
	}
}
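/*
 * Example (illustrative numbers; MAX_XOR_BLOCKS comes from
 * <linux/raid/xor.h>): if MAX_XOR_BLOCKS were 4 and src_cnt were 6,
 * run_xor() would call xor_blocks() twice, first for sources 0-3 and
 * then for sources 4-5, accumulating both times into the same dest.
 */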
/*
 * Returns true if the bio list inside this rbio covers an entire stripe
 * (no rmw required).
 */
static int rbio_is_full(struct btrfs_raid_bio *rbio)
{
	unsigned long flags;
	unsigned long size = rbio->bio_list_bytes;
	int ret = 1;

	spin_lock_irqsave(&rbio->bio_list_lock, flags);
	if (size != rbio->nr_data * rbio->stripe_len)
		ret = 0;
	BUG_ON(size > rbio->nr_data * rbio->stripe_len);
	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);

	return ret;
}
534 * returns 1 if it is safe to merge two rbios together.
535 * The merging is safe if the two rbios correspond to
536 * the same stripe and if they are both going in the same
537 * direction (read vs write), and if neither one is
538 * locked for final IO
540 * The caller is responsible for locking such that
541 * rmw_locked is safe to test
543 static int rbio_can_merge(struct btrfs_raid_bio *last,
544 struct btrfs_raid_bio *cur)
546 if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
547 test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
551 * we can't merge with cached rbios, since the
552 * idea is that when we merge the destination
553 * rbio is going to run our IO for us. We can
554 * steal from cached rbios though, other functions
557 if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
558 test_bit(RBIO_CACHE_BIT, &cur->flags))
561 if (last->bioc->raid_map[0] != cur->bioc->raid_map[0])
564 /* we can't merge with different operations */
565 if (last->operation != cur->operation)
	/*
	 * We need to read the full stripe from the drive, then check
	 * and repair the parity and write the new results.
	 *
	 * We're not allowed to add any new bios to the
	 * bio list here, anyone else that wants to
	 * change this stripe needs to do their own rmw.
	 */
575 if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
578 if (last->operation == BTRFS_RBIO_REBUILD_MISSING)
581 if (last->operation == BTRFS_RBIO_READ_REBUILD) {
582 int fa = last->faila;
583 int fb = last->failb;
584 int cur_fa = cur->faila;
585 int cur_fb = cur->failb;
587 if (last->faila >= last->failb) {
592 if (cur->faila >= cur->failb) {
597 if (fa != cur_fa || fb != cur_fb)
static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe,
				  int index)
{
	return stripe * rbio->stripe_npages + index;
}

/*
 * these are just the pages from the rbio array, not from anything
 * the FS sent down to us
 */
static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe,
				     int index)
{
	return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)];
}

/* helper to index into the pstripe */
static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
{
	return rbio_stripe_page(rbio, rbio->nr_data, index);
}

/* helper to index into the qstripe, returns null if there is no qstripe */
static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
{
	if (rbio->nr_data + 1 == rbio->real_stripes)
		return NULL;
	return rbio_stripe_page(rbio, rbio->nr_data + 1, index);
}
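/*
 * Indexing example (illustrative values only): with stripe_npages == 16,
 * rbio_stripe_page(rbio, 2, 3) returns stripe_pages[2 * 16 + 3], i.e.
 * stripe_pages[35]; rbio_pstripe_page() is simply the same lookup with
 * stripe == nr_data.
 */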
/*
 * The first stripe in the table for a logical address
 * has the lock.  rbios are added in one of three ways:
 *
 * 1) Nobody has the stripe locked yet.  The rbio is given
 * the lock and 0 is returned.  The caller must start the IO
 * themselves.
 *
 * 2) Someone has the stripe locked, but we're able to merge
 * with the lock owner.  The rbio is freed and the IO will
 * start automatically along with the existing rbio.  1 is returned.
 *
 * 3) Someone has the stripe locked, but we're not able to merge.
 * The rbio is added to the lock owner's plug list, or merged into
 * an rbio already on the plug list.  When the lock owner unlocks,
 * the next rbio on the list is run and the IO is started automatically.
 * 1 is returned.
 *
 * If we return 0, the caller still owns the rbio and must continue with
 * IO submission.  If we return 1, the caller must assume the rbio has
 * already been freed.
 */
660 static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
662 struct btrfs_stripe_hash *h;
663 struct btrfs_raid_bio *cur;
664 struct btrfs_raid_bio *pending;
666 struct btrfs_raid_bio *freeit = NULL;
667 struct btrfs_raid_bio *cache_drop = NULL;
670 h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
672 spin_lock_irqsave(&h->lock, flags);
673 list_for_each_entry(cur, &h->hash_list, hash_list) {
674 if (cur->bioc->raid_map[0] != rbio->bioc->raid_map[0])
677 spin_lock(&cur->bio_list_lock);
679 /* Can we steal this cached rbio's pages? */
680 if (bio_list_empty(&cur->bio_list) &&
681 list_empty(&cur->plug_list) &&
682 test_bit(RBIO_CACHE_BIT, &cur->flags) &&
683 !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
684 list_del_init(&cur->hash_list);
685 refcount_dec(&cur->refs);
687 steal_rbio(cur, rbio);
689 spin_unlock(&cur->bio_list_lock);
694 /* Can we merge into the lock owner? */
695 if (rbio_can_merge(cur, rbio)) {
696 merge_rbio(cur, rbio);
697 spin_unlock(&cur->bio_list_lock);
705 * We couldn't merge with the running rbio, see if we can merge
706 * with the pending ones. We don't have to check for rmw_locked
707 * because there is no way they are inside finish_rmw right now
709 list_for_each_entry(pending, &cur->plug_list, plug_list) {
710 if (rbio_can_merge(pending, rbio)) {
711 merge_rbio(pending, rbio);
712 spin_unlock(&cur->bio_list_lock);
720 * No merging, put us on the tail of the plug list, our rbio
721 * will be started with the currently running rbio unlocks
723 list_add_tail(&rbio->plug_list, &cur->plug_list);
724 spin_unlock(&cur->bio_list_lock);
729 refcount_inc(&rbio->refs);
730 list_add(&rbio->hash_list, &h->hash_list);
732 spin_unlock_irqrestore(&h->lock, flags);
734 remove_rbio_from_cache(cache_drop);
736 __free_raid_bio(freeit);
741 * called as rmw or parity rebuild is completed. If the plug list has more
742 * rbios waiting for this stripe, the next one on the list will be started
744 static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
747 struct btrfs_stripe_hash *h;
751 bucket = rbio_bucket(rbio);
752 h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;
754 if (list_empty(&rbio->plug_list))
757 spin_lock_irqsave(&h->lock, flags);
758 spin_lock(&rbio->bio_list_lock);
760 if (!list_empty(&rbio->hash_list)) {
762 * if we're still cached and there is no other IO
763 * to perform, just leave this rbio here for others
764 * to steal from later
766 if (list_empty(&rbio->plug_list) &&
767 test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
769 clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
770 BUG_ON(!bio_list_empty(&rbio->bio_list));
774 list_del_init(&rbio->hash_list);
775 refcount_dec(&rbio->refs);
778 * we use the plug list to hold all the rbios
779 * waiting for the chance to lock this stripe.
780 * hand the lock over to one of them.
782 if (!list_empty(&rbio->plug_list)) {
783 struct btrfs_raid_bio *next;
784 struct list_head *head = rbio->plug_list.next;
786 next = list_entry(head, struct btrfs_raid_bio,
789 list_del_init(&rbio->plug_list);
791 list_add(&next->hash_list, &h->hash_list);
792 refcount_inc(&next->refs);
793 spin_unlock(&rbio->bio_list_lock);
794 spin_unlock_irqrestore(&h->lock, flags);
796 if (next->operation == BTRFS_RBIO_READ_REBUILD)
797 start_async_work(next, read_rebuild_work);
798 else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
799 steal_rbio(rbio, next);
800 start_async_work(next, read_rebuild_work);
801 } else if (next->operation == BTRFS_RBIO_WRITE) {
802 steal_rbio(rbio, next);
803 start_async_work(next, rmw_work);
804 } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
805 steal_rbio(rbio, next);
806 start_async_work(next, scrub_parity_work);
813 spin_unlock(&rbio->bio_list_lock);
814 spin_unlock_irqrestore(&h->lock, flags);
818 remove_rbio_from_cache(rbio);
821 static void __free_raid_bio(struct btrfs_raid_bio *rbio)
825 if (!refcount_dec_and_test(&rbio->refs))
828 WARN_ON(!list_empty(&rbio->stripe_cache));
829 WARN_ON(!list_empty(&rbio->hash_list));
830 WARN_ON(!bio_list_empty(&rbio->bio_list));
832 for (i = 0; i < rbio->nr_pages; i++) {
833 if (rbio->stripe_pages[i]) {
834 __free_page(rbio->stripe_pages[i]);
835 rbio->stripe_pages[i] = NULL;
839 btrfs_put_bioc(rbio->bioc);
843 static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
850 cur->bi_status = err;
857 * this frees the rbio and runs through all the bios in the
858 * bio_list and calls end_io on them
860 static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
862 struct bio *cur = bio_list_get(&rbio->bio_list);
865 if (rbio->generic_bio_cnt)
866 btrfs_bio_counter_sub(rbio->bioc->fs_info, rbio->generic_bio_cnt);
869 * At this moment, rbio->bio_list is empty, however since rbio does not
870 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
871 * hash list, rbio may be merged with others so that rbio->bio_list
873 * Once unlock_stripe() is done, rbio->bio_list will not be updated any
874 * more and we can call bio_endio() on all queued bios.
877 extra = bio_list_get(&rbio->bio_list);
878 __free_raid_bio(rbio);
880 rbio_endio_bio_list(cur, err);
882 rbio_endio_bio_list(extra, err);
886 * end io function used by finish_rmw. When we finally
887 * get here, we've written a full stripe
889 static void raid_write_end_io(struct bio *bio)
891 struct btrfs_raid_bio *rbio = bio->bi_private;
892 blk_status_t err = bio->bi_status;
896 fail_bio_stripe(rbio, bio);
900 if (!atomic_dec_and_test(&rbio->stripes_pending))
905 /* OK, we have read all the stripes we need to. */
906 max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
907 0 : rbio->bioc->max_errors;
908 if (atomic_read(&rbio->error) > max_errors)
911 rbio_orig_end_io(rbio, err);
915 * the read/modify/write code wants to use the original bio for
916 * any pages it included, and then use the rbio for everything
917 * else. This function decides if a given index (stripe number)
918 * and page number in that stripe fall inside the original bio
921 * if you set bio_list_only, you'll get a NULL back for any ranges
922 * that are outside the bio_list
924 * This doesn't take any refs on anything, you get a bare page pointer
925 * and the caller must bump refs as required.
927 * You must call index_rbio_pages once before you can trust
928 * the answers from this function.
930 static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
931 int index, int pagenr, int bio_list_only)
934 struct page *p = NULL;
936 chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
938 spin_lock_irq(&rbio->bio_list_lock);
939 p = rbio->bio_pages[chunk_page];
940 spin_unlock_irq(&rbio->bio_list_lock);
942 if (p || bio_list_only)
945 return rbio->stripe_pages[chunk_page];
/* number of pages we need for the entire stripe across all the drives */
static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
{
	return DIV_ROUND_UP(stripe_len, PAGE_SIZE) * nr_stripes;
}
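/*
 * Example (illustrative only): with a 64K stripe_len, 4K pages and a
 * 3-device RAID5 chunk (nr_stripes == 3) this is
 * DIV_ROUND_UP(65536, 4096) * 3 == 16 * 3 == 48 pages.
 */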
/*
 * allocation and initial setup for the btrfs_raid_bio.  Note that this
 * does not allocate any pages for rbio->pages.
 */
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
					 struct btrfs_io_context *bioc,
					 u64 stripe_len)
{
	struct btrfs_raid_bio *rbio;
	int nr_data = 0;
	int real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
	int num_pages = rbio_nr_pages(stripe_len, real_stripes);
	int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
	void *p;

	rbio = kzalloc(sizeof(*rbio) +
		       sizeof(*rbio->stripe_pages) * num_pages +
		       sizeof(*rbio->bio_pages) * num_pages +
		       sizeof(*rbio->finish_pointers) * real_stripes +
		       sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_npages) +
		       sizeof(*rbio->finish_pbitmap) *
				BITS_TO_LONGS(stripe_npages),
		       GFP_NOFS);
	if (!rbio)
		return ERR_PTR(-ENOMEM);
	bio_list_init(&rbio->bio_list);
	INIT_LIST_HEAD(&rbio->plug_list);
	spin_lock_init(&rbio->bio_list_lock);
	INIT_LIST_HEAD(&rbio->stripe_cache);
	INIT_LIST_HEAD(&rbio->hash_list);
	rbio->bioc = bioc;
	rbio->stripe_len = stripe_len;
	rbio->nr_pages = num_pages;
	rbio->real_stripes = real_stripes;
	rbio->stripe_npages = stripe_npages;
	rbio->faila = -1;
	rbio->failb = -1;
	refcount_set(&rbio->refs, 1);
	atomic_set(&rbio->error, 0);
	atomic_set(&rbio->stripes_pending, 0);

	/*
	 * the stripe_pages, bio_pages, etc arrays point to the extra
	 * memory we allocated past the end of the rbio
	 */
	p = rbio + 1;
#define CONSUME_ALLOC(ptr, count)	do {				\
		ptr = p;						\
		p = (unsigned char *)p + sizeof(*(ptr)) * (count);	\
	} while (0)
	CONSUME_ALLOC(rbio->stripe_pages, num_pages);
	CONSUME_ALLOC(rbio->bio_pages, num_pages);
	CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
	CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_npages));
	CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages));
#undef CONSUME_ALLOC
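	/*
	 * Resulting layout of the single allocation (sketch):
	 *
	 *   [struct btrfs_raid_bio][stripe_pages][bio_pages]
	 *   [finish_pointers][dbitmap][finish_pbitmap]
	 *
	 * so freeing the rbio releases all of the arrays as well.
	 */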
	if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
		nr_data = real_stripes - 1;
	else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
		nr_data = real_stripes - 2;
	else
		BUG();

	rbio->nr_data = nr_data;
	return rbio;
}
1026 /* allocate pages for all the stripes in the bio, including parity */
1027 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
1032 for (i = 0; i < rbio->nr_pages; i++) {
1033 if (rbio->stripe_pages[i])
1035 page = alloc_page(GFP_NOFS);
1038 rbio->stripe_pages[i] = page;
1043 /* only allocate pages for p/q stripes */
1044 static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
1049 i = rbio_stripe_page_index(rbio, rbio->nr_data, 0);
1051 for (; i < rbio->nr_pages; i++) {
1052 if (rbio->stripe_pages[i])
1054 page = alloc_page(GFP_NOFS);
1057 rbio->stripe_pages[i] = page;
1063 * add a single page from a specific stripe into our list of bios for IO
1064 * this will try to merge into existing bios if possible, and returns
1065 * zero if all went well.
1067 static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
1068 struct bio_list *bio_list,
1071 unsigned long page_index,
1072 unsigned long bio_max_len)
1074 struct bio *last = bio_list->tail;
1077 struct btrfs_io_stripe *stripe;
1080 stripe = &rbio->bioc->stripes[stripe_nr];
1081 disk_start = stripe->physical + (page_index << PAGE_SHIFT);
1083 /* if the device is missing, just fail this stripe */
1084 if (!stripe->dev->bdev)
1085 return fail_rbio_index(rbio, stripe_nr);
1087 /* see if we can add this page onto our existing bio */
1089 u64 last_end = last->bi_iter.bi_sector << 9;
1090 last_end += last->bi_iter.bi_size;
1093 * we can't merge these if they are from different
1094 * devices or if they are not contiguous
1096 if (last_end == disk_start && !last->bi_status &&
1097 last->bi_bdev == stripe->dev->bdev) {
1098 ret = bio_add_page(last, page, PAGE_SIZE, 0);
1099 if (ret == PAGE_SIZE)
1104 /* put a new bio on the list */
1105 bio = btrfs_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
1106 btrfs_bio(bio)->device = stripe->dev;
1107 bio->bi_iter.bi_size = 0;
1108 bio_set_dev(bio, stripe->dev->bdev);
1109 bio->bi_iter.bi_sector = disk_start >> 9;
1111 bio_add_page(bio, page, PAGE_SIZE, 0);
1112 bio_list_add(bio_list, bio);
/*
 * while we're doing the read/modify/write cycle, we could
 * have errors in reading pages off the disk.  This checks
 * for errors and if we're not able to read the page it'll
 * trigger parity reconstruction.  The rmw will be finished
 * after we've reconstructed the failed stripes
 */
static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
{
	if (rbio->faila >= 0 || rbio->failb >= 0) {
		BUG_ON(rbio->faila == rbio->real_stripes - 1);
		__raid56_parity_recover(rbio);
	} else {
		finish_rmw(rbio);
	}
}
1134 * helper function to walk our bio list and populate the bio_pages array with
1135 * the result. This seems expensive, but it is faster than constantly
1136 * searching through the bio list as we setup the IO in finish_rmw or stripe
1139 * This must be called before you trust the answers from page_in_rbio
1141 static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1145 unsigned long stripe_offset;
1146 unsigned long page_index;
1148 spin_lock_irq(&rbio->bio_list_lock);
1149 bio_list_for_each(bio, &rbio->bio_list) {
1150 struct bio_vec bvec;
1151 struct bvec_iter iter;
1154 start = bio->bi_iter.bi_sector << 9;
1155 stripe_offset = start - rbio->bioc->raid_map[0];
1156 page_index = stripe_offset >> PAGE_SHIFT;
1158 if (bio_flagged(bio, BIO_CLONED))
1159 bio->bi_iter = btrfs_bio(bio)->iter;
1161 bio_for_each_segment(bvec, bio, iter) {
1162 rbio->bio_pages[page_index + i] = bvec.bv_page;
1166 spin_unlock_irq(&rbio->bio_list_lock);
1170 * this is called from one of two situations. We either
1171 * have a full stripe from the higher layers, or we've read all
1172 * the missing bits off disk.
1174 * This will calculate the parity and then send down any
1177 static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1179 struct btrfs_io_context *bioc = rbio->bioc;
1180 void **pointers = rbio->finish_pointers;
1181 int nr_data = rbio->nr_data;
1185 struct bio_list bio_list;
1189 bio_list_init(&bio_list);
1191 if (rbio->real_stripes - rbio->nr_data == 1)
1192 has_qstripe = false;
1193 else if (rbio->real_stripes - rbio->nr_data == 2)
1198 /* at this point we either have a full stripe,
1199 * or we've read the full stripe from the drive.
1200 * recalculate the parity and write the new results.
1202 * We're not allowed to add any new bios to the
1203 * bio list here, anyone else that wants to
1204 * change this stripe needs to do their own rmw.
1206 spin_lock_irq(&rbio->bio_list_lock);
1207 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1208 spin_unlock_irq(&rbio->bio_list_lock);
1210 atomic_set(&rbio->error, 0);
	/*
	 * now that we've set rmw_locked, run through the
	 * bio list one last time and map the page pointers
	 *
	 * We don't cache full rbios because we're assuming
	 * the higher layers are unlikely to use this area of
	 * the disk again soon.  If they do use it again,
	 * hopefully they will send another full bio.
	 */
	index_rbio_pages(rbio);
	if (!rbio_is_full(rbio))
		cache_rbio_pages(rbio);
	else
		clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
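	/*
	 * For each page of the stripe, the loop below maps one page from
	 * every data stripe plus the P page (and the Q page on raid6),
	 * generates the parity for that horizontal slice and unmaps the
	 * pages again.
	 */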
1227 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
1229 /* first collect one page from each data stripe */
1230 for (stripe = 0; stripe < nr_data; stripe++) {
1231 p = page_in_rbio(rbio, stripe, pagenr, 0);
1232 pointers[stripe] = kmap_local_page(p);
1235 /* then add the parity stripe */
1236 p = rbio_pstripe_page(rbio, pagenr);
1238 pointers[stripe++] = kmap_local_page(p);
1243 * raid6, add the qstripe and call the
1244 * library function to fill in our p/q
1246 p = rbio_qstripe_page(rbio, pagenr);
1248 pointers[stripe++] = kmap_local_page(p);
1250 raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
1254 copy_page(pointers[nr_data], pointers[0]);
1255 run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
1257 for (stripe = stripe - 1; stripe >= 0; stripe--)
1258 kunmap_local(pointers[stripe]);
1262 * time to start writing. Make bios for everything from the
1263 * higher layers (the bio_list in our rbio) and our p/q. Ignore
1266 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1267 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
1269 if (stripe < rbio->nr_data) {
1270 page = page_in_rbio(rbio, stripe, pagenr, 1);
1274 page = rbio_stripe_page(rbio, stripe, pagenr);
1277 ret = rbio_add_io_page(rbio, &bio_list,
1278 page, stripe, pagenr, rbio->stripe_len);
1284 if (likely(!bioc->num_tgtdevs))
1287 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1288 if (!bioc->tgtdev_map[stripe])
1291 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
1293 if (stripe < rbio->nr_data) {
1294 page = page_in_rbio(rbio, stripe, pagenr, 1);
1298 page = rbio_stripe_page(rbio, stripe, pagenr);
1301 ret = rbio_add_io_page(rbio, &bio_list, page,
1302 rbio->bioc->tgtdev_map[stripe],
1303 pagenr, rbio->stripe_len);
1310 atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
1311 BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
1313 while ((bio = bio_list_pop(&bio_list))) {
1314 bio->bi_private = rbio;
1315 bio->bi_end_io = raid_write_end_io;
1316 bio->bi_opf = REQ_OP_WRITE;
1323 rbio_orig_end_io(rbio, BLK_STS_IOERR);
1325 while ((bio = bio_list_pop(&bio_list)))
1330 * helper to find the stripe number for a given bio. Used to figure out which
1331 * stripe has failed. This expects the bio to correspond to a physical disk,
1332 * so it looks up based on physical sector numbers.
1334 static int find_bio_stripe(struct btrfs_raid_bio *rbio,
1337 u64 physical = bio->bi_iter.bi_sector;
1339 struct btrfs_io_stripe *stripe;
1343 for (i = 0; i < rbio->bioc->num_stripes; i++) {
1344 stripe = &rbio->bioc->stripes[i];
1345 if (in_range(physical, stripe->physical, rbio->stripe_len) &&
1346 stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) {
1354 * helper to find the stripe number for a given
1355 * bio (before mapping). Used to figure out which stripe has
1356 * failed. This looks up based on logical block numbers.
1358 static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
1361 u64 logical = bio->bi_iter.bi_sector << 9;
1364 for (i = 0; i < rbio->nr_data; i++) {
1365 u64 stripe_start = rbio->bioc->raid_map[i];
1367 if (in_range(logical, stripe_start, rbio->stripe_len))
1374 * returns -EIO if we had too many failures
1376 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
1378 unsigned long flags;
1381 spin_lock_irqsave(&rbio->bio_list_lock, flags);
1383 /* we already know this stripe is bad, move on */
1384 if (rbio->faila == failed || rbio->failb == failed)
1387 if (rbio->faila == -1) {
1388 /* first failure on this rbio */
1389 rbio->faila = failed;
1390 atomic_inc(&rbio->error);
1391 } else if (rbio->failb == -1) {
1392 /* second failure on this rbio */
1393 rbio->failb = failed;
1394 atomic_inc(&rbio->error);
1399 spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
1405 * helper to fail a stripe based on a physical disk
1408 static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
1411 int failed = find_bio_stripe(rbio, bio);
1416 return fail_rbio_index(rbio, failed);
/*
 * this sets each page in the bio uptodate.  It should only be used on private
 * rbio pages, nothing that comes in from the higher layers
 */
static void set_bio_pages_uptodate(struct bio *bio)
{
	struct bio_vec *bvec;
	struct bvec_iter_all iter_all;

	ASSERT(!bio_flagged(bio, BIO_CLONED));

	bio_for_each_segment_all(bvec, bio, iter_all)
		SetPageUptodate(bvec->bv_page);
}
1435 * end io for the read phase of the rmw cycle. All the bios here are physical
1436 * stripe bios we've read from the disk so we can recalculate the parity of the
1439 * This will usually kick off finish_rmw once all the bios are read in, but it
1440 * may trigger parity reconstruction if we had any errors along the way
1442 static void raid_rmw_end_io(struct bio *bio)
1444 struct btrfs_raid_bio *rbio = bio->bi_private;
1447 fail_bio_stripe(rbio, bio);
1449 set_bio_pages_uptodate(bio);
1453 if (!atomic_dec_and_test(&rbio->stripes_pending))
1456 if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
1460 * this will normally call finish_rmw to start our write
1461 * but if there are any failed stripes we'll reconstruct
1464 validate_rbio_for_rmw(rbio);
1469 rbio_orig_end_io(rbio, BLK_STS_IOERR);
1473 * the stripe must be locked by the caller. It will
1474 * unlock after all the writes are done
1476 static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1478 int bios_to_read = 0;
1479 struct bio_list bio_list;
1485 bio_list_init(&bio_list);
1487 ret = alloc_rbio_pages(rbio);
1491 index_rbio_pages(rbio);
1493 atomic_set(&rbio->error, 0);
1495 * build a list of bios to read all the missing parts of this
1498 for (stripe = 0; stripe < rbio->nr_data; stripe++) {
1499 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
1502 * we want to find all the pages missing from
1503 * the rbio and read them from the disk. If
1504 * page_in_rbio finds a page in the bio list
1505 * we don't need to read it off the stripe.
1507 page = page_in_rbio(rbio, stripe, pagenr, 1);
1511 page = rbio_stripe_page(rbio, stripe, pagenr);
1513 * the bio cache may have handed us an uptodate
1514 * page. If so, be happy and use it
1516 if (PageUptodate(page))
1519 ret = rbio_add_io_page(rbio, &bio_list, page,
1520 stripe, pagenr, rbio->stripe_len);
1526 bios_to_read = bio_list_size(&bio_list);
1527 if (!bios_to_read) {
1529 * this can happen if others have merged with
1530 * us, it means there is nothing left to read.
1531 * But if there are missing devices it may not be
1532 * safe to do the full stripe write yet.
1538 * The bioc may be freed once we submit the last bio. Make sure not to
1539 * touch it after that.
1541 atomic_set(&rbio->stripes_pending, bios_to_read);
1542 while ((bio = bio_list_pop(&bio_list))) {
1543 bio->bi_private = rbio;
1544 bio->bi_end_io = raid_rmw_end_io;
1545 bio->bi_opf = REQ_OP_READ;
1547 btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
1551 /* the actual write will happen once the reads are done */
1555 rbio_orig_end_io(rbio, BLK_STS_IOERR);
1557 while ((bio = bio_list_pop(&bio_list)))
1563 validate_rbio_for_rmw(rbio);
1568 * if the upper layers pass in a full stripe, we thank them by only allocating
1569 * enough pages to hold the parity, and sending it all down quickly.
1571 static int full_stripe_write(struct btrfs_raid_bio *rbio)
1575 ret = alloc_rbio_parity_pages(rbio);
1577 __free_raid_bio(rbio);
1581 ret = lock_stripe_add(rbio);
1588 * partial stripe writes get handed over to async helpers.
1589 * We're really hoping to merge a few more writes into this
1590 * rbio before calculating new parity
1592 static int partial_stripe_write(struct btrfs_raid_bio *rbio)
1596 ret = lock_stripe_add(rbio);
1598 start_async_work(rbio, rmw_work);
1603 * sometimes while we were reading from the drive to
1604 * recalculate parity, enough new bios come into create
1605 * a full stripe. So we do a check here to see if we can
1606 * go directly to finish_rmw
1608 static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
1610 /* head off into rmw land if we don't have a full stripe */
1611 if (!rbio_is_full(rbio))
1612 return partial_stripe_write(rbio);
1613 return full_stripe_write(rbio);
/*
 * We use plugging call backs to collect full stripes.
 * Any time we get a partial stripe write while plugged
 * we collect it into a list.  When the unplug comes down,
 * we sort the list by logical block number and merge
 * everything we can into the same rbios
 */
struct btrfs_plug_cb {
	struct blk_plug_cb cb;
	struct btrfs_fs_info *info;
	struct list_head rbio_list;
	struct btrfs_work work;
};
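/*
 * Typical flow (summary of the code below): while a task holds a block
 * plug, partial stripe writes are parked on plug->rbio_list; when the
 * plug is released, run_plug() sorts the list by starting sector and
 * merges neighbouring rbios before submitting them.
 */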
/*
 * rbios on the plug list are sorted for easier merging.
 */
static int plug_cmp(void *priv, const struct list_head *a,
		    const struct list_head *b)
{
	const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
						       plug_list);
	const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
						       plug_list);
	u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
	u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;

	if (a_sector < b_sector)
		return -1;
	if (a_sector > b_sector)
		return 1;
	return 0;
}
1650 static void run_plug(struct btrfs_plug_cb *plug)
1652 struct btrfs_raid_bio *cur;
1653 struct btrfs_raid_bio *last = NULL;
1656 * sort our plug list then try to merge
1657 * everything we can in hopes of creating full
1660 list_sort(NULL, &plug->rbio_list, plug_cmp);
1661 while (!list_empty(&plug->rbio_list)) {
1662 cur = list_entry(plug->rbio_list.next,
1663 struct btrfs_raid_bio, plug_list);
1664 list_del_init(&cur->plug_list);
1666 if (rbio_is_full(cur)) {
1669 /* we have a full stripe, send it down */
1670 ret = full_stripe_write(cur);
1675 if (rbio_can_merge(last, cur)) {
1676 merge_rbio(last, cur);
1677 __free_raid_bio(cur);
1681 __raid56_parity_write(last);
1686 __raid56_parity_write(last);
1692 * if the unplug comes from schedule, we have to push the
1693 * work off to a helper thread
1695 static void unplug_work(struct btrfs_work *work)
1697 struct btrfs_plug_cb *plug;
1698 plug = container_of(work, struct btrfs_plug_cb, work);
1702 static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1704 struct btrfs_plug_cb *plug;
1705 plug = container_of(cb, struct btrfs_plug_cb, cb);
1707 if (from_schedule) {
1708 btrfs_init_work(&plug->work, unplug_work, NULL, NULL);
1709 btrfs_queue_work(plug->info->rmw_workers,
1717 * our main entry point for writes from the rest of the FS.
1719 int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc,
1722 struct btrfs_fs_info *fs_info = bioc->fs_info;
1723 struct btrfs_raid_bio *rbio;
1724 struct btrfs_plug_cb *plug = NULL;
1725 struct blk_plug_cb *cb;
1728 rbio = alloc_rbio(fs_info, bioc, stripe_len);
1730 btrfs_put_bioc(bioc);
1731 return PTR_ERR(rbio);
1733 bio_list_add(&rbio->bio_list, bio);
1734 rbio->bio_list_bytes = bio->bi_iter.bi_size;
1735 rbio->operation = BTRFS_RBIO_WRITE;
1737 btrfs_bio_counter_inc_noblocked(fs_info);
1738 rbio->generic_bio_cnt = 1;
1741 * don't plug on full rbios, just get them out the door
1742 * as quickly as we can
1744 if (rbio_is_full(rbio)) {
1745 ret = full_stripe_write(rbio);
1747 btrfs_bio_counter_dec(fs_info);
1751 cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug));
1753 plug = container_of(cb, struct btrfs_plug_cb, cb);
1755 plug->info = fs_info;
1756 INIT_LIST_HEAD(&plug->rbio_list);
1758 list_add_tail(&rbio->plug_list, &plug->rbio_list);
1761 ret = __raid56_parity_write(rbio);
1763 btrfs_bio_counter_dec(fs_info);
1769 * all parity reconstruction happens here. We've read in everything
1770 * we can find from the drives and this does the heavy lifting of
1771 * sorting the good from the bad.
1773 static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1778 int faila = -1, failb = -1;
1783 pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
1785 err = BLK_STS_RESOURCE;
1790 * Store copy of pointers that does not get reordered during
1791 * reconstruction so that kunmap_local works.
1793 unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
1795 err = BLK_STS_RESOURCE;
1796 goto cleanup_pointers;
1799 faila = rbio->faila;
1800 failb = rbio->failb;
1802 if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1803 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
1804 spin_lock_irq(&rbio->bio_list_lock);
1805 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1806 spin_unlock_irq(&rbio->bio_list_lock);
1809 index_rbio_pages(rbio);
1811 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
1813 * Now we just use bitmap to mark the horizontal stripes in
1814 * which we have data when doing parity scrub.
1816 if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
1817 !test_bit(pagenr, rbio->dbitmap))
1821 * Setup our array of pointers with pages from each stripe
1823 * NOTE: store a duplicate array of pointers to preserve the
1826 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1828 * if we're rebuilding a read, we have to use
1829 * pages from the bio list
1831 if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1832 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
1833 (stripe == faila || stripe == failb)) {
1834 page = page_in_rbio(rbio, stripe, pagenr, 0);
1836 page = rbio_stripe_page(rbio, stripe, pagenr);
1838 pointers[stripe] = kmap_local_page(page);
1839 unmap_array[stripe] = pointers[stripe];
1842 /* all raid6 handling here */
1843 if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
1845 * single failure, rebuild from parity raid5
1849 if (faila == rbio->nr_data) {
1851 * Just the P stripe has failed, without
1852 * a bad data or Q stripe.
1853 * TODO, we should redo the xor here.
1855 err = BLK_STS_IOERR;
1859 * a single failure in raid6 is rebuilt
1860 * in the pstripe code below
1865 /* make sure our ps and qs are in order */
1869 /* if the q stripe is failed, do a pstripe reconstruction
1871 * If both the q stripe and the P stripe are failed, we're
1872 * here due to a crc mismatch and we can't give them the
1875 if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) {
1876 if (rbio->bioc->raid_map[faila] ==
1878 err = BLK_STS_IOERR;
1882 * otherwise we have one bad data stripe and
1883 * a good P stripe. raid5!
1888 if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) {
1889 raid6_datap_recov(rbio->real_stripes,
1890 PAGE_SIZE, faila, pointers);
1892 raid6_2data_recov(rbio->real_stripes,
1893 PAGE_SIZE, faila, failb,
1899 /* rebuild from P stripe here (raid5 or raid6) */
1900 BUG_ON(failb != -1);
1902 /* Copy parity block into failed block to start with */
1903 copy_page(pointers[faila], pointers[rbio->nr_data]);
1905 /* rearrange the pointer array */
1906 p = pointers[faila];
1907 for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
1908 pointers[stripe] = pointers[stripe + 1];
1909 pointers[rbio->nr_data - 1] = p;
1911 /* xor in the rest */
1912 run_xor(pointers, rbio->nr_data - 1, PAGE_SIZE);
1914 /* if we're doing this rebuild as part of an rmw, go through
1915 * and set all of our private rbio pages in the
1916 * failed stripes as uptodate. This way finish_rmw will
1917 * know they can be trusted. If this was a read reconstruction,
1918 * other endio functions will fiddle the uptodate bits
1920 if (rbio->operation == BTRFS_RBIO_WRITE) {
1921 for (i = 0; i < rbio->stripe_npages; i++) {
1923 page = rbio_stripe_page(rbio, faila, i);
1924 SetPageUptodate(page);
1927 page = rbio_stripe_page(rbio, failb, i);
1928 SetPageUptodate(page);
1932 for (stripe = rbio->real_stripes - 1; stripe >= 0; stripe--)
1933 kunmap_local(unmap_array[stripe]);
1944 * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a
1945 * valid rbio which is consistent with ondisk content, thus such a
1946 * valid rbio can be cached to avoid further disk reads.
1948 if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1949 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
1951 * - In case of two failures, where rbio->failb != -1:
1953 * Do not cache this rbio since the above read reconstruction
1954 * (raid6_datap_recov() or raid6_2data_recov()) may have
1955 * changed some content of stripes which are not identical to
1956 * on-disk content any more, otherwise, a later write/recover
1957 * may steal stripe_pages from this rbio and end up with
1958 * corruptions or rebuild failures.
1960 * - In case of single failure, where rbio->failb == -1:
1962 * Cache this rbio iff the above read reconstruction is
1963 * executed without problems.
1965 if (err == BLK_STS_OK && rbio->failb < 0)
1966 cache_rbio_pages(rbio);
1968 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1970 rbio_orig_end_io(rbio, err);
1971 } else if (err == BLK_STS_OK) {
1975 if (rbio->operation == BTRFS_RBIO_WRITE)
1977 else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
1978 finish_parity_scrub(rbio, 0);
1982 rbio_orig_end_io(rbio, err);
1987 * This is called only for stripes we've read from disk to
1988 * reconstruct the parity.
1990 static void raid_recover_end_io(struct bio *bio)
1992 struct btrfs_raid_bio *rbio = bio->bi_private;
1995 * we only read stripe pages off the disk, set them
1996 * up to date if there were no errors
1999 fail_bio_stripe(rbio, bio);
2001 set_bio_pages_uptodate(bio);
2004 if (!atomic_dec_and_test(&rbio->stripes_pending))
2007 if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
2008 rbio_orig_end_io(rbio, BLK_STS_IOERR);
2010 __raid_recover_end_io(rbio);
2014 * reads everything we need off the disk to reconstruct
2015 * the parity. endio handlers trigger final reconstruction
2016 * when the IO is done.
2018 * This is used both for reads from the higher layers and for
2019 * parity construction required to finish a rmw cycle.
2021 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
2023 int bios_to_read = 0;
2024 struct bio_list bio_list;
2030 bio_list_init(&bio_list);
2032 ret = alloc_rbio_pages(rbio);
2036 atomic_set(&rbio->error, 0);
2039 * read everything that hasn't failed. Thanks to the
2040 * stripe cache, it is possible that some or all of these
2041 * pages are going to be uptodate.
2043 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
2044 if (rbio->faila == stripe || rbio->failb == stripe) {
2045 atomic_inc(&rbio->error);
2049 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
2053 * the rmw code may have already read this
2056 p = rbio_stripe_page(rbio, stripe, pagenr);
2057 if (PageUptodate(p))
2060 ret = rbio_add_io_page(rbio, &bio_list,
2061 rbio_stripe_page(rbio, stripe, pagenr),
2062 stripe, pagenr, rbio->stripe_len);
2068 bios_to_read = bio_list_size(&bio_list);
2069 if (!bios_to_read) {
2071 * we might have no bios to read just because the pages
2072 * were up to date, or we might have no bios to read because
2073 * the devices were gone.
2075 if (atomic_read(&rbio->error) <= rbio->bioc->max_errors) {
2076 __raid_recover_end_io(rbio);
2084 * The bioc may be freed once we submit the last bio. Make sure not to
2085 * touch it after that.
2087 atomic_set(&rbio->stripes_pending, bios_to_read);
2088 while ((bio = bio_list_pop(&bio_list))) {
2089 bio->bi_private = rbio;
2090 bio->bi_end_io = raid_recover_end_io;
2091 bio->bi_opf = REQ_OP_READ;
2093 btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
2101 if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
2102 rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
2103 rbio_orig_end_io(rbio, BLK_STS_IOERR);
2105 while ((bio = bio_list_pop(&bio_list)))
2112 * the main entry point for reads from the higher layers. This
2113 * is really only called when the normal read path had a failure,
2114 * so we assume the bio they send down corresponds to a failed part
2117 int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
2118 u64 stripe_len, int mirror_num, int generic_io)
2120 struct btrfs_fs_info *fs_info = bioc->fs_info;
2121 struct btrfs_raid_bio *rbio;
2125 ASSERT(bioc->mirror_num == mirror_num);
2126 btrfs_bio(bio)->mirror_num = mirror_num;
2129 rbio = alloc_rbio(fs_info, bioc, stripe_len);
2132 btrfs_put_bioc(bioc);
2133 return PTR_ERR(rbio);
2136 rbio->operation = BTRFS_RBIO_READ_REBUILD;
2137 bio_list_add(&rbio->bio_list, bio);
2138 rbio->bio_list_bytes = bio->bi_iter.bi_size;
2140 rbio->faila = find_logical_bio_stripe(rbio, bio);
2141 if (rbio->faila == -1) {
2143 "%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)",
2144 __func__, bio->bi_iter.bi_sector << 9,
2145 (u64)bio->bi_iter.bi_size, bioc->map_type);
2147 btrfs_put_bioc(bioc);
2153 btrfs_bio_counter_inc_noblocked(fs_info);
2154 rbio->generic_bio_cnt = 1;
2156 btrfs_get_bioc(bioc);
	/*
	 * Loop retry:
	 * for 'mirror == 2', reconstruct from all other stripes.
	 * for 'mirror_num > 2', select a stripe to fail on every retry.
	 */
	if (mirror_num > 2) {
		/*
		 * 'mirror == 3' is to fail the p stripe and
		 * reconstruct from the q stripe.  'mirror > 3' is to
		 * fail a data stripe and reconstruct from p+q stripe.
		 */
		rbio->failb = rbio->real_stripes - (mirror_num - 1);
		ASSERT(rbio->failb > 0);
		if (rbio->failb <= rbio->faila)
			rbio->failb--;
	}

	ret = lock_stripe_add(rbio);
2179 * __raid56_parity_recover will end the bio with
2180 * any errors it hits. We don't want to return
2181 * its error value up the stack because our caller
2182 * will end up calling bio_endio with any nonzero
2186 __raid56_parity_recover(rbio);
2188 * our rbio has been added to the list of
2189 * rbios that will be handled after the
2190 * currently lock owner is done
static void rmw_work(struct btrfs_work *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	raid56_rmw_stripe(rbio);
}

static void read_rebuild_work(struct btrfs_work *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	__raid56_parity_recover(rbio);
}
2213 * The following code is used to scrub/replace the parity stripe
2215 * Caller must have already increased bio_counter for getting @bioc.
2217 * Note: We need make sure all the pages that add into the scrub/replace
2218 * raid bio are correct and not be changed during the scrub/replace. That
2219 * is those pages just hold metadata or file data with checksum.
2222 struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
2223 struct btrfs_io_context *bioc,
2224 u64 stripe_len, struct btrfs_device *scrub_dev,
2225 unsigned long *dbitmap, int stripe_nsectors)
2227 struct btrfs_fs_info *fs_info = bioc->fs_info;
2228 struct btrfs_raid_bio *rbio;
2231 rbio = alloc_rbio(fs_info, bioc, stripe_len);
2234 bio_list_add(&rbio->bio_list, bio);
2236 * This is a special bio which is used to hold the completion handler
2237 * and make the scrub rbio is similar to the other types
2239 ASSERT(!bio->bi_iter.bi_size);
2240 rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
2243 * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted
2244 * to the end position, so this search can start from the first parity
2247 for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
2248 if (bioc->stripes[i].dev == scrub_dev) {
2253 ASSERT(i < rbio->real_stripes);
2255 /* Now we just support the sectorsize equals to page size */
2256 ASSERT(fs_info->sectorsize == PAGE_SIZE);
2257 ASSERT(rbio->stripe_npages == stripe_nsectors);
2258 bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
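	/*
	 * dbitmap has one bit per stripe sector (one bit per page while
	 * sectorsize == PAGE_SIZE) and marks the horizontal slices that
	 * contain data and therefore need their parity checked.
	 */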
2261 * We have already increased bio_counter when getting bioc, record it
2262 * so we can free it at rbio_orig_end_io().
2264 rbio->generic_bio_cnt = 1;
2269 /* Used for both parity scrub and missing. */
2270 void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
2276 ASSERT(logical >= rbio->bioc->raid_map[0]);
2277 ASSERT(logical + PAGE_SIZE <= rbio->bioc->raid_map[0] +
2278 rbio->stripe_len * rbio->nr_data);
2279 stripe_offset = (int)(logical - rbio->bioc->raid_map[0]);
2280 index = stripe_offset >> PAGE_SHIFT;
2281 rbio->bio_pages[index] = page;
2285 * We just scrub the parity that we have correct data on the same horizontal,
2286 * so we needn't allocate all pages for all the stripes.
2288 static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
2295 for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
2296 for (i = 0; i < rbio->real_stripes; i++) {
2297 index = i * rbio->stripe_npages + bit;
2298 if (rbio->stripe_pages[index])
2301 page = alloc_page(GFP_NOFS);
2304 rbio->stripe_pages[index] = page;
2310 static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
2313 struct btrfs_io_context *bioc = rbio->bioc;
2314 void **pointers = rbio->finish_pointers;
2315 unsigned long *pbitmap = rbio->finish_pbitmap;
2316 int nr_data = rbio->nr_data;
2320 struct page *p_page = NULL;
2321 struct page *q_page = NULL;
2322 struct bio_list bio_list;
2327 bio_list_init(&bio_list);
2329 if (rbio->real_stripes - rbio->nr_data == 1)
2330 has_qstripe = false;
2331 else if (rbio->real_stripes - rbio->nr_data == 2)
2336 if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) {
2338 bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
2342 * Because the higher layers(scrubber) are unlikely to
2343 * use this area of the disk again soon, so don't cache
2346 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2351 p_page = alloc_page(GFP_NOFS);
2354 SetPageUptodate(p_page);
2357 /* RAID6, allocate and map temp space for the Q stripe */
2358 q_page = alloc_page(GFP_NOFS);
2360 __free_page(p_page);
2363 SetPageUptodate(q_page);
2364 pointers[rbio->real_stripes - 1] = kmap_local_page(q_page);
2367 atomic_set(&rbio->error, 0);
2369 /* Map the parity stripe just once */
2370 pointers[nr_data] = kmap_local_page(p_page);
2372 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2375 /* first collect one page from each data stripe */
2376 for (stripe = 0; stripe < nr_data; stripe++) {
2377 p = page_in_rbio(rbio, stripe, pagenr, 0);
2378 pointers[stripe] = kmap_local_page(p);
2382 /* RAID6, call the library function to fill in our P/Q */
2383 raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
2387 copy_page(pointers[nr_data], pointers[0]);
2388 run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
2391 /* Check scrubbing parity and repair it */
2392 p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2393 parity = kmap_local_page(p);
2394 if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE))
2395 copy_page(parity, pointers[rbio->scrubp]);
2397 /* Parity is right, needn't writeback */
2398 bitmap_clear(rbio->dbitmap, pagenr, 1);
2399 kunmap_local(parity);
2401 for (stripe = nr_data - 1; stripe >= 0; stripe--)
2402 kunmap_local(pointers[stripe]);
2405 kunmap_local(pointers[nr_data]);
2406 __free_page(p_page);
2408 kunmap_local(pointers[rbio->real_stripes - 1]);
2409 __free_page(q_page);
2414 * time to start writing. Make bios for everything from the
2415 * higher layers (the bio_list in our rbio) and our p/q. Ignore
2418 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2421 page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2422 ret = rbio_add_io_page(rbio, &bio_list,
2423 page, rbio->scrubp, pagenr, rbio->stripe_len);
2431 for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
2434 page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2435 ret = rbio_add_io_page(rbio, &bio_list, page,
2436 bioc->tgtdev_map[rbio->scrubp],
2437 pagenr, rbio->stripe_len);
2443 nr_data = bio_list_size(&bio_list);
2445 /* Every parity is right */
2446 rbio_orig_end_io(rbio, BLK_STS_OK);
2450 atomic_set(&rbio->stripes_pending, nr_data);
2452 while ((bio = bio_list_pop(&bio_list))) {
2453 bio->bi_private = rbio;
2454 bio->bi_end_io = raid_write_end_io;
2455 bio->bi_opf = REQ_OP_WRITE;
2462 rbio_orig_end_io(rbio, BLK_STS_IOERR);
2464 while ((bio = bio_list_pop(&bio_list)))
static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
{
	if (stripe >= 0 && stripe < rbio->nr_data)
		return 1;
	return 0;
}
2476 * While we're doing the parity check and repair, we could have errors
2477 * in reading pages off the disk. This checks for errors and if we're
2478 * not able to read the page it'll trigger parity reconstruction. The
2479 * parity scrub will be finished after we've reconstructed the failed
2482 static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
2484 if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
2487 if (rbio->faila >= 0 || rbio->failb >= 0) {
2488 int dfail = 0, failp = -1;
2490 if (is_data_stripe(rbio, rbio->faila))
2492 else if (is_parity_stripe(rbio->faila))
2493 failp = rbio->faila;
2495 if (is_data_stripe(rbio, rbio->failb))
2497 else if (is_parity_stripe(rbio->failb))
2498 failp = rbio->failb;
2501 * Because we can not use a scrubbing parity to repair
2502 * the data, so the capability of the repair is declined.
2503 * (In the case of RAID5, we can not repair anything)
2505 if (dfail > rbio->bioc->max_errors - 1)
2509 * If all data is good, only parity is correctly, just
2510 * repair the parity.
2513 finish_parity_scrub(rbio, 0);
2518 * Here means we got one corrupted data stripe and one
2519 * corrupted parity on RAID6, if the corrupted parity
2520 * is scrubbing parity, luckily, use the other one to repair
2521 * the data, or we can not repair the data stripe.
2523 if (failp != rbio->scrubp)
2526 __raid_recover_end_io(rbio);
2528 finish_parity_scrub(rbio, 1);
2533 rbio_orig_end_io(rbio, BLK_STS_IOERR);
2537 * end io for the read phase of the rmw cycle. All the bios here are physical
2538 * stripe bios we've read from the disk so we can recalculate the parity of the
2541 * This will usually kick off finish_rmw once all the bios are read in, but it
2542 * may trigger parity reconstruction if we had any errors along the way
2544 static void raid56_parity_scrub_end_io(struct bio *bio)
2546 struct btrfs_raid_bio *rbio = bio->bi_private;
2549 fail_bio_stripe(rbio, bio);
2551 set_bio_pages_uptodate(bio);
2555 if (!atomic_dec_and_test(&rbio->stripes_pending))
2559 * this will normally call finish_rmw to start our write
2560 * but if there are any failed stripes we'll reconstruct
2563 validate_rbio_for_parity_scrub(rbio);
2566 static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
2568 int bios_to_read = 0;
2569 struct bio_list bio_list;
2575 bio_list_init(&bio_list);
2577 ret = alloc_rbio_essential_pages(rbio);
2581 atomic_set(&rbio->error, 0);
2583 * build a list of bios to read all the missing parts of this
2586 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
2587 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2590 * we want to find all the pages missing from
2591 * the rbio and read them from the disk. If
2592 * page_in_rbio finds a page in the bio list
2593 * we don't need to read it off the stripe.
2595 page = page_in_rbio(rbio, stripe, pagenr, 1);
2599 page = rbio_stripe_page(rbio, stripe, pagenr);
2601 * the bio cache may have handed us an uptodate
2602 * page. If so, be happy and use it
2604 if (PageUptodate(page))
2607 ret = rbio_add_io_page(rbio, &bio_list, page,
2608 stripe, pagenr, rbio->stripe_len);
2614 bios_to_read = bio_list_size(&bio_list);
2615 if (!bios_to_read) {
2617 * this can happen if others have merged with
2618 * us, it means there is nothing left to read.
2619 * But if there are missing devices it may not be
2620 * safe to do the full stripe write yet.
2626 * The bioc may be freed once we submit the last bio. Make sure not to
2627 * touch it after that.
2629 atomic_set(&rbio->stripes_pending, bios_to_read);
2630 while ((bio = bio_list_pop(&bio_list))) {
2631 bio->bi_private = rbio;
2632 bio->bi_end_io = raid56_parity_scrub_end_io;
2633 bio->bi_opf = REQ_OP_READ;
2635 btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
2639 /* the actual write will happen once the reads are done */
2643 rbio_orig_end_io(rbio, BLK_STS_IOERR);
2645 while ((bio = bio_list_pop(&bio_list)))
2651 validate_rbio_for_parity_scrub(rbio);
static void scrub_parity_work(struct btrfs_work *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	raid56_parity_scrub_stripe(rbio);
}

void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
{
	if (!lock_stripe_add(rbio))
		start_async_work(rbio, scrub_parity_work);
}
2668 /* The following code is used for dev replace of a missing RAID 5/6 device. */
2670 struct btrfs_raid_bio *
2671 raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc,
2674 struct btrfs_fs_info *fs_info = bioc->fs_info;
2675 struct btrfs_raid_bio *rbio;
2677 rbio = alloc_rbio(fs_info, bioc, length);
2681 rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
2682 bio_list_add(&rbio->bio_list, bio);
2684 * This is a special bio which is used to hold the completion handler
2685 * and make the scrub rbio is similar to the other types
2687 ASSERT(!bio->bi_iter.bi_size);
2689 rbio->faila = find_logical_bio_stripe(rbio, bio);
2690 if (rbio->faila == -1) {
2697 * When we get bioc, we have already increased bio_counter, record it
2698 * so we can free it at rbio_orig_end_io()
2700 rbio->generic_bio_cnt = 1;
void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
{
	if (!lock_stripe_add(rbio))
		start_async_work(rbio, read_rebuild_work);
}