fs/btrfs/raid56.c

   1 /*
   2  * Copyright (C) 2012 Fusion-io  All rights reserved.
   3  * Copyright (C) 2012 Intel Corp. All rights reserved.
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public
   7  * License v2 as published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public
  15  * License along with this program; if not, write to the
  16  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  17  * Boston, MA 021110-1307, USA.
  18  */
  19 #include <linux/sched.h>
  20 #include <linux/wait.h>
  21 #include <linux/bio.h>
  22 #include <linux/slab.h>
  23 #include <linux/buffer_head.h>
  24 #include <linux/blkdev.h>
  25 #include <linux/random.h>
  26 #include <linux/iocontext.h>
  27 #include <linux/capability.h>
  28 #include <linux/ratelimit.h>
  29 #include <linux/kthread.h>
  30 #include <linux/raid/pq.h>
  31 #include <linux/hash.h>
  32 #include <linux/list_sort.h>
  33 #include <linux/raid/xor.h>
  34 #include <asm/div64.h>
  35 #include "compat.h"
  36 #include "ctree.h"
  37 #include "extent_map.h"
  38 #include "disk-io.h"
  39 #include "transaction.h"
  40 #include "print-tree.h"
  41 #include "volumes.h"
  42 #include "raid56.h"
  43 #include "async-thread.h"
  44 #include "check-integrity.h"
  45 #include "rcu-string.h"
  46
  47 /* set when additional merges to this rbio are not allowed */
  48 #define RBIO_RMW_LOCKED_BIT     1
  49
  50 /*
  51  * set when this rbio is sitting in the hash, but it is just a cache
  52  * of past RMW
  53  */
  54 #define RBIO_CACHE_BIT          2
  55
  56 /*
  57  * set when it is safe to trust the stripe_pages for caching
  58  */
  59 #define RBIO_CACHE_READY_BIT    3
  60
  61
  62 #define RBIO_CACHE_SIZE 1024
  63
  64 struct btrfs_raid_bio {
  65         struct btrfs_fs_info *fs_info;
  66         struct btrfs_bio *bbio;
  67
  68         /*
  69          * logical block numbers for the start of each stripe
  70          * The last one or two are p/q.  These are sorted,
  71          * so raid_map[0] is the start of our full stripe
  72          */
  73         u64 *raid_map;
  74
  75         /* while we're doing rmw on a stripe
  76          * we put it into a hash table so we can
  77          * lock the stripe and merge more rbios
  78          * into it.
  79          */
  80         struct list_head hash_list;
  81
  82         /*
  83          * LRU list for the stripe cache
  84          */
  85         struct list_head stripe_cache;
  86
  87         /*
  88          * for scheduling work in the helper threads
  89          */
  90         struct btrfs_work work;
  91
  92         /*
  93          * bio list and bio_list_lock are used
  94          * to add more bios into the stripe
  95          * in hopes of avoiding the full rmw
  96          */
  97         struct bio_list bio_list;
  98         spinlock_t bio_list_lock;
  99
 100         /*
 101          * also protected by the bio_list_lock, the
 102          * stripe locking code uses plug_list to hand off
 103          * the stripe lock to the next pending IO
 104          */
 105         struct list_head plug_list;
 106
 107         /*
 108          * flags that tell us if it is safe to
 109          * merge with this bio
 110          */
 111         unsigned long flags;
 112
 113         /* size of each individual stripe on disk */
 114         int stripe_len;
 115
 116         /* number of data stripes (no p/q) */
 117         int nr_data;
 118
 119         /*
 120          * set if we're doing a parity rebuild
 121          * for a read from higher up, which is handled
 122          * differently from a parity rebuild as part of
 123          * rmw
 124          */
 125         int read_rebuild;
 126
 127         /* first bad stripe */
 128         int faila;
 129
 130         /* second bad stripe (for raid6 use) */
 131         int failb;
 132
 133         /*
 134          * number of pages needed to represent the full
 135          * stripe
 136          */
 137         int nr_pages;
 138
 139         /*
 140          * size of all the bios in the bio_list.  This
 141          * helps us decide if the rbio maps to a full
 142          * stripe or not
 143          */
 144         int bio_list_bytes;
 145
 146         atomic_t refs;
 147
 148         /*
 149          * these are two arrays of pointers.  We allocate the
 150          * rbio big enough to hold them both and setup their
 151          * locations when the rbio is allocated
 152          */
 153
 154         /* pointers to pages that we allocated for
 155          * reading/writing stripes directly from the disk (including P/Q)
 156          */
 157         struct page **stripe_pages;
 158
 159         /*
 160          * pointers to the pages in the bio_list.  Stored
 161          * here for faster lookup
 162          */
 163         struct page **bio_pages;
 164 };
 165
 166 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
 167 static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
 168 static void rmw_work(struct btrfs_work *work);
 169 static void read_rebuild_work(struct btrfs_work *work);
 170 static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
 171 static void async_read_rebuild(struct btrfs_raid_bio *rbio);
 172 static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
 173 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
 174 static void __free_raid_bio(struct btrfs_raid_bio *rbio);
 175 static void index_rbio_pages(struct btrfs_raid_bio *rbio);
 176 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
 177
 178 /*
 179  * the stripe hash table is used for locking, and to collect
 180  * bios in hopes of making a full stripe
 181  */
 182 int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
 183 {
 184         struct btrfs_stripe_hash_table *table;
 185         struct btrfs_stripe_hash_table *x;
 186         struct btrfs_stripe_hash *cur;
 187         struct btrfs_stripe_hash *h;
 188         int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
 189         int i;
 190
 191         if (info->stripe_hash_table)
 192                 return 0;
 193
 194         table = kzalloc(sizeof(*table) + sizeof(*h) * num_entries, GFP_NOFS);
 195         if (!table)
 196                 return -ENOMEM;
 197
 198         spin_lock_init(&table->cache_lock);
 199         INIT_LIST_HEAD(&table->stripe_cache);
 200
 201         h = table->table;
 202
 203         for (i = 0; i < num_entries; i++) {
 204                 cur = h + i;
 205                 INIT_LIST_HEAD(&cur->hash_list);
 206                 spin_lock_init(&cur->lock);
 207                 init_waitqueue_head(&cur->wait);
 208         }
 209
 210         x = cmpxchg(&info->stripe_hash_table, NULL, table);
 211         if (x)
 212                 kfree(x);
 213         return 0;
 214 }
 215
 216 /*
 217  * caching an rbio means to copy anything from the
 218  * bio_pages array into the stripe_pages array.  We
 219  * use the page uptodate bit in the stripe cache array
 220  * to indicate if it has valid data
 221  *
 222  * once the caching is done, we set the cache ready
 223  * bit.
 224  */
 225 static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
 226 {
 227         int i;
 228         char *s;
 229         char *d;
 230         int ret;
 231
 232         ret = alloc_rbio_pages(rbio);
 233         if (ret)
 234                 return;
 235
 236         for (i = 0; i < rbio->nr_pages; i++) {
 237                 if (!rbio->bio_pages[i])
 238                         continue;
 239
 240                 s = kmap(rbio->bio_pages[i]);
 241                 d = kmap(rbio->stripe_pages[i]);
 242
 243                 memcpy(d, s, PAGE_CACHE_SIZE);
 244
 245                 kunmap(rbio->bio_pages[i]);
 246                 kunmap(rbio->stripe_pages[i]);
 247                 SetPageUptodate(rbio->stripe_pages[i]);
 248         }
 249         set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
 250 }
 251
 252 /*
 253  * we hash on the first logical address of the stripe
 254  */
 255 static int rbio_bucket(struct btrfs_raid_bio *rbio)
 256 {
 257         u64 num = rbio->raid_map[0];
 258
 259         /*
 260          * we shift down quite a bit.  We're using byte
 261          * addressing, and most of the lower bits are zeros.
 262          * This tends to upset hash_64, and it consistently
 263          * returns just one or two different values.
 264          *
 265          * shifting off the lower bits fixes things.
 266          */
 267         return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
 268 }
 269
 270 /*
 271  * stealing an rbio means taking all the uptodate pages from the stripe
 272  * array in the source rbio and putting them into the destination rbio
 273  */
 274 static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
 275 {
 276         int i;
 277         struct page *s;
 278         struct page *d;
 279
 280         if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
 281                 return;
 282
 283         for (i = 0; i < dest->nr_pages; i++) {
 284                 s = src->stripe_pages[i];
 285                 if (!s || !PageUptodate(s)) {
 286                         continue;
 287                 }
 288
 289                 d = dest->stripe_pages[i];
 290                 if (d)
 291                         __free_page(d);
 292
 293                 dest->stripe_pages[i] = s;
 294                 src->stripe_pages[i] = NULL;
 295         }
 296 }
 297
 298 /*
 299  * merging means we take the bio_list from the victim and
 300  * splice it into the destination.  The victim should
 301  * be discarded afterwards.
 302  *
 303  * must be called with dest->rbio_list_lock held
 304  */
 305 static void merge_rbio(struct btrfs_raid_bio *dest,
 306                        struct btrfs_raid_bio *victim)
 307 {
 308         bio_list_merge(&dest->bio_list, &victim->bio_list);
 309         dest->bio_list_bytes += victim->bio_list_bytes;
 310         bio_list_init(&victim->bio_list);
 311 }
 312
 313 /*
 314  * used to prune items that are in the cache.  The caller
 315  * must hold the hash table lock.
 316  */
 317 static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
 318 {
 319         int bucket = rbio_bucket(rbio);
 320         struct btrfs_stripe_hash_table *table;
 321         struct btrfs_stripe_hash *h;
 322         int freeit = 0;
 323
 324         /*
 325          * check the bit again under the hash table lock.
 326          */
 327         if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
 328                 return;
 329
 330         table = rbio->fs_info->stripe_hash_table;
 331         h = table->table + bucket;
 332
 333         /* hold the lock for the bucket because we may be
 334          * removing it from the hash table
 335          */
 336         spin_lock(&h->lock);
 337
 338         /*
 339          * hold the lock for the bio list because we need
 340          * to make sure the bio list is empty
 341          */
 342         spin_lock(&rbio->bio_list_lock);
 343
 344         if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
 345                 list_del_init(&rbio->stripe_cache);
 346                 table->cache_size -= 1;
 347                 freeit = 1;
 348
 349                 /* if the bio list isn't empty, this rbio is
 350                  * still involved in an IO.  We take it out
 351                  * of the cache list, and drop the ref that
 352                  * was held for the list.
 353                  *
 354                  * If the bio_list was empty, we also remove
 355                  * the rbio from the hash_table, and drop
 356                  * the corresponding ref
 357                  */
 358                 if (bio_list_empty(&rbio->bio_list)) {
 359                         if (!list_empty(&rbio->hash_list)) {
 360                                 list_del_init(&rbio->hash_list);
 361                                 atomic_dec(&rbio->refs);
 362                                 BUG_ON(!list_empty(&rbio->plug_list));
 363                         }
 364                 }
 365         }
 366
 367         spin_unlock(&rbio->bio_list_lock);
 368         spin_unlock(&h->lock);
 369
 370         if (freeit)
 371                 __free_raid_bio(rbio);
 372 }
 373
 374 /*
 375  * prune a given rbio from the cache
 376  */
 377 static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
 378 {
 379         struct btrfs_stripe_hash_table *table;
 380         unsigned long flags;
 381
 382         if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
 383                 return;
 384
 385         table = rbio->fs_info->stripe_hash_table;
 386
 387         spin_lock_irqsave(&table->cache_lock, flags);
 388         __remove_rbio_from_cache(rbio);
 389         spin_unlock_irqrestore(&table->cache_lock, flags);
 390 }
 391
 392 /*
 393  * remove everything in the cache
 394  */
 395 void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
 396 {
 397         struct btrfs_stripe_hash_table *table;
 398         unsigned long flags;
 399         struct btrfs_raid_bio *rbio;
 400
 401         table = info->stripe_hash_table;
 402
 403         spin_lock_irqsave(&table->cache_lock, flags);
 404         while (!list_empty(&table->stripe_cache)) {
 405                 rbio = list_entry(table->stripe_cache.next,
 406                                   struct btrfs_raid_bio,
 407                                   stripe_cache);
 408                 __remove_rbio_from_cache(rbio);
 409         }
 410         spin_unlock_irqrestore(&table->cache_lock, flags);
 411 }
 412
 413 /*
 414  * remove all cached entries and free the hash table
 415  * used by unmount
 416  */
 417 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
 418 {
 419         if (!info->stripe_hash_table)
 420                 return;
 421         btrfs_clear_rbio_cache(info);
 422         kfree(info->stripe_hash_table);
 423         info->stripe_hash_table = NULL;
 424 }
 425
 426 /*
 427  * insert an rbio into the stripe cache.  It
 428  * must have already been prepared by calling
 429  * cache_rbio_pages
 430  *
 431  * If this rbio was already cached, it gets
 432  * moved to the front of the lru.
 433  *
 434  * If the size of the rbio cache is too big, we
 435  * prune an item.
 436  */
 437 static void cache_rbio(struct btrfs_raid_bio *rbio)
 438 {
 439         struct btrfs_stripe_hash_table *table;
 440         unsigned long flags;
 441
 442         if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
 443                 return;
 444
 445         table = rbio->fs_info->stripe_hash_table;
 446
 447         spin_lock_irqsave(&table->cache_lock, flags);
 448         spin_lock(&rbio->bio_list_lock);
 449
 450         /* bump our ref if we were not in the list before */
 451         if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
 452                 atomic_inc(&rbio->refs);
 453
 454         if (!list_empty(&rbio->stripe_cache)){
 455                 list_move(&rbio->stripe_cache, &table->stripe_cache);
 456         } else {
 457                 list_add(&rbio->stripe_cache, &table->stripe_cache);
 458                 table->cache_size += 1;
 459         }
 460
 461         spin_unlock(&rbio->bio_list_lock);
 462
 463         if (table->cache_size > RBIO_CACHE_SIZE) {
 464                 struct btrfs_raid_bio *found;
 465
 466                 found = list_entry(table->stripe_cache.prev,
 467                                   struct btrfs_raid_bio,
 468                                   stripe_cache);
 469
 470                 if (found != rbio)
 471                         __remove_rbio_from_cache(found);
 472         }
 473
 474         spin_unlock_irqrestore(&table->cache_lock, flags);
 475         return;
 476 }
 477
 478 /*
 479  * helper function to run the xor_blocks api.  It is only
 480  * able to do MAX_XOR_BLOCKS at a time, so we need to
 481  * loop through.
 482  */
 483 static void run_xor(void **pages, int src_cnt, ssize_t len)
 484 {
 485         int src_off = 0;
 486         int xor_src_cnt = 0;
 487         void *dest = pages[src_cnt];
 488
 489         while(src_cnt > 0) {
 490                 xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
 491                 xor_blocks(xor_src_cnt, len, dest, pages + src_off);
 492
 493                 src_cnt -= xor_src_cnt;
 494                 src_off += xor_src_cnt;
 495         }
 496 }
 497
 498 /*
 499  * returns true if the bio list inside this rbio
 500  * covers an entire stripe (no rmw required).
 501  * Must be called with the bio list lock held, or
 502  * at a time when you know it is impossible to add
 503  * new bios into the list
 504  */
 505 static int __rbio_is_full(struct btrfs_raid_bio *rbio)
 506 {
 507         unsigned long size = rbio->bio_list_bytes;
 508         int ret = 1;
 509
 510         if (size != rbio->nr_data * rbio->stripe_len)
 511                 ret = 0;
 512
 513         BUG_ON(size > rbio->nr_data * rbio->stripe_len);
 514         return ret;
 515 }
 516
 517 static int rbio_is_full(struct btrfs_raid_bio *rbio)
 518 {
 519         unsigned long flags;
 520         int ret;
 521
 522         spin_lock_irqsave(&rbio->bio_list_lock, flags);
 523         ret = __rbio_is_full(rbio);
 524         spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
 525         return ret;
 526 }
 527
 528 /*
 529  * returns 1 if it is safe to merge two rbios together.
 530  * The merging is safe if the two rbios correspond to
 531  * the same stripe and if they are both going in the same
 532  * direction (read vs write), and if neither one is
 533  * locked for final IO
 534  *
 535  * The caller is responsible for locking such that
 536  * rmw_locked is safe to test
 537  */
 538 static int rbio_can_merge(struct btrfs_raid_bio *last,
 539                           struct btrfs_raid_bio *cur)
 540 {
 541         if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
 542             test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
 543                 return 0;
 544
 545         /*
 546          * we can't merge with cached rbios, since the
 547          * idea is that when we merge the destination
 548          * rbio is going to run our IO for us.  We can
 549          * steal from cached rbio's though, other functions
 550          * handle that.
 551          */
 552         if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
 553             test_bit(RBIO_CACHE_BIT, &cur->flags))
 554                 return 0;
 555
 556         if (last->raid_map[0] !=
 557             cur->raid_map[0])
 558                 return 0;
 559
 560         /* reads can't merge with writes */
 561         if (last->read_rebuild !=
 562             cur->read_rebuild) {
 563                 return 0;
 564         }
 565
 566         return 1;
 567 }
 568
 569 /*
 570  * helper to index into the pstripe
 571  */
 572 static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
 573 {
 574         index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
 575         return rbio->stripe_pages[index];
 576 }
 577
 578 /*
 579  * helper to index into the qstripe, returns null
 580  * if there is no qstripe
 581  */
 582 static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
 583 {
 584         if (rbio->nr_data + 1 == rbio->bbio->num_stripes)
 585                 return NULL;
 586
 587         index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
 588                 PAGE_CACHE_SHIFT;
 589         return rbio->stripe_pages[index];
 590 }
 591
 592 /*
 593  * The first stripe in the table for a logical address
 594  * has the lock.  rbios are added in one of three ways:
 595  *
 596  * 1) Nobody has the stripe locked yet.  The rbio is given
 597  * the lock and 0 is returned.  The caller must start the IO
 598  * themselves.
 599  *
 600  * 2) Someone has the stripe locked, but we're able to merge
 601  * with the lock owner.  The rbio is freed and the IO will
 602  * start automatically along with the existing rbio.  1 is returned.
 603  *
 604  * 3) Someone has the stripe locked, but we're not able to merge.
 605  * The rbio is added to the lock owner's plug list, or merged into
 606  * an rbio already on the plug list.  When the lock owner unlocks,
 607  * the next rbio on the list is run and the IO is started automatically.
 608  * 1 is returned
 609  *
 610  * If we return 0, the caller still owns the rbio and must continue with
 611  * IO submission.  If we return 1, the caller must assume the rbio has
 612  * already been freed.
 613  */
 614 static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
 615 {
 616         int bucket = rbio_bucket(rbio);
 617         struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
 618         struct btrfs_raid_bio *cur;
 619         struct btrfs_raid_bio *pending;
 620         unsigned long flags;
 621         DEFINE_WAIT(wait);
 622         struct btrfs_raid_bio *freeit = NULL;
 623         struct btrfs_raid_bio *cache_drop = NULL;
 624         int ret = 0;
 625         int walk = 0;
 626
 627         spin_lock_irqsave(&h->lock, flags);
 628         list_for_each_entry(cur, &h->hash_list, hash_list) {
 629                 walk++;
 630                 if (cur->raid_map[0] == rbio->raid_map[0]) {
 631                         spin_lock(&cur->bio_list_lock);
 632
 633                         /* can we steal this cached rbio's pages? */
 634                         if (bio_list_empty(&cur->bio_list) &&
 635                             list_empty(&cur->plug_list) &&
 636                             test_bit(RBIO_CACHE_BIT, &cur->flags) &&
 637                             !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
 638                                 list_del_init(&cur->hash_list);
 639                                 atomic_dec(&cur->refs);
 640
 641                                 steal_rbio(cur, rbio);
 642                                 cache_drop = cur;
 643                                 spin_unlock(&cur->bio_list_lock);
 644
 645                                 goto lockit;
 646                         }
 647
 648                         /* can we merge into the lock owner? */
 649                         if (rbio_can_merge(cur, rbio)) {
 650                                 merge_rbio(cur, rbio);
 651                                 spin_unlock(&cur->bio_list_lock);
 652                                 freeit = rbio;
 653                                 ret = 1;
 654                                 goto out;
 655                         }
 656
 657
 658                         /*
 659                          * we couldn't merge with the running
 660                          * rbio, see if we can merge with the
 661                          * pending ones.  We don't have to
 662                          * check for rmw_locked because there
 663                          * is no way they are inside finish_rmw
 664                          * right now
 665                          */
 666                         list_for_each_entry(pending, &cur->plug_list,
 667                                             plug_list) {
 668                                 if (rbio_can_merge(pending, rbio)) {
 669                                         merge_rbio(pending, rbio);
 670                                         spin_unlock(&cur->bio_list_lock);
 671                                         freeit = rbio;
 672                                         ret = 1;
 673                                         goto out;
 674                                 }
 675                         }
 676
 677                         /* no merging, put us on the tail of the plug list,
 678                          * our rbio will be started with the currently
 679                          * running rbio unlocks
 680                          */
 681                         list_add_tail(&rbio->plug_list, &cur->plug_list);
 682                         spin_unlock(&cur->bio_list_lock);
 683                         ret = 1;
 684                         goto out;
 685                 }
 686         }
 687 lockit:
 688         atomic_inc(&rbio->refs);
 689         list_add(&rbio->hash_list, &h->hash_list);
 690 out:
 691         spin_unlock_irqrestore(&h->lock, flags);
 692         if (cache_drop)
 693                 remove_rbio_from_cache(cache_drop);
 694         if (freeit)
 695                 __free_raid_bio(freeit);
 696         return ret;
 697 }
 698
 699 /*
 700  * called as rmw or parity rebuild is completed.  If the plug list has more
 701  * rbios waiting for this stripe, the next one on the list will be started
 702  */
 703 static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
 704 {
 705         int bucket;
 706         struct btrfs_stripe_hash *h;
 707         unsigned long flags;
 708         int keep_cache = 0;
 709
 710         bucket = rbio_bucket(rbio);
 711         h = rbio->fs_info->stripe_hash_table->table + bucket;
 712
 713         if (list_empty(&rbio->plug_list))
 714                 cache_rbio(rbio);
 715
 716         spin_lock_irqsave(&h->lock, flags);
 717         spin_lock(&rbio->bio_list_lock);
 718
 719         if (!list_empty(&rbio->hash_list)) {
 720                 /*
 721                  * if we're still cached and there is no other IO
 722                  * to perform, just leave this rbio here for others
 723                  * to steal from later
 724                  */
 725                 if (list_empty(&rbio->plug_list) &&
 726                     test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
 727                         keep_cache = 1;
 728                         clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
 729                         BUG_ON(!bio_list_empty(&rbio->bio_list));
 730                         goto done;
 731                 }
 732
 733                 list_del_init(&rbio->hash_list);
 734                 atomic_dec(&rbio->refs);
 735
 736                 /*
 737                  * we use the plug list to hold all the rbios
 738                  * waiting for the chance to lock this stripe.
 739                  * hand the lock over to one of them.
 740                  */
 741                 if (!list_empty(&rbio->plug_list)) {
 742                         struct btrfs_raid_bio *next;
 743                         struct list_head *head = rbio->plug_list.next;
 744
 745                         next = list_entry(head, struct btrfs_raid_bio,
 746                                           plug_list);
 747
 748                         list_del_init(&rbio->plug_list);
 749
 750                         list_add(&next->hash_list, &h->hash_list);
 751                         atomic_inc(&next->refs);
 752                         spin_unlock(&rbio->bio_list_lock);
 753                         spin_unlock_irqrestore(&h->lock, flags);
 754
 755                         if (next->read_rebuild)
 756                                 async_read_rebuild(next);
 757                         else {
 758                                 steal_rbio(rbio, next);
 759                                 async_rmw_stripe(next);
 760                         }
 761
 762                         goto done_nolock;
 763                 } else  if (waitqueue_active(&h->wait)) {
 764                         spin_unlock(&rbio->bio_list_lock);
 765                         spin_unlock_irqrestore(&h->lock, flags);
 766                         wake_up(&h->wait);
 767                         goto done_nolock;
 768                 }
 769         }
 770 done:
 771         spin_unlock(&rbio->bio_list_lock);
 772         spin_unlock_irqrestore(&h->lock, flags);
 773
 774 done_nolock:
 775         if (!keep_cache)
 776                 remove_rbio_from_cache(rbio);
 777 }
 778
 779 static void __free_raid_bio(struct btrfs_raid_bio *rbio)
 780 {
 781         int i;
 782
 783         WARN_ON(atomic_read(&rbio->refs) < 0);
 784         if (!atomic_dec_and_test(&rbio->refs))
 785                 return;
 786
 787         WARN_ON(!list_empty(&rbio->stripe_cache));
 788         WARN_ON(!list_empty(&rbio->hash_list));
 789         WARN_ON(!bio_list_empty(&rbio->bio_list));
 790
 791         for (i = 0; i < rbio->nr_pages; i++) {
 792                 if (rbio->stripe_pages[i]) {
 793                         __free_page(rbio->stripe_pages[i]);
 794                         rbio->stripe_pages[i] = NULL;
 795                 }
 796         }
 797         kfree(rbio->raid_map);
 798         kfree(rbio->bbio);
 799         kfree(rbio);
 800 }
 801
 802 static void free_raid_bio(struct btrfs_raid_bio *rbio)
 803 {
 804         unlock_stripe(rbio);
 805         __free_raid_bio(rbio);
 806 }
 807
 808 /*
 809  * this frees the rbio and runs through all the bios in the
 810  * bio_list and calls end_io on them
 811  */
 812 static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
 813 {
 814         struct bio *cur = bio_list_get(&rbio->bio_list);
 815         struct bio *next;
 816         free_raid_bio(rbio);
 817
 818         while (cur) {
 819                 next = cur->bi_next;
 820                 cur->bi_next = NULL;
 821                 if (uptodate)
 822                         set_bit(BIO_UPTODATE, &cur->bi_flags);
 823                 bio_endio(cur, err);
 824                 cur = next;
 825         }
 826 }
 827
 828 /*
 829  * end io function used by finish_rmw.  When we finally
 830  * get here, we've written a full stripe
 831  */
 832 static void raid_write_end_io(struct bio *bio, int err)
 833 {
 834         struct btrfs_raid_bio *rbio = bio->bi_private;
 835
 836         if (err)
 837                 fail_bio_stripe(rbio, bio);
 838
 839         bio_put(bio);
 840
 841         if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
 842                 return;
 843
 844         err = 0;
 845
 846         /* OK, we have read all the stripes we need to. */
 847         if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
 848                 err = -EIO;
 849
 850         rbio_orig_end_io(rbio, err, 0);
 851         return;
 852 }
 853
 854 /*
 855  * the read/modify/write code wants to use the original bio for
 856  * any pages it included, and then use the rbio for everything
 857  * else.  This function decides if a given index (stripe number)
 858  * and page number in that stripe fall inside the original bio
 859  * or the rbio.
 860  *
 861  * if you set bio_list_only, you'll get a NULL back for any ranges
 862  * that are outside the bio_list
 863  *
 864  * This doesn't take any refs on anything, you get a bare page pointer
 865  * and the caller must bump refs as required.
 866  *
 867  * You must call index_rbio_pages once before you can trust
 868  * the answers from this function.
 869  */
 870 static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
 871                                  int index, int pagenr, int bio_list_only)
 872 {
 873         int chunk_page;
 874         struct page *p = NULL;
 875
 876         chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
 877
 878         spin_lock_irq(&rbio->bio_list_lock);
 879         p = rbio->bio_pages[chunk_page];
 880         spin_unlock_irq(&rbio->bio_list_lock);
 881
 882         if (p || bio_list_only)
 883                 return p;
 884
 885         return rbio->stripe_pages[chunk_page];
 886 }
 887
 888 /*
 889  * number of pages we need for the entire stripe across all the
 890  * drives
 891  */
 892 static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
 893 {
 894         unsigned long nr = stripe_len * nr_stripes;
 895         return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 896 }
 897
 898 /*
 899  * allocation and initial setup for the btrfs_raid_bio.  Not
 900  * this does not allocate any pages for rbio->pages.
 901  */
 902 static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
 903                           struct btrfs_bio *bbio, u64 *raid_map,
 904                           u64 stripe_len)
 905 {
 906         struct btrfs_raid_bio *rbio;
 907         int nr_data = 0;
 908         int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes);
 909         void *p;
 910
 911         rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2,
 912                         GFP_NOFS);
 913         if (!rbio) {
 914                 kfree(raid_map);
 915                 kfree(bbio);
 916                 return ERR_PTR(-ENOMEM);
 917         }
 918
 919         bio_list_init(&rbio->bio_list);
 920         INIT_LIST_HEAD(&rbio->plug_list);
 921         spin_lock_init(&rbio->bio_list_lock);
 922         INIT_LIST_HEAD(&rbio->stripe_cache);
 923         INIT_LIST_HEAD(&rbio->hash_list);
 924         rbio->bbio = bbio;
 925         rbio->raid_map = raid_map;
 926         rbio->fs_info = root->fs_info;
 927         rbio->stripe_len = stripe_len;
 928         rbio->nr_pages = num_pages;
 929         rbio->faila = -1;
 930         rbio->failb = -1;
 931         atomic_set(&rbio->refs, 1);
 932
 933         /*
 934          * the stripe_pages and bio_pages array point to the extra
 935          * memory we allocated past the end of the rbio
 936          */
 937         p = rbio + 1;
 938         rbio->stripe_pages = p;
 939         rbio->bio_pages = p + sizeof(struct page *) * num_pages;
 940
 941         if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
 942                 nr_data = bbio->num_stripes - 2;
 943         else
 944                 nr_data = bbio->num_stripes - 1;
 945
 946         rbio->nr_data = nr_data;
 947         return rbio;
 948 }
 949
 950 /* allocate pages for all the stripes in the bio, including parity */
 951 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
 952 {
 953         int i;
 954         struct page *page;
 955
 956         for (i = 0; i < rbio->nr_pages; i++) {
 957                 if (rbio->stripe_pages[i])
 958                         continue;
 959                 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
 960                 if (!page)
 961                         return -ENOMEM;
 962                 rbio->stripe_pages[i] = page;
 963                 ClearPageUptodate(page);
 964         }
 965         return 0;
 966 }
 967
 968 /* allocate pages for just the p/q stripes */
 969 static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
 970 {
 971         int i;
 972         struct page *page;
 973
 974         i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
 975
 976         for (; i < rbio->nr_pages; i++) {
 977                 if (rbio->stripe_pages[i])
 978                         continue;
 979                 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
 980                 if (!page)
 981                         return -ENOMEM;
 982                 rbio->stripe_pages[i] = page;
 983         }
 984         return 0;
 985 }
 986
 987 /*
 988  * add a single page from a specific stripe into our list of bios for IO
 989  * this will try to merge into existing bios if possible, and returns
 990  * zero if all went well.
 991  */
 992 int rbio_add_io_page(struct btrfs_raid_bio *rbio,
 993                      struct bio_list *bio_list,
 994                      struct page *page,
 995                      int stripe_nr,
 996                      unsigned long page_index,
 997                      unsigned long bio_max_len)
 998 {
 999         struct bio *last = bio_list->tail;
1000         u64 last_end = 0;
1001         int ret;
1002         struct bio *bio;
1003         struct btrfs_bio_stripe *stripe;
1004         u64 disk_start;
1005
1006         stripe = &rbio->bbio->stripes[stripe_nr];
1007         disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);
1008
1009         /* if the device is missing, just fail this stripe */
1010         if (!stripe->dev->bdev)
1011                 return fail_rbio_index(rbio, stripe_nr);
1012
1013         /* see if we can add this page onto our existing bio */
1014         if (last) {
1015                 last_end = (u64)last->bi_sector << 9;
1016                 last_end += last->bi_size;
1017
1018                 /*
1019                  * we can't merge these if they are from different
1020                  * devices or if they are not contiguous
1021                  */
1022                 if (last_end == disk_start && stripe->dev->bdev &&
1023                     test_bit(BIO_UPTODATE, &last->bi_flags) &&
1024                     last->bi_bdev == stripe->dev->bdev) {
1025                         ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
1026                         if (ret == PAGE_CACHE_SIZE)
1027                                 return 0;
1028                 }
1029         }
1030
1031         /* put a new bio on the list */
1032         bio = bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1);
1033         if (!bio)
1034                 return -ENOMEM;
1035
1036         bio->bi_size = 0;
1037         bio->bi_bdev = stripe->dev->bdev;
1038         bio->bi_sector = disk_start >> 9;
1039         set_bit(BIO_UPTODATE, &bio->bi_flags);
1040
1041         bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
1042         bio_list_add(bio_list, bio);
1043         return 0;
1044 }
1045
1046 /*
1047  * while we're doing the read/modify/write cycle, we could
1048  * have errors in reading pages off the disk.  This checks
1049  * for errors and if we're not able to read the page it'll
1050  * trigger parity reconstruction.  The rmw will be finished
1051  * after we've reconstructed the failed stripes
1052  */
1053 static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
1054 {
1055         if (rbio->faila >= 0 || rbio->failb >= 0) {
1056                 BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1);
1057                 __raid56_parity_recover(rbio);
1058         } else {
1059                 finish_rmw(rbio);
1060         }
1061 }
1062
1063 /*
1064  * these are just the pages from the rbio array, not from anything
1065  * the FS sent down to us
1066  */
1067 static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page)
1068 {
1069         int index;
1070         index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
1071         index += page;
1072         return rbio->stripe_pages[index];
1073 }
1074
1075 /*
1076  * helper function to walk our bio list and populate the bio_pages array with
1077  * the result.  This seems expensive, but it is faster than constantly
1078  * searching through the bio list as we setup the IO in finish_rmw or stripe
1079  * reconstruction.
1080  *
1081  * This must be called before you trust the answers from page_in_rbio
1082  */
1083 static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1084 {
1085         struct bio *bio;
1086         u64 start;
1087         unsigned long stripe_offset;
1088         unsigned long page_index;
1089         struct page *p;
1090         int i;
1091
1092         spin_lock_irq(&rbio->bio_list_lock);
1093         bio_list_for_each(bio, &rbio->bio_list) {
1094                 start = (u64)bio->bi_sector << 9;
1095                 stripe_offset = start - rbio->raid_map[0];
1096                 page_index = stripe_offset >> PAGE_CACHE_SHIFT;
1097
1098                 for (i = 0; i < bio->bi_vcnt; i++) {
1099                         p = bio->bi_io_vec[i].bv_page;
1100                         rbio->bio_pages[page_index + i] = p;
1101                 }
1102         }
1103         spin_unlock_irq(&rbio->bio_list_lock);
1104 }
1105
1106 /*
1107  * this is called from one of two situations.  We either
1108  * have a full stripe from the higher layers, or we've read all
1109  * the missing bits off disk.
1110  *
1111  * This will calculate the parity and then send down any
1112  * changed blocks.
1113  */
1114 static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1115 {
1116         struct btrfs_bio *bbio = rbio->bbio;
1117         void *pointers[bbio->num_stripes];
1118         int stripe_len = rbio->stripe_len;
1119         int nr_data = rbio->nr_data;
1120         int stripe;
1121         int pagenr;
1122         int p_stripe = -1;
1123         int q_stripe = -1;
1124         struct bio_list bio_list;
1125         struct bio *bio;
1126         int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
1127         int ret;
1128
1129         bio_list_init(&bio_list);
1130
1131         if (bbio->num_stripes - rbio->nr_data == 1) {
1132                 p_stripe = bbio->num_stripes - 1;
1133         } else if (bbio->num_stripes - rbio->nr_data == 2) {
1134                 p_stripe = bbio->num_stripes - 2;
1135                 q_stripe = bbio->num_stripes - 1;
1136         } else {
1137                 BUG();
1138         }
1139
1140         /* at this point we either have a full stripe,
1141          * or we've read the full stripe from the drive.
1142          * recalculate the parity and write the new results.
1143          *
1144          * We're not allowed to add any new bios to the
1145          * bio list here, anyone else that wants to
1146          * change this stripe needs to do their own rmw.
1147          */
1148         spin_lock_irq(&rbio->bio_list_lock);
1149         set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1150         spin_unlock_irq(&rbio->bio_list_lock);
1151
1152         atomic_set(&rbio->bbio->error, 0);
1153
1154         /*
1155          * now that we've set rmw_locked, run through the
1156          * bio list one last time and map the page pointers
1157          *
1158          * We don't cache full rbios because we're assuming
1159          * the higher layers are unlikely to use this area of
1160          * the disk again soon.  If they do use it again,
1161          * hopefully they will send another full bio.
1162          */
1163         index_rbio_pages(rbio);
1164         if (!rbio_is_full(rbio))
1165                 cache_rbio_pages(rbio);
1166         else
1167                 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1168
1169         for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1170                 struct page *p;
1171                 /* first collect one page from each data stripe */
1172                 for (stripe = 0; stripe < nr_data; stripe++) {
1173                         p = page_in_rbio(rbio, stripe, pagenr, 0);
1174                         pointers[stripe] = kmap(p);
1175                 }
1176
1177                 /* then add the parity stripe */
1178                 p = rbio_pstripe_page(rbio, pagenr);
1179                 SetPageUptodate(p);
1180                 pointers[stripe++] = kmap(p);
1181
1182                 if (q_stripe != -1) {
1183
1184                         /*
1185                          * raid6, add the qstripe and call the
1186                          * library function to fill in our p/q
1187                          */
1188                         p = rbio_qstripe_page(rbio, pagenr);
1189                         SetPageUptodate(p);
1190                         pointers[stripe++] = kmap(p);
1191
1192                         raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE,
1193                                                 pointers);
1194                 } else {
1195                         /* raid5 */
1196                         memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
1197                         run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
1198                 }
1199
1200
1201                 for (stripe = 0; stripe < bbio->num_stripes; stripe++)
1202                         kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
1203         }
1204
1205         /*
1206          * time to start writing.  Make bios for everything from the
1207          * higher layers (the bio_list in our rbio) and our p/q.  Ignore
1208          * everything else.
1209          */
1210         for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
1211                 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1212                         struct page *page;
1213                         if (stripe < rbio->nr_data) {
1214                                 page = page_in_rbio(rbio, stripe, pagenr, 1);
1215                                 if (!page)
1216                                         continue;
1217                         } else {
1218                                page = rbio_stripe_page(rbio, stripe, pagenr);
1219                         }
1220
1221                         ret = rbio_add_io_page(rbio, &bio_list,
1222                                        page, stripe, pagenr, rbio->stripe_len);
1223                         if (ret)
1224                                 goto cleanup;
1225                 }
1226         }
1227
1228         atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list));
1229         BUG_ON(atomic_read(&bbio->stripes_pending) == 0);
1230
1231         while (1) {
1232                 bio = bio_list_pop(&bio_list);
1233                 if (!bio)
1234                         break;
1235
1236                 bio->bi_private = rbio;
1237                 bio->bi_end_io = raid_write_end_io;
1238                 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1239                 submit_bio(WRITE, bio);
1240         }
1241         return;
1242
1243 cleanup:
1244         rbio_orig_end_io(rbio, -EIO, 0);
1245 }
1246
1247 /*
1248  * helper to find the stripe number for a given bio.  Used to figure out which
1249  * stripe has failed.  This expects the bio to correspond to a physical disk,
1250  * so it looks up based on physical sector numbers.
1251  */
1252 static int find_bio_stripe(struct btrfs_raid_bio *rbio,
1253                            struct bio *bio)
1254 {
1255         u64 physical = bio->bi_sector;
1256         u64 stripe_start;
1257         int i;
1258         struct btrfs_bio_stripe *stripe;
1259
1260         physical <<= 9;
1261
1262         for (i = 0; i < rbio->bbio->num_stripes; i++) {
1263                 stripe = &rbio->bbio->stripes[i];
1264                 stripe_start = stripe->physical;
1265                 if (physical >= stripe_start &&
1266                     physical < stripe_start + rbio->stripe_len) {
1267                         return i;
1268                 }
1269         }
1270         return -1;
1271 }
1272
1273 /*
1274  * helper to find the stripe number for a given
1275  * bio (before mapping).  Used to figure out which stripe has
1276  * failed.  This looks up based on logical block numbers.
1277  */
1278 static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
1279                                    struct bio *bio)
1280 {
1281         u64 logical = bio->bi_sector;
1282         u64 stripe_start;
1283         int i;
1284
1285         logical <<= 9;
1286
1287         for (i = 0; i < rbio->nr_data; i++) {
1288                 stripe_start = rbio->raid_map[i];
1289                 if (logical >= stripe_start &&
1290                     logical < stripe_start + rbio->stripe_len) {
1291                         return i;
1292                 }
1293         }
1294         return -1;
1295 }
1296
1297 /*
1298  * returns -EIO if we had too many failures
1299  */
1300 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
1301 {
1302         unsigned long flags;
1303         int ret = 0;
1304
1305         spin_lock_irqsave(&rbio->bio_list_lock, flags);
1306
1307         /* we already know this stripe is bad, move on */
1308         if (rbio->faila == failed || rbio->failb == failed)
1309                 goto out;
1310
1311         if (rbio->faila == -1) {
1312                 /* first failure on this rbio */
1313                 rbio->faila = failed;
1314                 atomic_inc(&rbio->bbio->error);
1315         } else if (rbio->failb == -1) {
1316                 /* second failure on this rbio */
1317                 rbio->failb = failed;
1318                 atomic_inc(&rbio->bbio->error);
1319         } else {
1320                 ret = -EIO;
1321         }
1322 out:
1323         spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
1324
1325         return ret;
1326 }
1327
1328 /*
1329  * helper to fail a stripe based on a physical disk
1330  * bio.
1331  */
1332 static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
1333                            struct bio *bio)
1334 {
1335         int failed = find_bio_stripe(rbio, bio);
1336
1337         if (failed < 0)
1338                 return -EIO;
1339
1340         return fail_rbio_index(rbio, failed);
1341 }
1342
1343 /*
1344  * this sets each page in the bio uptodate.  It should only be used on private
1345  * rbio pages, nothing that comes in from the higher layers
1346  */
1347 static void set_bio_pages_uptodate(struct bio *bio)
1348 {
1349         int i;
1350         struct page *p;
1351
1352         for (i = 0; i < bio->bi_vcnt; i++) {
1353                 p = bio->bi_io_vec[i].bv_page;
1354                 SetPageUptodate(p);
1355         }
1356 }
1357
1358 /*
1359  * end io for the read phase of the rmw cycle.  All the bios here are physical
1360  * stripe bios we've read from the disk so we can recalculate the parity of the
1361  * stripe.
1362  *
1363  * This will usually kick off finish_rmw once all the bios are read in, but it
1364  * may trigger parity reconstruction if we had any errors along the way
1365  */
1366 static void raid_rmw_end_io(struct bio *bio, int err)
1367 {
1368         struct btrfs_raid_bio *rbio = bio->bi_private;
1369
1370         if (err)
1371                 fail_bio_stripe(rbio, bio);
1372         else
1373                 set_bio_pages_uptodate(bio);
1374
1375         bio_put(bio);
1376
1377         if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
1378                 return;
1379
1380         err = 0;
1381         if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
1382                 goto cleanup;
1383
1384         /*
1385          * this will normally call finish_rmw to start our write
1386          * but if there are any failed stripes we'll reconstruct
1387          * from parity first
1388          */
1389         validate_rbio_for_rmw(rbio);
1390         return;
1391
1392 cleanup:
1393
1394         rbio_orig_end_io(rbio, -EIO, 0);
1395 }
1396
1397 static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
1398 {
1399         rbio->work.flags = 0;
1400         rbio->work.func = rmw_work;
1401
1402         btrfs_queue_worker(&rbio->fs_info->rmw_workers,
1403                            &rbio->work);
1404 }
1405
1406 static void async_read_rebuild(struct btrfs_raid_bio *rbio)
1407 {
1408         rbio->work.flags = 0;
1409         rbio->work.func = read_rebuild_work;
1410
1411         btrfs_queue_worker(&rbio->fs_info->rmw_workers,
1412                            &rbio->work);
1413 }
1414
1415 /*
1416  * the stripe must be locked by the caller.  It will
1417  * unlock after all the writes are done
1418  */
1419 static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1420 {
1421         int bios_to_read = 0;
1422         struct btrfs_bio *bbio = rbio->bbio;
1423         struct bio_list bio_list;
1424         int ret;
1425         int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1426         int pagenr;
1427         int stripe;
1428         struct bio *bio;
1429
1430         bio_list_init(&bio_list);
1431
1432         ret = alloc_rbio_pages(rbio);
1433         if (ret)
1434                 goto cleanup;
1435
1436         index_rbio_pages(rbio);
1437
1438         atomic_set(&rbio->bbio->error, 0);
1439         /*
1440          * build a list of bios to read all the missing parts of this
1441          * stripe
1442          */
1443         for (stripe = 0; stripe < rbio->nr_data; stripe++) {
1444                 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1445                         struct page *page;
1446                         /*
1447                          * we want to find all the pages missing from
1448                          * the rbio and read them from the disk.  If
1449                          * page_in_rbio finds a page in the bio list
1450                          * we don't need to read it off the stripe.
1451                          */
1452                         page = page_in_rbio(rbio, stripe, pagenr, 1);
1453                         if (page)
1454                                 continue;
1455
1456                         page = rbio_stripe_page(rbio, stripe, pagenr);
1457                         /*
1458                          * the bio cache may have handed us an uptodate
1459                          * page.  If so, be happy and use it
1460                          */
1461                         if (PageUptodate(page))
1462                                 continue;
1463
1464                         ret = rbio_add_io_page(rbio, &bio_list, page,
1465                                        stripe, pagenr, rbio->stripe_len);
1466                         if (ret)
1467                                 goto cleanup;
1468                 }
1469         }
1470
1471         bios_to_read = bio_list_size(&bio_list);
1472         if (!bios_to_read) {
1473                 /*
1474                  * this can happen if others have merged with
1475                  * us, it means there is nothing left to read.
1476                  * But if there are missing devices it may not be
1477                  * safe to do the full stripe write yet.
1478                  */
1479                 goto finish;
1480         }
1481
1482         /*
1483          * the bbio may be freed once we submit the last bio.  Make sure
1484          * not to touch it after that
1485          */
1486         atomic_set(&bbio->stripes_pending, bios_to_read);
1487         while (1) {
1488                 bio = bio_list_pop(&bio_list);
1489                 if (!bio)
1490                         break;
1491
1492                 bio->bi_private = rbio;
1493                 bio->bi_end_io = raid_rmw_end_io;
1494
1495                 btrfs_bio_wq_end_io(rbio->fs_info, bio,
1496                                     BTRFS_WQ_ENDIO_RAID56);
1497
1498                 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1499                 submit_bio(READ, bio);
1500         }
1501         /* the actual write will happen once the reads are done */
1502         return 0;
1503
1504 cleanup:
1505         rbio_orig_end_io(rbio, -EIO, 0);
1506         return -EIO;
1507
1508 finish:
1509         validate_rbio_for_rmw(rbio);
1510         return 0;
1511 }
1512
1513 /*
1514  * if the upper layers pass in a full stripe, we thank them by only allocating
1515  * enough pages to hold the parity, and sending it all down quickly.
1516  */
1517 static int full_stripe_write(struct btrfs_raid_bio *rbio)
1518 {
1519         int ret;
1520
1521         ret = alloc_rbio_parity_pages(rbio);
1522         if (ret)
1523                 return ret;
1524
1525         ret = lock_stripe_add(rbio);
1526         if (ret == 0)
1527                 finish_rmw(rbio);
1528         return 0;
1529 }
1530
1531 /*
1532  * partial stripe writes get handed over to async helpers.
1533  * We're really hoping to merge a few more writes into this
1534  * rbio before calculating new parity
1535  */
1536 static int partial_stripe_write(struct btrfs_raid_bio *rbio)
1537 {
1538         int ret;
1539
1540         ret = lock_stripe_add(rbio);
1541         if (ret == 0)
1542                 async_rmw_stripe(rbio);
1543         return 0;
1544 }
1545
1546 /*
1547  * sometimes while we were reading from the drive to
1548  * recalculate parity, enough new bios come into create
1549  * a full stripe.  So we do a check here to see if we can
1550  * go directly to finish_rmw
1551  */
1552 static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
1553 {
1554         /* head off into rmw land if we don't have a full stripe */
1555         if (!rbio_is_full(rbio))
1556                 return partial_stripe_write(rbio);
1557         return full_stripe_write(rbio);
1558 }
1559
1560 /*
1561  * our main entry point for writes from the rest of the FS.
1562  */
1563 int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
1564                         struct btrfs_bio *bbio, u64 *raid_map,
1565                         u64 stripe_len)
1566 {
1567         struct btrfs_raid_bio *rbio;
1568
1569         rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
1570         if (IS_ERR(rbio)) {
1571                 kfree(raid_map);
1572                 kfree(bbio);
1573                 return PTR_ERR(rbio);
1574         }
1575         bio_list_add(&rbio->bio_list, bio);
1576         rbio->bio_list_bytes = bio->bi_size;
1577         return __raid56_parity_write(rbio);
1578 }
1579
1580 /*
1581  * all parity reconstruction happens here.  We've read in everything
1582  * we can find from the drives and this does the heavy lifting of
1583  * sorting the good from the bad.
1584  */
1585 static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1586 {
1587         int pagenr, stripe;
1588         void **pointers;
1589         int faila = -1, failb = -1;
1590         int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1591         struct page *page;
1592         int err;
1593         int i;
1594
1595         pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *),
1596                            GFP_NOFS);
1597         if (!pointers) {
1598                 err = -ENOMEM;
1599                 goto cleanup_io;
1600         }
1601
1602         faila = rbio->faila;
1603         failb = rbio->failb;
1604
1605         if (rbio->read_rebuild) {
1606                 spin_lock_irq(&rbio->bio_list_lock);
1607                 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1608                 spin_unlock_irq(&rbio->bio_list_lock);
1609         }
1610
1611         index_rbio_pages(rbio);
1612
1613         for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1614                 /* setup our array of pointers with pages
1615                  * from each stripe
1616                  */
1617                 for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
1618                         /*
1619                          * if we're rebuilding a read, we have to use
1620                          * pages from the bio list
1621                          */
1622                         if (rbio->read_rebuild &&
1623                             (stripe == faila || stripe == failb)) {
1624                                 page = page_in_rbio(rbio, stripe, pagenr, 0);
1625                         } else {
1626                                 page = rbio_stripe_page(rbio, stripe, pagenr);
1627                         }
1628                         pointers[stripe] = kmap(page);
1629                 }
1630
1631                 /* all raid6 handling here */
1632                 if (rbio->raid_map[rbio->bbio->num_stripes - 1] ==
1633                     RAID6_Q_STRIPE) {
1634
1635                         /*
1636                          * single failure, rebuild from parity raid5
1637                          * style
1638                          */
1639                         if (failb < 0) {
1640                                 if (faila == rbio->nr_data) {
1641                                         /*
1642                                          * Just the P stripe has failed, without
1643                                          * a bad data or Q stripe.
1644                                          * TODO, we should redo the xor here.
1645                                          */
1646                                         err = -EIO;
1647                                         goto cleanup;
1648                                 }
1649                                 /*
1650                                  * a single failure in raid6 is rebuilt
1651                                  * in the pstripe code below
1652                                  */
1653                                 goto pstripe;
1654                         }
1655
1656                         /* make sure our ps and qs are in order */
1657                         if (faila > failb) {
1658                                 int tmp = failb;
1659                                 failb = faila;
1660                                 faila = tmp;
1661                         }
1662
1663                         /* if the q stripe is failed, do a pstripe reconstruction
1664                          * from the xors.
1665                          * If both the q stripe and the P stripe are failed, we're
1666                          * here due to a crc mismatch and we can't give them the
1667                          * data they want
1668                          */
1669                         if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
1670                                 if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
1671                                         err = -EIO;
1672                                         goto cleanup;
1673                                 }
1674                                 /*
1675                                  * otherwise we have one bad data stripe and
1676                                  * a good P stripe.  raid5!
1677                                  */
1678                                 goto pstripe;
1679                         }
1680
1681                         if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
1682                                 raid6_datap_recov(rbio->bbio->num_stripes,
1683                                                   PAGE_SIZE, faila, pointers);
1684                         } else {
1685                                 raid6_2data_recov(rbio->bbio->num_stripes,
1686                                                   PAGE_SIZE, faila, failb,
1687                                                   pointers);
1688                         }
1689                 } else {
1690                         void *p;
1691
1692                         /* rebuild from P stripe here (raid5 or raid6) */
1693                         BUG_ON(failb != -1);
1694 pstripe:
1695                         /* Copy parity block into failed block to start with */
1696                         memcpy(pointers[faila],
1697                                pointers[rbio->nr_data],
1698                                PAGE_CACHE_SIZE);
1699
1700                         /* rearrange the pointer array */
1701                         p = pointers[faila];
1702                         for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
1703                                 pointers[stripe] = pointers[stripe + 1];
1704                         pointers[rbio->nr_data - 1] = p;
1705
1706                         /* xor in the rest */
1707                         run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
1708                 }
1709                 /* if we're doing this rebuild as part of an rmw, go through
1710                  * and set all of our private rbio pages in the
1711                  * failed stripes as uptodate.  This way finish_rmw will
1712                  * know they can be trusted.  If this was a read reconstruction,
1713                  * other endio functions will fiddle the uptodate bits
1714                  */
1715                 if (!rbio->read_rebuild) {
1716                         for (i = 0;  i < nr_pages; i++) {
1717                                 if (faila != -1) {
1718                                         page = rbio_stripe_page(rbio, faila, i);
1719                                         SetPageUptodate(page);
1720                                 }
1721                                 if (failb != -1) {
1722                                         page = rbio_stripe_page(rbio, failb, i);
1723                                         SetPageUptodate(page);
1724                                 }
1725                         }
1726                 }
1727                 for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
1728                         /*
1729                          * if we're rebuilding a read, we have to use
1730                          * pages from the bio list
1731                          */
1732                         if (rbio->read_rebuild &&
1733                             (stripe == faila || stripe == failb)) {
1734                                 page = page_in_rbio(rbio, stripe, pagenr, 0);
1735                         } else {
1736                                 page = rbio_stripe_page(rbio, stripe, pagenr);
1737                         }
1738                         kunmap(page);
1739                 }
1740         }
1741
1742         err = 0;
1743 cleanup:
1744         kfree(pointers);
1745
1746 cleanup_io:
1747
1748         if (rbio->read_rebuild) {
1749                 if (err == 0)
1750                         cache_rbio_pages(rbio);
1751                 else
1752                         clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1753
1754                 rbio_orig_end_io(rbio, err, err == 0);
1755         } else if (err == 0) {
1756                 rbio->faila = -1;
1757                 rbio->failb = -1;
1758                 finish_rmw(rbio);
1759         } else {
1760                 rbio_orig_end_io(rbio, err, 0);
1761         }
1762 }
1763
1764 /*
1765  * This is called only for stripes we've read from disk to
1766  * reconstruct the parity.
1767  */
1768 static void raid_recover_end_io(struct bio *bio, int err)
1769 {
1770         struct btrfs_raid_bio *rbio = bio->bi_private;
1771
1772         /*
1773          * we only read stripe pages off the disk, set them
1774          * up to date if there were no errors
1775          */
1776         if (err)
1777                 fail_bio_stripe(rbio, bio);
1778         else
1779                 set_bio_pages_uptodate(bio);
1780         bio_put(bio);
1781
1782         if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
1783                 return;
1784
1785         if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
1786                 rbio_orig_end_io(rbio, -EIO, 0);
1787         else
1788                 __raid_recover_end_io(rbio);
1789 }
1790
1791 /*
1792  * reads everything we need off the disk to reconstruct
1793  * the parity. endio handlers trigger final reconstruction
1794  * when the IO is done.
1795  *
1796  * This is used both for reads from the higher layers and for
1797  * parity construction required to finish a rmw cycle.
1798  */
1799 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
1800 {
1801         int bios_to_read = 0;
1802         struct btrfs_bio *bbio = rbio->bbio;
1803         struct bio_list bio_list;
1804         int ret;
1805         int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1806         int pagenr;
1807         int stripe;
1808         struct bio *bio;
1809
1810         bio_list_init(&bio_list);
1811
1812         ret = alloc_rbio_pages(rbio);
1813         if (ret)
1814                 goto cleanup;
1815
1816         atomic_set(&rbio->bbio->error, 0);
1817
1818         /*
1819          * read everything that hasn't failed.  Thanks to the
1820          * stripe cache, it is possible that some or all of these
1821          * pages are going to be uptodate.
1822          */
1823         for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
1824                 if (rbio->faila == stripe ||
1825                     rbio->failb == stripe)
1826                         continue;
1827
1828                 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1829                         struct page *p;
1830
1831                         /*
1832                          * the rmw code may have already read this
1833                          * page in
1834                          */
1835                         p = rbio_stripe_page(rbio, stripe, pagenr);
1836                         if (PageUptodate(p))
1837                                 continue;
1838
1839                         ret = rbio_add_io_page(rbio, &bio_list,
1840                                        rbio_stripe_page(rbio, stripe, pagenr),
1841                                        stripe, pagenr, rbio->stripe_len);
1842                         if (ret < 0)
1843                                 goto cleanup;
1844                 }
1845         }
1846
1847         bios_to_read = bio_list_size(&bio_list);
1848         if (!bios_to_read) {
1849                 /*
1850                  * we might have no bios to read just because the pages
1851                  * were up to date, or we might have no bios to read because
1852                  * the devices were gone.
1853                  */
1854                 if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) {
1855                         __raid_recover_end_io(rbio);
1856                         goto out;
1857                 } else {
1858                         goto cleanup;
1859                 }
1860         }
1861
1862         /*
1863          * the bbio may be freed once we submit the last bio.  Make sure
1864          * not to touch it after that
1865          */
1866         atomic_set(&bbio->stripes_pending, bios_to_read);
1867         while (1) {
1868                 bio = bio_list_pop(&bio_list);
1869                 if (!bio)
1870                         break;
1871
1872                 bio->bi_private = rbio;
1873                 bio->bi_end_io = raid_recover_end_io;
1874
1875                 btrfs_bio_wq_end_io(rbio->fs_info, bio,
1876                                     BTRFS_WQ_ENDIO_RAID56);
1877
1878                 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1879                 submit_bio(READ, bio);
1880         }
1881 out:
1882         return 0;
1883
1884 cleanup:
1885         if (rbio->read_rebuild)
1886                 rbio_orig_end_io(rbio, -EIO, 0);
1887         return -EIO;
1888 }
1889
1890 /*
1891  * the main entry point for reads from the higher layers.  This
1892  * is really only called when the normal read path had a failure,
1893  * so we assume the bio they send down corresponds to a failed part
1894  * of the drive.
1895  */
1896 int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
1897                           struct btrfs_bio *bbio, u64 *raid_map,
1898                           u64 stripe_len, int mirror_num)
1899 {
1900         struct btrfs_raid_bio *rbio;
1901         int ret;
1902
1903         rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
1904         if (IS_ERR(rbio)) {
1905                 return PTR_ERR(rbio);
1906         }
1907
1908         rbio->read_rebuild = 1;
1909         bio_list_add(&rbio->bio_list, bio);
1910         rbio->bio_list_bytes = bio->bi_size;
1911
1912         rbio->faila = find_logical_bio_stripe(rbio, bio);
1913         if (rbio->faila == -1) {
1914                 BUG();
1915                 kfree(rbio);
1916                 return -EIO;
1917         }
1918
1919         /*
1920          * reconstruct from the q stripe if they are
1921          * asking for mirror 3
1922          */
1923         if (mirror_num == 3)
1924                 rbio->failb = bbio->num_stripes - 2;
1925
1926         ret = lock_stripe_add(rbio);
1927
1928         /*
1929          * __raid56_parity_recover will end the bio with
1930          * any errors it hits.  We don't want to return
1931          * its error value up the stack because our caller
1932          * will end up calling bio_endio with any nonzero
1933          * return
1934          */
1935         if (ret == 0)
1936                 __raid56_parity_recover(rbio);
1937         /*
1938          * our rbio has been added to the list of
1939          * rbios that will be handled after the
1940          * currently lock owner is done
1941          */
1942         return 0;
1943
1944 }
1945
1946 static void rmw_work(struct btrfs_work *work)
1947 {
1948         struct btrfs_raid_bio *rbio;
1949
1950         rbio = container_of(work, struct btrfs_raid_bio, work);
1951         raid56_rmw_stripe(rbio);
1952 }
1953
1954 static void read_rebuild_work(struct btrfs_work *work)
1955 {
1956         struct btrfs_raid_bio *rbio;
1957
1958         rbio = container_of(work, struct btrfs_raid_bio, work);
1959         __raid56_parity_recover(rbio);
1960 }