Merge tag 'for-5.16-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

author Linus Torvalds <torvalds@linux-foundation.org>

Mon, 1 Nov 2021 19:48:25 +0000 (12:48 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Mon, 1 Nov 2021 19:48:25 +0000 (12:48 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Mon, 1 Nov 2021 19:48:25 +0000 (12:48 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Mon, 1 Nov 2021 19:48:25 +0000 (12:48 -0700)
diff --combined fs/btrfs/compression.c

index 6c7eb80,1d071c8..32da97c
--- 1/fs/btrfs/compression.c
--- 2/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@@ -9,7 -9,6 +9,7 @@@
   #include <linux/fs.h>
   #include <linux/pagemap.h>
   #include <linux/highmem.h>
+ +#include <linux/kthread.h>
   #include <linux/time.h>
   #include <linux/init.h>
   #include <linux/string.h>
@@@ -29,6 -28,7 +29,7 @@@
   #include "compression.h"
   #include "extent_io.h"
   #include "extent_map.h"
+ #include "subpage.h"
   #include "zoned.h"
   
   static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" };
@@@ -173,17 -173,16 +174,17 @@@ static int check_compressed_csum(struc
                 /* Hash through the page sector by sector */
                 for (pg_offset = 0; pg_offset < bytes_left;
                      pg_offset += sectorsize) {
- -                      kaddr = page_address(page);
+ +                      kaddr = kmap_atomic(page);
                         crypto_shash_digest(shash, kaddr + pg_offset,
                                             sectorsize, csum);
+ +                      kunmap_atomic(kaddr);
   
                         if (memcmp(&csum, cb_sum, csum_size) != 0) {
                                 btrfs_print_data_csum_error(inode, disk_start,
                                                 csum, cb_sum, cb->mirror_num);
-                               if (btrfs_io_bio(bio)->device)
+                               if (btrfs_bio(bio)->device)
                                         btrfs_dev_stat_inc_and_print(
-                                               btrfs_io_bio(bio)->device,
+                                               btrfs_bio(bio)->device,
                                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
                                 return -EIO;
                         }
@@@ -194,6 -193,87 +195,87 @@@
         return 0;
   }
   
+ /*
+  * Reduce bio and io accounting for a compressed_bio with its corresponding bio.
+  *
+  * Return true if there is no pending bio nor io.
+  * Return false otherwise.
+  */
+ static bool dec_and_test_compressed_bio(struct compressed_bio *cb, struct bio *bio)
+ {
+       struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
+       unsigned int bi_size = 0;
+       bool last_io = false;
+       struct bio_vec *bvec;
+       struct bvec_iter_all iter_all;
+ 
+       /*
+        * At endio time, bi_iter.bi_size doesn't represent the real bio size.
+        * Thus here we have to iterate through all segments to grab correct
+        * bio size.
+        */
+       bio_for_each_segment_all(bvec, bio, iter_all)
+               bi_size += bvec->bv_len;
+ 
+       if (bio->bi_status)
+               cb->errors = 1;
+ 
+       ASSERT(bi_size && bi_size <= cb->compressed_len);
+       last_io = refcount_sub_and_test(bi_size >> fs_info->sectorsize_bits,
+                                       &cb->pending_sectors);
+       /*
+        * Here we must wake up the possible error handler after all other
+        * operations on @cb finished, or we can race with
+        * finish_compressed_bio_*() which may free @cb.
+        */
+       wake_up_var(cb);
+ 
+       return last_io;
+ }
+ 
+ static void finish_compressed_bio_read(struct compressed_bio *cb, struct bio *bio)
+ {
+       unsigned int index;
+       struct page *page;
+ 
+       /* Release the compressed pages */
+       for (index = 0; index < cb->nr_pages; index++) {
+               page = cb->compressed_pages[index];
+               page->mapping = NULL;
+               put_page(page);
+       }
+ 
+       /* Do io completion on the original bio */
+       if (cb->errors) {
+               bio_io_error(cb->orig_bio);
+       } else {
+               struct bio_vec *bvec;
+               struct bvec_iter_all iter_all;
+ 
+               ASSERT(bio);
+               ASSERT(!bio->bi_status);
+               /*
+                * We have verified the checksum already, set page checked so
+                * the end_io handlers know about it
+                */
+               ASSERT(!bio_flagged(bio, BIO_CLONED));
+               bio_for_each_segment_all(bvec, cb->orig_bio, iter_all) {
+                       u64 bvec_start = page_offset(bvec->bv_page) +
+                                        bvec->bv_offset;
+ 
+                       btrfs_page_set_checked(btrfs_sb(cb->inode->i_sb),
+                                       bvec->bv_page, bvec_start,
+                                       bvec->bv_len);
+               }
+ 
+               bio_endio(cb->orig_bio);
+       }
+ 
+       /* Finally free the cb struct */
+       kfree(cb->compressed_pages);
+       kfree(cb);
+ }
+ 
   /* when we finish reading compressed pages from the disk, we
    * decompress them and then run the bio end_io routines on the
    * decompressed pages (in the inode address space).
@@@ -208,25 -288,17 +290,17 @@@ static void end_compressed_bio_read(str
   {
         struct compressed_bio *cb = bio->bi_private;
         struct inode *inode;
-       struct page *page;
-       unsigned int index;
-       unsigned int mirror = btrfs_io_bio(bio)->mirror_num;
+       unsigned int mirror = btrfs_bio(bio)->mirror_num;
         int ret = 0;
   
-       if (bio->bi_status)
-               cb->errors = 1;
- 
-       /* if there are more bios still pending for this compressed
-        * extent, just exit
-        */
-       if (!refcount_dec_and_test(&cb->pending_bios))
+       if (!dec_and_test_compressed_bio(cb, bio))
                 goto out;
   
         /*
          * Record the correct mirror_num in cb->orig_bio so that
          * read-repair can work properly.
          */
-       btrfs_io_bio(cb->orig_bio)->mirror_num = mirror;
+       btrfs_bio(cb->orig_bio)->mirror_num = mirror;
         cb->mirror_num = mirror;
   
         /*
@@@ -250,36 -322,7 +324,7 @@@
   csum_failed:
         if (ret)
                 cb->errors = 1;
- 
-       /* release the compressed pages */
-       index = 0;
-       for (index = 0; index < cb->nr_pages; index++) {
-               page = cb->compressed_pages[index];
-               page->mapping = NULL;
-               put_page(page);
-       }
- 
-       /* do io completion on the original bio */
-       if (cb->errors) {
-               bio_io_error(cb->orig_bio);
-       } else {
-               struct bio_vec *bvec;
-               struct bvec_iter_all iter_all;
- 
-               /*
-                * we have verified the checksum already, set page
-                * checked so the end_io handlers know about it
-                */
-               ASSERT(!bio_flagged(bio, BIO_CLONED));
-               bio_for_each_segment_all(bvec, cb->orig_bio, iter_all)
-                       SetPageChecked(bvec->bv_page);
- 
-               bio_endio(cb->orig_bio);
-       }
- 
-       /* finally free the cb struct */
-       kfree(cb->compressed_pages);
-       kfree(cb);
+       finish_compressed_bio_read(cb, bio);
   out:
         bio_put(bio);
   }
@@@ -291,6 -334,7 +336,7 @@@
   static noinline void end_compressed_writeback(struct inode *inode,
                                               const struct compressed_bio *cb)
   {
+       struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
         unsigned long index = cb->start >> PAGE_SHIFT;
         unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
         struct page *pages[16];
@@@ -313,7 -357,8 +359,8 @@@
                 for (i = 0; i < ret; i++) {
                         if (cb->errors)
                                 SetPageError(pages[i]);
-                       end_page_writeback(pages[i]);
+                       btrfs_page_clamp_clear_writeback(fs_info, pages[i],
+                                                        cb->start, cb->len);
                         put_page(pages[i]);
                 }
                 nr_pages -= ret;
@@@ -322,60 -367,127 +369,127 @@@
         /* the inode may be gone now */
   }
   
- /*
-  * do the cleanup once all the compressed pages hit the disk.
-  * This will clear writeback on the file pages and free the compressed
-  * pages.
-  *
-  * This also calls the writeback end hooks for the file pages so that
-  * metadata and checksums can be updated in the file.
-  */
- static void end_compressed_bio_write(struct bio *bio)
+ static void finish_compressed_bio_write(struct compressed_bio *cb)
   {
-       struct compressed_bio *cb = bio->bi_private;
-       struct inode *inode;
-       struct page *page;
+       struct inode *inode = cb->inode;
         unsigned int index;
   
-       if (bio->bi_status)
-               cb->errors = 1;
- 
-       /* if there are more bios still pending for this compressed
-        * extent, just exit
-        */
-       if (!refcount_dec_and_test(&cb->pending_bios))
-               goto out;
- 
-       /* ok, we're the last bio for this extent, step one is to
-        * call back into the FS and do all the end_io operations
+       /*
+        * Ok, we're the last bio for this extent, step one is to call back
+        * into the FS and do all the end_io operations.
          */
-       inode = cb->inode;
-       btrfs_record_physical_zoned(inode, cb->start, bio);
         btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL,
                         cb->start, cb->start + cb->len - 1,
                         !cb->errors);
   
         end_compressed_writeback(inode, cb);
-       /* note, our inode could be gone now */
+       /* Note, our inode could be gone now */
   
         /*
-        * release the compressed pages, these came from alloc_page and
+        * Release the compressed pages, these came from alloc_page and
          * are not attached to the inode at all
          */
-       index = 0;
         for (index = 0; index < cb->nr_pages; index++) {
-               page = cb->compressed_pages[index];
+               struct page *page = cb->compressed_pages[index];
+ 
                 page->mapping = NULL;
                 put_page(page);
         }
   
-       /* finally free the cb struct */
+       /* Finally free the cb struct */
         kfree(cb->compressed_pages);
         kfree(cb);
+ }
+ 
+ /*
+  * Do the cleanup once all the compressed pages hit the disk.  This will clear
+  * writeback on the file pages and free the compressed pages.
+  *
+  * This also calls the writeback end hooks for the file pages so that metadata
+  * and checksums can be updated in the file.
+  */
+ static void end_compressed_bio_write(struct bio *bio)
+ {
+       struct compressed_bio *cb = bio->bi_private;
+ 
+       if (!dec_and_test_compressed_bio(cb, bio))
+               goto out;
+ 
+       btrfs_record_physical_zoned(cb->inode, cb->start, bio);
+ 
+       finish_compressed_bio_write(cb);
   out:
         bio_put(bio);
   }
   
+ static blk_status_t submit_compressed_bio(struct btrfs_fs_info *fs_info,
+                                         struct compressed_bio *cb,
+                                         struct bio *bio, int mirror_num)
+ {
+       blk_status_t ret;
+ 
+       ASSERT(bio->bi_iter.bi_size);
+       ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
+       if (ret)
+               return ret;
+       ret = btrfs_map_bio(fs_info, bio, mirror_num);
+       return ret;
+ }
+ 
+ /*
+  * Allocate a compressed_bio, which will be used to read/write on-disk
+  * (aka, compressed) data.
+  *
+  * @cb:                 The compressed_bio structure, which records all the needed
+  *                      information to bind the compressed data to the uncompressed
+  *                      page cache.
+  * @disk_bytenr:        The logical bytenr where the compressed data will be read
+  *                      from or written to.
+  * @endio_func:         The endio function to call after the IO for compressed data
+  *                      is finished.
+  * @next_stripe_start:  Return value of logical bytenr of where next stripe starts.
+  *                      Let the caller know to only fill the bio up to the stripe
+  *                      boundary.
+  */
+ 
+ 
+ static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_bytenr,
+                                       unsigned int opf, bio_end_io_t endio_func,
+                                       u64 *next_stripe_start)
+ {
+       struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
+       struct btrfs_io_geometry geom;
+       struct extent_map *em;
+       struct bio *bio;
+       int ret;
+ 
+       bio = btrfs_bio_alloc(BIO_MAX_VECS);
+ 
+       bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
+       bio->bi_opf = opf;
+       bio->bi_private = cb;
+       bio->bi_end_io = endio_func;
+ 
+       em = btrfs_get_chunk_map(fs_info, disk_bytenr, fs_info->sectorsize);
+       if (IS_ERR(em)) {
+               bio_put(bio);
+               return ERR_CAST(em);
+       }
+ 
+       if (bio_op(bio) == REQ_OP_ZONE_APPEND)
+               bio_set_dev(bio, em->map_lookup->stripes[0].dev->bdev);
+ 
+       ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), disk_bytenr, &geom);
+       free_extent_map(em);
+       if (ret < 0) {
+               bio_put(bio);
+               return ERR_PTR(ret);
+       }
+       *next_stripe_start = disk_bytenr + geom.len;
+ 
+       return bio;
+ }
+ 
   /*
    * worker function to build and submit bios for previously compressed pages.
    * The corresponding pages in the inode should be marked for writeback
@@@ -396,20 -508,19 +510,19 @@@ blk_status_t btrfs_submit_compressed_wr
         struct btrfs_fs_info *fs_info = inode->root->fs_info;
         struct bio *bio = NULL;
         struct compressed_bio *cb;
-       unsigned long bytes_left;
-       int pg_index = 0;
-       struct page *page;
-       u64 first_byte = disk_start;
+       u64 cur_disk_bytenr = disk_start;
+       u64 next_stripe_start;
         blk_status_t ret;
         int skip_sum = inode->flags & BTRFS_INODE_NODATASUM;
         const bool use_append = btrfs_use_zone_append(inode, disk_start);
         const unsigned int bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE;
   
-       WARN_ON(!PAGE_ALIGNED(start));
+       ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
+              IS_ALIGNED(len, fs_info->sectorsize));
         cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
         if (!cb)
                 return BLK_STS_RESOURCE;
-       refcount_set(&cb->pending_bios, 0);
+       refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
         cb->errors = 0;
         cb->inode = &inode->vfs_inode;
         cb->start = start;
@@@ -420,118 -531,100 +533,100 @@@
         cb->orig_bio = NULL;
         cb->nr_pages = nr_pages;
   
-       bio = btrfs_bio_alloc(first_byte);
-       bio->bi_opf = bio_op | write_flags;
-       bio->bi_private = cb;
-       bio->bi_end_io = end_compressed_bio_write;
- 
-       if (use_append) {
-               struct btrfs_device *device;
- 
-               device = btrfs_zoned_get_device(fs_info, disk_start, PAGE_SIZE);
-               if (IS_ERR(device)) {
-                       kfree(cb);
-                       bio_put(bio);
-                       return BLK_STS_NOTSUPP;
+       while (cur_disk_bytenr < disk_start + compressed_len) {
+               u64 offset = cur_disk_bytenr - disk_start;
+               unsigned int index = offset >> PAGE_SHIFT;
+               unsigned int real_size;
+               unsigned int added;
+               struct page *page = compressed_pages[index];
+               bool submit = false;
+ 
+               /* Allocate new bio if submitted or not yet allocated */
+               if (!bio) {
+                       bio = alloc_compressed_bio(cb, cur_disk_bytenr,
+                               bio_op | write_flags, end_compressed_bio_write,
+                               &next_stripe_start);
+                       if (IS_ERR(bio)) {
+                               ret = errno_to_blk_status(PTR_ERR(bio));
+                               bio = NULL;
+                               goto finish_cb;
+                       }
                 }
- 
-               bio_set_dev(bio, device->bdev);
-       }
- 
-       if (blkcg_css) {
-               bio->bi_opf |= REQ_CGROUP_PUNT;
-               kthread_associate_blkcg(blkcg_css);
-       }
-       refcount_set(&cb->pending_bios, 1);
- 
-       /* create and submit bios for the compressed pages */
-       bytes_left = compressed_len;
-       for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) {
-               int submit = 0;
-               int len = 0;
- 
-               page = compressed_pages[pg_index];
-               page->mapping = inode->vfs_inode.i_mapping;
-               if (bio->bi_iter.bi_size)
-                       submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, bio,
-                                                         0);
- 
                 /*
-                * Page can only be added to bio if the current bio fits in
-                * stripe.
+                * We should never reach next_stripe_start, as we will submit
+                * the bio immediately when we reach the boundary.
                  */
-               if (!submit) {
-                       if (pg_index == 0 && use_append)
-                               len = bio_add_zone_append_page(bio, page,
-                                                              PAGE_SIZE, 0);
-                       else
-                               len = bio_add_page(bio, page, PAGE_SIZE, 0);
-               }
- 
-               page->mapping = NULL;
-               if (submit || len < PAGE_SIZE) {
-                       /*
-                        * inc the count before we submit the bio so
-                        * we know the end IO handler won't happen before
-                        * we inc the count.  Otherwise, the cb might get
-                        * freed before we're done setting it up
-                        */
-                       refcount_inc(&cb->pending_bios);
-                       ret = btrfs_bio_wq_end_io(fs_info, bio,
-                                                 BTRFS_WQ_ENDIO_DATA);
-                       BUG_ON(ret); /* -ENOMEM */
+               ASSERT(cur_disk_bytenr != next_stripe_start);
   
+               /*
+                * We have various limits on the real write size:
+                * - stripe boundary
+                * - page boundary
+                * - compressed length boundary
+                */
+               real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_bytenr);
+               real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset));
+               real_size = min_t(u64, real_size, compressed_len - offset);
+               ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize));
+ 
+               if (use_append)
+                       added = bio_add_zone_append_page(bio, page, real_size,
+                                       offset_in_page(offset));
+               else
+                       added = bio_add_page(bio, page, real_size,
+                                       offset_in_page(offset));
+               /* Reached zoned boundary */
+               if (added == 0)
+                       submit = true;
+ 
+               cur_disk_bytenr += added;
+               /* Reached stripe boundary */
+               if (cur_disk_bytenr == next_stripe_start)
+                       submit = true;
+ 
+               /* Finished the range */
+               if (cur_disk_bytenr == disk_start + compressed_len)
+                       submit = true;
+ 
+               if (submit) {
                         if (!skip_sum) {
                                 ret = btrfs_csum_one_bio(inode, bio, start, 1);
-                               BUG_ON(ret); /* -ENOMEM */
-                       }
- 
-                       ret = btrfs_map_bio(fs_info, bio, 0);
-                       if (ret) {
-                               bio->bi_status = ret;
-                               bio_endio(bio);
+                               if (ret)
+                                       goto finish_cb;
                         }
   
-                       bio = btrfs_bio_alloc(first_byte);
-                       bio->bi_opf = bio_op | write_flags;
-                       bio->bi_private = cb;
-                       bio->bi_end_io = end_compressed_bio_write;
-                       if (blkcg_css)
-                               bio->bi_opf |= REQ_CGROUP_PUNT;
-                       /*
-                        * Use bio_add_page() to ensure the bio has at least one
-                        * page.
-                        */
-                       bio_add_page(bio, page, PAGE_SIZE, 0);
+                       ret = submit_compressed_bio(fs_info, cb, bio, 0);
+                       if (ret)
+                               goto finish_cb;
+                       bio = NULL;
                 }
-               if (bytes_left < PAGE_SIZE) {
-                       btrfs_info(fs_info,
-                                       "bytes left %lu compress len %u nr %u",
-                              bytes_left, cb->compressed_len, cb->nr_pages);
-               }
-               bytes_left -= PAGE_SIZE;
-               first_byte += PAGE_SIZE;
                 cond_resched();
         }
+       if (blkcg_css)
+               kthread_associate_blkcg(NULL);
   
-       ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
-       BUG_ON(ret); /* -ENOMEM */
- 
-       if (!skip_sum) {
-               ret = btrfs_csum_one_bio(inode, bio, start, 1);
-               BUG_ON(ret); /* -ENOMEM */
-       }
+       return 0;
   
-       ret = btrfs_map_bio(fs_info, bio, 0);
-       if (ret) {
+ finish_cb:
+       if (bio) {
                 bio->bi_status = ret;
                 bio_endio(bio);
         }
+       /* Last byte of @cb is submitted, endio will free @cb */
+       if (cur_disk_bytenr == disk_start + compressed_len)
+               return ret;
   
-       if (blkcg_css)
-               kthread_associate_blkcg(NULL);
- 
-       return 0;
+       wait_var_event(cb, refcount_read(&cb->pending_sectors) ==
+                          (disk_start + compressed_len - cur_disk_bytenr) >>
+                          fs_info->sectorsize_bits);
+       /*
+        * Even though the previous bios have ended, we should still have I/O
+        * not yet submitted, thus we need to finish @cb manually.
+        */
+       ASSERT(refcount_read(&cb->pending_sectors));
+       /* Now we are the only one referring @cb, can finish it safely. */
+       finish_compressed_bio_write(cb);
+       return ret;
   }
   
   static u64 bio_end_offset(struct bio *bio)
@@@ -541,25 -634,33 +636,33 @@@
         return page_offset(last->bv_page) + last->bv_len + last->bv_offset;
   }
   
+ /*
+  * Add extra pages in the same compressed file extent so that we don't need to
+  * re-read the same extent again and again.
+  *
+  * NOTE: this won't work well for subpage, as for subpage read, we lock the
+  * full page then submit bio for each compressed/regular extents.
+  *
+  * This means, if several sectors in the same page point to the same on-disk
+  * compressed data, we will re-read the same extent many times and this
+  * function can only help for the next page.
+  */
   static noinline int add_ra_bio_pages(struct inode *inode,
                                      u64 compressed_end,
                                      struct compressed_bio *cb)
   {
+       struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
         unsigned long end_index;
-       unsigned long pg_index;
-       u64 last_offset;
+       u64 cur = bio_end_offset(cb->orig_bio);
         u64 isize = i_size_read(inode);
         int ret;
         struct page *page;
-       unsigned long nr_pages = 0;
         struct extent_map *em;
         struct address_space *mapping = inode->i_mapping;
         struct extent_map_tree *em_tree;
         struct extent_io_tree *tree;
-       u64 end;
-       int misses = 0;
+       int sectors_missed = 0;
   
-       last_offset = bio_end_offset(cb->orig_bio);
         em_tree = &BTRFS_I(inode)->extent_tree;
         tree = &BTRFS_I(inode)->io_tree;
   
@@@ -578,18 -679,29 +681,29 @@@
   
         end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
   
-       while (last_offset < compressed_end) {
-               pg_index = last_offset >> PAGE_SHIFT;
+       while (cur < compressed_end) {
+               u64 page_end;
+               u64 pg_index = cur >> PAGE_SHIFT;
+               u32 add_size;
   
                 if (pg_index > end_index)
                         break;
   
                 page = xa_load(&mapping->i_pages, pg_index);
                 if (page && !xa_is_value(page)) {
-                       misses++;
-                       if (misses > 4)
+                       sectors_missed += (PAGE_SIZE - offset_in_page(cur)) >>
+                                         fs_info->sectorsize_bits;
+ 
+                       /* Beyond threshold, no need to continue */
+                       if (sectors_missed > 4)
                                 break;
-                       goto next;
+ 
+                       /*
+                        * Jump to next page start as we already have page for
+                        * current offset.
+                        */
+                       cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE;
+                       continue;
                 }
   
                 page = __page_cache_alloc(mapping_gfp_constraint(mapping,
@@@ -599,14 -711,11 +713,11 @@@
   
                 if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) {
                         put_page(page);
-                       goto next;
+                       /* There is already a page, skip to page end */
+                       cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE;
+                       continue;
                 }
   
-               /*
-                * at this point, we have a locked page in the page cache
-                * for these bytes in the file.  But, we have to make
-                * sure they map to this compressed extent on disk.
-                */
                 ret = set_page_extent_mapped(page);
                 if (ret < 0) {
                         unlock_page(page);
@@@ -614,18 -723,22 +725,22 @@@
                         break;
                 }
   
-               end = last_offset + PAGE_SIZE - 1;
-               lock_extent(tree, last_offset, end);
+               page_end = (pg_index << PAGE_SHIFT) + PAGE_SIZE - 1;
+               lock_extent(tree, cur, page_end);
                 read_lock(&em_tree->lock);
-               em = lookup_extent_mapping(em_tree, last_offset,
-                                          PAGE_SIZE);
+               em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur);
                 read_unlock(&em_tree->lock);
   
-               if (!em || last_offset < em->start ||
-                   (last_offset + PAGE_SIZE > extent_map_end(em)) ||
+               /*
+                * At this point, we have a locked page in the page cache for
+                * these bytes in the file.  But, we have to make sure they map
+                * to this compressed extent on disk.
+                */
+               if (!em || cur < em->start ||
+                   (cur + fs_info->sectorsize > extent_map_end(em)) ||
                     (em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) {
                         free_extent_map(em);
-                       unlock_extent(tree, last_offset, end);
+                       unlock_extent(tree, cur, page_end);
                         unlock_page(page);
                         put_page(page);
                         break;
@@@ -643,20 -756,23 +758,23 @@@
                         }
                 }
   
-               ret = bio_add_page(cb->orig_bio, page,
-                                  PAGE_SIZE, 0);
- 
-               if (ret == PAGE_SIZE) {
-                       nr_pages++;
-                       put_page(page);
-               } else {
-                       unlock_extent(tree, last_offset, end);
+               add_size = min(em->start + em->len, page_end + 1) - cur;
+               ret = bio_add_page(cb->orig_bio, page, add_size, offset_in_page(cur));
+               if (ret != add_size) {
+                       unlock_extent(tree, cur, page_end);
                         unlock_page(page);
                         put_page(page);
                         break;
                 }
- next:
-               last_offset += PAGE_SIZE;
+               /*
+                * If it's subpage, we also need to increase its
+                * subpage::readers number, as at endio we will decrease
+                * subpage::readers and unlock the page.
+                */
+               if (fs_info->sectorsize < PAGE_SIZE)
+                       btrfs_subpage_start_reader(fs_info, page, cur, add_size);
+               put_page(page);
+               cur += add_size;
         }
         return 0;
   }
@@@ -681,9 -797,10 +799,10 @@@ blk_status_t btrfs_submit_compressed_re
         unsigned int compressed_len;
         unsigned int nr_pages;
         unsigned int pg_index;
-       struct page *page;
-       struct bio *comp_bio;
-       u64 cur_disk_byte = bio->bi_iter.bi_sector << 9;
+       struct bio *comp_bio = NULL;
+       const u64 disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+       u64 cur_disk_byte = disk_bytenr;
+       u64 next_stripe_start;
         u64 file_offset;
         u64 em_len;
         u64 em_start;
@@@ -710,7 -827,7 +829,7 @@@
         if (!cb)
                 goto out;
   
-       refcount_set(&cb->pending_bios, 0);
+       refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
         cb->errors = 0;
         cb->inode = inode;
         cb->mirror_num = mirror_num;
@@@ -750,86 -867,74 +869,74 @@@
         /* include any pages we added in add_ra-bio_pages */
         cb->len = bio->bi_iter.bi_size;
   
-       comp_bio = btrfs_bio_alloc(cur_disk_byte);
-       comp_bio->bi_opf = REQ_OP_READ;
-       comp_bio->bi_private = cb;
-       comp_bio->bi_end_io = end_compressed_bio_read;
-       refcount_set(&cb->pending_bios, 1);
- 
-       for (pg_index = 0; pg_index < nr_pages; pg_index++) {
-               u32 pg_len = PAGE_SIZE;
-               int submit = 0;
+       while (cur_disk_byte < disk_bytenr + compressed_len) {
+               u64 offset = cur_disk_byte - disk_bytenr;
+               unsigned int index = offset >> PAGE_SHIFT;
+               unsigned int real_size;
+               unsigned int added;
+               struct page *page = cb->compressed_pages[index];
+               bool submit = false;
+ 
+               /* Allocate new bio if submitted or not yet allocated */
+               if (!comp_bio) {
+                       comp_bio = alloc_compressed_bio(cb, cur_disk_byte,
+                                       REQ_OP_READ, end_compressed_bio_read,
+                                       &next_stripe_start);
+                       if (IS_ERR(comp_bio)) {
+                               ret = errno_to_blk_status(PTR_ERR(comp_bio));
+                               comp_bio = NULL;
+                               goto finish_cb;
+                       }
+               }
+               /*
+                * We should never be at next_stripe_start here, as we submit
+                * comp_bio immediately when we reach the boundary.
+                */
+               ASSERT(cur_disk_byte != next_stripe_start);
+               /*
+                * We have various limits on the real read size:
+                * - stripe boundary
+                * - page boundary
+                * - compressed length boundary
+                */
+               real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_byte);
+               real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset));
+               real_size = min_t(u64, real_size, compressed_len - offset);
+               ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize));
   
+               added = bio_add_page(comp_bio, page, real_size, offset_in_page(offset));
                 /*
-                * To handle subpage case, we need to make sure the bio only
-                * covers the range we need.
-                *
-                * If we're at the last page, truncate the length to only cover
-                * the remaining part.
+                * Maximum compressed extent is smaller than bio size limit,
+                * thus bio_add_page() should always succeed.
                  */
-               if (pg_index == nr_pages - 1)
-                       pg_len = min_t(u32, PAGE_SIZE,
-                                       compressed_len - pg_index * PAGE_SIZE);
+               ASSERT(added == real_size);
+               cur_disk_byte += added;
   
-               page = cb->compressed_pages[pg_index];
-               page->mapping = inode->i_mapping;
-               page->index = em_start >> PAGE_SHIFT;
+               /* Reached stripe boundary, need to submit */
+               if (cur_disk_byte == next_stripe_start)
+                       submit = true;
   
-               if (comp_bio->bi_iter.bi_size)
-                       submit = btrfs_bio_fits_in_stripe(page, pg_len,
-                                                         comp_bio, 0);
+               /* Has finished the range, need to submit */
+               if (cur_disk_byte == disk_bytenr + compressed_len)
+                       submit = true;
   
-               page->mapping = NULL;
-               if (submit || bio_add_page(comp_bio, page, pg_len, 0) < pg_len) {
+               if (submit) {
                         unsigned int nr_sectors;
   
-                       ret = btrfs_bio_wq_end_io(fs_info, comp_bio,
-                                                 BTRFS_WQ_ENDIO_DATA);
-                       BUG_ON(ret); /* -ENOMEM */
- 
-                       /*
-                        * inc the count before we submit the bio so
-                        * we know the end IO handler won't happen before
-                        * we inc the count.  Otherwise, the cb might get
-                        * freed before we're done setting it up
-                        */
-                       refcount_inc(&cb->pending_bios);
- 
                         ret = btrfs_lookup_bio_sums(inode, comp_bio, sums);
-                       BUG_ON(ret); /* -ENOMEM */
+                       if (ret)
+                               goto finish_cb;
   
                         nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
                                                   fs_info->sectorsize);
                         sums += fs_info->csum_size * nr_sectors;
   
-                       ret = btrfs_map_bio(fs_info, comp_bio, mirror_num);
-                       if (ret) {
-                               comp_bio->bi_status = ret;
-                               bio_endio(comp_bio);
-                       }
- 
-                       comp_bio = btrfs_bio_alloc(cur_disk_byte);
-                       comp_bio->bi_opf = REQ_OP_READ;
-                       comp_bio->bi_private = cb;
-                       comp_bio->bi_end_io = end_compressed_bio_read;
- 
-                       bio_add_page(comp_bio, page, pg_len, 0);
+                       ret = submit_compressed_bio(fs_info, cb, comp_bio, mirror_num);
+                       if (ret)
+                               goto finish_cb;
+                       comp_bio = NULL;
                 }
-               cur_disk_byte += pg_len;
         }
- 
-       ret = btrfs_bio_wq_end_io(fs_info, comp_bio, BTRFS_WQ_ENDIO_DATA);
-       BUG_ON(ret); /* -ENOMEM */
- 
-       ret = btrfs_lookup_bio_sums(inode, comp_bio, sums);
-       BUG_ON(ret); /* -ENOMEM */
- 
-       ret = btrfs_map_bio(fs_info, comp_bio, mirror_num);
-       if (ret) {
-               comp_bio->bi_status = ret;
-               bio_endio(comp_bio);
-       }
- 
         return 0;
   
   fail2:
@@@ -844,6 -949,26 +951,26 @@@ fail1
   out:
         free_extent_map(em);
         return ret;
+ finish_cb:
+       if (comp_bio) {
+               comp_bio->bi_status = ret;
+               bio_endio(comp_bio);
+       }
+       /* All bytes of @cb are submitted, endio will free @cb */
+       if (cur_disk_byte == disk_bytenr + compressed_len)
+               return ret;
+ 
+       wait_var_event(cb, refcount_read(&cb->pending_sectors) ==
+                          (disk_bytenr + compressed_len - cur_disk_byte) >>
+                          fs_info->sectorsize_bits);
+       /*
+        * Even with previous bio ended, we should still have io not yet
+        * submitted, thus need to finish @cb manually.
+        */
+       ASSERT(refcount_read(&cb->pending_sectors));
+       /* Now we are the only one referring @cb, can finish it safely. */
+       finish_compressed_bio_read(cb, NULL);
+       return ret;
   }
   
   /*
diff --combined fs/btrfs/ctree.c

index 66290b2,74c8e18..c3983bd
--- 1/fs/btrfs/ctree.c
--- 2/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@@ -7,7 -7,6 +7,7 @@@
   #include <linux/slab.h>
   #include <linux/rbtree.h>
   #include <linux/mm.h>
+ +#include <linux/error-injection.h>
   #include "ctree.h"
   #include "disk-io.h"
   #include "transaction.h"
@@@ -396,7 -395,7 +396,7 @@@ static noinline int __btrfs_cow_block(s
         if (*cow_ret == buf)
                 unlock_orig = 1;
   
-       btrfs_assert_tree_locked(buf);
+       btrfs_assert_tree_write_locked(buf);
   
         WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
                 trans->transid != fs_info->running_transaction->transid);
@@@ -2488,7 -2487,7 +2488,7 @@@ static void insert_ptr(struct btrfs_tra
         int ret;
   
         BUG_ON(!path->nodes[level]);
-       btrfs_assert_tree_locked(path->nodes[level]);
+       btrfs_assert_tree_write_locked(path->nodes[level]);
         lower = path->nodes[level];
         nritems = btrfs_header_nritems(lower);
         BUG_ON(slot > nritems);
@@@ -2828,7 -2827,7 +2828,7 @@@ static int push_leaf_right(struct btrfs
         if (slot >= btrfs_header_nritems(upper) - 1)
                 return 1;
   
-       btrfs_assert_tree_locked(path->nodes[1]);
+       btrfs_assert_tree_write_locked(path->nodes[1]);
   
         right = btrfs_read_node_slot(upper, slot + 1);
         /*
@@@ -3066,7 -3065,7 +3066,7 @@@ static int push_leaf_left(struct btrfs_
         if (right_nritems == 0)
                 return 1;
   
-       btrfs_assert_tree_locked(path->nodes[1]);
+       btrfs_assert_tree_write_locked(path->nodes[1]);
   
         left = btrfs_read_node_slot(path->nodes[1], slot - 1);
         /*
@@@ -3582,40 -3581,6 +3582,6 @@@ int btrfs_split_item(struct btrfs_trans
   }
   
   /*
-  * This function duplicate a item, giving 'new_key' to the new item.
-  * It guarantees both items live in the same tree leaf and the new item
-  * is contiguous with the original item.
-  *
-  * This allows us to split file extent in place, keeping a lock on the
-  * leaf the entire time.
-  */
- int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
-                        struct btrfs_root *root,
-                        struct btrfs_path *path,
-                        const struct btrfs_key *new_key)
- {
-       struct extent_buffer *leaf;
-       int ret;
-       u32 item_size;
- 
-       leaf = path->nodes[0];
-       item_size = btrfs_item_size_nr(leaf, path->slots[0]);
-       ret = setup_leaf_for_split(trans, root, path,
-                                  item_size + sizeof(struct btrfs_item));
-       if (ret)
-               return ret;
- 
-       path->slots[0]++;
-       setup_items_for_insert(root, path, new_key, &item_size, 1);
-       leaf = path->nodes[0];
-       memcpy_extent_buffer(leaf,
-                            btrfs_item_ptr_offset(leaf, path->slots[0]),
-                            btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
-                            item_size);
-       return 0;
- }
- 
- /*
    * make the item pointed to by the path smaller.  new_size indicates
    * how small to make it, and from_end tells us if we just chop bytes
    * off the end of the item or if we shift the item to chop bytes off
@@@ -3786,13 -3751,10 +3752,10 @@@ void btrfs_extend_item(struct btrfs_pat
    *
    * @root:     root we are inserting items to
    * @path:     points to the leaf/slot where we are going to insert new items
-  * @cpu_key:  array of keys for items to be inserted
-  * @data_size:        size of the body of each item we are going to insert
-  * @nr:               size of @cpu_key/@data_size arrays
+  * @batch:      information about the batch of items to insert
    */
- void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
-                           const struct btrfs_key *cpu_key, u32 *data_size,
-                           int nr)
+ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
+                                  const struct btrfs_item_batch *batch)
   {
         struct btrfs_fs_info *fs_info = root->fs_info;
         struct btrfs_item *item;
@@@ -3804,14 -3766,14 +3767,14 @@@
         int slot;
         struct btrfs_map_token token;
         u32 total_size;
-       u32 total_data = 0;
- 
-       for (i = 0; i < nr; i++)
-               total_data += data_size[i];
-       total_size = total_data + (nr * sizeof(struct btrfs_item));
   
+       /*
+        * Before anything else, update keys in the parent and other ancestors
+        * if needed, then release the write locks on them, so that other tasks
+        * can use them while we modify the leaf.
+        */
         if (path->slots[0] == 0) {
-               btrfs_cpu_key_to_disk(&disk_key, cpu_key);
+               btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]);
                 fixup_low_keys(path, &disk_key, 1);
         }
         btrfs_unlock_up_safe(path, 1);
@@@ -3821,6 -3783,7 +3784,7 @@@
   
         nritems = btrfs_header_nritems(leaf);
         data_end = leaf_data_end(leaf);
+       total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
   
         if (btrfs_leaf_free_space(leaf) < total_size) {
                 btrfs_print_leaf(leaf);
@@@ -3850,31 -3813,32 +3814,32 @@@
                         item = btrfs_item_nr(i);
                         ioff = btrfs_token_item_offset(&token, item);
                         btrfs_set_token_item_offset(&token, item,
-                                                   ioff - total_data);
+                                                   ioff - batch->total_data_size);
                 }
                 /* shift the items */
-               memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
+               memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + batch->nr),
                               btrfs_item_nr_offset(slot),
                               (nritems - slot) * sizeof(struct btrfs_item));
   
                 /* shift the data */
                 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
-                             data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
-                             data_end, old_data - data_end);
+                                     data_end - batch->total_data_size,
+                                     BTRFS_LEAF_DATA_OFFSET + data_end,
+                                     old_data - data_end);
                 data_end = old_data;
         }
   
         /* setup the item for the new data */
-       for (i = 0; i < nr; i++) {
-               btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
+       for (i = 0; i < batch->nr; i++) {
+               btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]);
                 btrfs_set_item_key(leaf, &disk_key, slot + i);
                 item = btrfs_item_nr(slot + i);
-               data_end -= data_size[i];
+               data_end -= batch->data_sizes[i];
                 btrfs_set_token_item_offset(&token, item, data_end);
-               btrfs_set_token_item_size(&token, item, data_size[i]);
+               btrfs_set_token_item_size(&token, item, batch->data_sizes[i]);
         }
   
-       btrfs_set_header_nritems(leaf, nritems + nr);
+       btrfs_set_header_nritems(leaf, nritems + batch->nr);
         btrfs_mark_buffer_dirty(leaf);
   
         if (btrfs_leaf_free_space(leaf) < 0) {
@@@ -3884,26 -3848,43 +3849,43 @@@
   }
   
   /*
+  * Insert a new item into a leaf.
+  *
+  * @root:      The root of the btree.
+  * @path:      A path pointing to the target leaf and slot.
+  * @key:       The key of the new item.
+  * @data_size: The size of the data associated with the new key.
+  */
+ void btrfs_setup_item_for_insert(struct btrfs_root *root,
+                                struct btrfs_path *path,
+                                const struct btrfs_key *key,
+                                u32 data_size)
+ {
+       struct btrfs_item_batch batch;
+ 
+       batch.keys = key;
+       batch.data_sizes = &data_size;
+       batch.total_data_size = data_size;
+       batch.nr = 1;
+ 
+       setup_items_for_insert(root, path, &batch);
+ }
+ 
+ /*
    * Given a key and some data, insert items into the tree.
    * This does all the path init required, making room in the tree if needed.
    */
   int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root,
                             struct btrfs_path *path,
-                           const struct btrfs_key *cpu_key, u32 *data_size,
-                           int nr)
+                           const struct btrfs_item_batch *batch)
   {
         int ret = 0;
         int slot;
-       int i;
-       u32 total_size = 0;
-       u32 total_data = 0;
- 
-       for (i = 0; i < nr; i++)
-               total_data += data_size[i];
+       u32 total_size;
   
-       total_size = total_data + (nr * sizeof(struct btrfs_item));
-       ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
+       total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
+       ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1);
         if (ret == 0)
                 return -EEXIST;
         if (ret < 0)
@@@ -3912,7 -3893,7 +3894,7 @@@
         slot = path->slots[0];
         BUG_ON(slot < 0);
   
-       setup_items_for_insert(root, path, cpu_key, data_size, nr);
+       setup_items_for_insert(root, path, batch);
         return 0;
   }
   
@@@ -3944,6 -3925,40 +3926,40 @@@ int btrfs_insert_item(struct btrfs_tran
   }
   
   /*
+  * This function duplicates an item, giving 'new_key' to the new item.
+  * It guarantees both items live in the same tree leaf and the new item is
+  * contiguous with the original item.
+  *
+  * This allows us to split a file extent in place, keeping a lock on the leaf
+  * the entire time.
+  */
+ int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
+                        struct btrfs_root *root,
+                        struct btrfs_path *path,
+                        const struct btrfs_key *new_key)
+ {
+       struct extent_buffer *leaf;
+       int ret;
+       u32 item_size;
+ 
+       leaf = path->nodes[0];
+       item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+       ret = setup_leaf_for_split(trans, root, path,
+                                  item_size + sizeof(struct btrfs_item));
+       if (ret)
+               return ret;
+ 
+       path->slots[0]++;
+       btrfs_setup_item_for_insert(root, path, new_key, item_size);
+       leaf = path->nodes[0];
+       memcpy_extent_buffer(leaf,
+                            btrfs_item_ptr_offset(leaf, path->slots[0]),
+                            btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
+                            item_size);
+       return 0;
+ }
+ 
+ /*
    * delete the pointer from a given node.
    *
    * the tree should have been previously balanced so the deletion does not
diff --combined fs/btrfs/dev-replace.c

index fbb8b44,59ef388..c85a7d4
--- 1/fs/btrfs/dev-replace.c
--- 2/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@@ -70,6 -70,7 +70,7 @@@ static int btrfs_dev_replace_kthread(vo
   
   int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
   {
+       struct btrfs_dev_lookup_args args = { .devid = BTRFS_DEV_REPLACE_DEVID };
         struct btrfs_key key;
         struct btrfs_root *dev_root = fs_info->dev_root;
         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
@@@ -100,8 -101,7 +101,7 @@@ no_valid_dev_replace_entry_found
                  * We don't have a replace item or it's corrupted.  If there is
                  * a replace target, fail the mount.
                  */
-               if (btrfs_find_device(fs_info->fs_devices,
-                                     BTRFS_DEV_REPLACE_DEVID, NULL, NULL)) {
+               if (btrfs_find_device(fs_info->fs_devices, &args)) {
                         btrfs_err(fs_info,
                         "found replace target device without a valid replace item");
                         ret = -EUCLEAN;
@@@ -163,8 -163,7 +163,7 @@@
                  * We don't have an active replace item but if there is a
                  * replace target, fail the mount.
                  */
-               if (btrfs_find_device(fs_info->fs_devices,
-                                     BTRFS_DEV_REPLACE_DEVID, NULL, NULL)) {
+               if (btrfs_find_device(fs_info->fs_devices, &args)) {
                         btrfs_err(fs_info,
                         "replace devid present without an active replace item");
                         ret = -EUCLEAN;
@@@ -175,11 -174,10 +174,10 @@@
                 break;
         case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
         case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
-               dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices,
-                                               src_devid, NULL, NULL);
-               dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices,
-                                                       BTRFS_DEV_REPLACE_DEVID,
-                                                       NULL, NULL);
+               dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices, &args);
+               args.devid = src_devid;
+               dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices, &args);
+ 
                 /*
                  * allow 'btrfs dev replace_cancel' if src/tgt device is
                  * missing
@@@ -283,7 -281,8 +281,7 @@@ static int btrfs_init_dev_replace_tgtde
         }
   
   
- -      if (i_size_read(bdev->bd_inode) <
- -          btrfs_device_get_total_bytes(srcdev)) {
+ +      if (bdev_nr_bytes(bdev) < btrfs_device_get_total_bytes(srcdev)) {
                 btrfs_err(fs_info,
                           "target device is smaller than source device!");
                 ret = -EINVAL;
diff --combined fs/btrfs/disk-io.c

index 29e7598,c725433..59c3be8
--- 1/fs/btrfs/disk-io.c
--- 2/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@@ -683,7 -683,7 +683,7 @@@ err
         return ret;
   }
   
- int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio,
+ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
                                    struct page *page, u64 start, u64 end,
                                    int mirror)
   {
@@@ -1036,7 -1036,7 +1036,7 @@@ static int btree_set_page_dirty(struct 
                 BUG_ON(!eb);
                 BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
                 BUG_ON(!atomic_read(&eb->refs));
-               btrfs_assert_tree_locked(eb);
+               btrfs_assert_tree_write_locked(eb);
                 return __set_page_dirty_nobuffers(page);
         }
         ASSERT(PagePrivate(page) && page->private);
@@@ -1061,7 -1061,7 +1061,7 @@@
                 ASSERT(eb);
                 ASSERT(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
                 ASSERT(atomic_read(&eb->refs));
-               btrfs_assert_tree_locked(eb);
+               btrfs_assert_tree_write_locked(eb);
                 free_extent_buffer(eb);
   
                 cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits);
@@@ -1125,7 -1125,7 +1125,7 @@@ void btrfs_clean_tree_block(struct exte
         struct btrfs_fs_info *fs_info = buf->fs_info;
         if (btrfs_header_generation(buf) ==
             fs_info->running_transaction->transid) {
-               btrfs_assert_tree_locked(buf);
+               btrfs_assert_tree_write_locked(buf);
   
                 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
                         percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
@@@ -1500,7 -1500,7 +1500,7 @@@ static int btrfs_init_fs_root(struct bt
                 goto fail;
   
         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
-           root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
+           !btrfs_is_data_reloc_root(root)) {
                 set_bit(BTRFS_ROOT_SHAREABLE, &root->state);
                 btrfs_check_and_init_root_item(&root->root_item);
         }
@@@ -1644,6 -1644,7 +1644,7 @@@ void btrfs_free_fs_info(struct btrfs_fs
         btrfs_extent_buffer_leak_debug_check(fs_info);
         kfree(fs_info->super_copy);
         kfree(fs_info->super_for_commit);
+       kfree(fs_info->subpage_info);
         kvfree(fs_info);
   }
   
@@@ -1953,8 -1954,7 +1954,7 @@@ sleep
                 wake_up_process(fs_info->cleaner_kthread);
                 mutex_unlock(&fs_info->transaction_kthread_mutex);
   
-               if (unlikely(test_bit(BTRFS_FS_STATE_ERROR,
-                                     &fs_info->fs_state)))
+               if (BTRFS_FS_ERROR(fs_info))
                         btrfs_cleanup_transaction(fs_info);
                 if (!kthread_should_stop() &&
                                 (!btrfs_transaction_blocked(fs_info) ||
@@@ -2592,8 -2592,7 +2592,7 @@@ static int validate_super(struct btrfs_
   
         /*
          * For 4K page size, we only support 4K sector size.
-        * For 64K page size, we support read-write for 64K sector size, and
-        * read-only for 4K sector size.
+        * For 64K page size, we support 64K and 4K sector sizes.
          */
         if ((PAGE_SIZE == SZ_4K && sectorsize != PAGE_SIZE) ||
             (PAGE_SIZE == SZ_64K && (sectorsize != SZ_4K &&
@@@ -2883,6 -2882,8 +2882,8 @@@ void btrfs_init_fs_info(struct btrfs_fs
         spin_lock_init(&fs_info->buffer_lock);
         spin_lock_init(&fs_info->unused_bgs_lock);
         spin_lock_init(&fs_info->treelog_bg_lock);
+       spin_lock_init(&fs_info->zone_active_bgs_lock);
+       spin_lock_init(&fs_info->relocation_bg_lock);
         rwlock_init(&fs_info->tree_mod_log_lock);
         mutex_init(&fs_info->unused_bg_unpin_mutex);
         mutex_init(&fs_info->reclaim_bgs_lock);
@@@ -2896,6 -2897,7 +2897,7 @@@
         INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
         INIT_LIST_HEAD(&fs_info->unused_bgs);
         INIT_LIST_HEAD(&fs_info->reclaim_bgs);
+       INIT_LIST_HEAD(&fs_info->zone_active_bgs);
   #ifdef CONFIG_BTRFS_DEBUG
         INIT_LIST_HEAD(&fs_info->allocated_roots);
         INIT_LIST_HEAD(&fs_info->allocated_ebs);
@@@ -3228,12 -3230,12 +3230,12 @@@ int __cold open_ctree(struct super_bloc
         mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
         btrfs_init_btree_inode(fs_info);
   
-       invalidate_bdev(fs_devices->latest_bdev);
+       invalidate_bdev(fs_devices->latest_dev->bdev);
   
         /*
          * Read super block and check the signature bytes only
          */
-       disk_super = btrfs_read_dev_super(fs_devices->latest_bdev);
+       disk_super = btrfs_read_dev_super(fs_devices->latest_dev->bdev);
         if (IS_ERR(disk_super)) {
                 err = PTR_ERR(disk_super);
                 goto fail_alloc;
@@@ -3392,12 -3394,12 +3394,12 @@@
                 goto fail_alloc;
         }
   
-       if (sectorsize != PAGE_SIZE) {
+       if (sectorsize < PAGE_SIZE) {
+               struct btrfs_subpage_info *subpage_info;
+ 
                 btrfs_warn(fs_info,
                 "read-write for sector size %u with page size %lu is experimental",
                            sectorsize, PAGE_SIZE);
-       }
-       if (sectorsize != PAGE_SIZE) {
                 if (btrfs_super_incompat_flags(fs_info->super_copy) &
                         BTRFS_FEATURE_INCOMPAT_RAID56) {
                         btrfs_err(fs_info,
@@@ -3406,6 -3408,11 +3408,11 @@@
                         err = -EINVAL;
                         goto fail_alloc;
                 }
+               subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL);
+               if (!subpage_info)
+                       goto fail_alloc;
+               btrfs_init_subpage_info(subpage_info, sectorsize);
+               fs_info->subpage_info = subpage_info;
         }
   
         ret = btrfs_init_workqueues(fs_info, fs_devices);
@@@ -3465,7 -3472,7 +3472,7 @@@
          * below in btrfs_init_dev_replace().
          */
         btrfs_free_extra_devids(fs_devices);
-       if (!fs_devices->latest_bdev) {
+       if (!fs_devices->latest_dev->bdev) {
                 btrfs_err(fs_info, "failed to read devices");
                 goto fail_tree_roots;
         }
@@@ -3556,7 -3563,8 +3563,8 @@@
                 goto fail_sysfs;
         }
   
-       if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info, NULL)) {
+       if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices &&
+           !btrfs_check_rw_degradable(fs_info, NULL)) {
                 btrfs_warn(fs_info,
                 "writable mount is not allowed due to too many missing devices");
                 goto fail_sysfs;
@@@ -3740,7 -3748,7 +3748,7 @@@ struct btrfs_super_block *btrfs_read_de
         else if (ret)
                 return ERR_PTR(ret);
   
- -      if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode))
+ +      if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev))
                 return ERR_PTR(-EINVAL);
   
         page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
@@@ -3881,7 -3889,9 +3889,9 @@@ static int write_dev_supers(struct btrf
                         bio->bi_opf |= REQ_FUA;
   
                 btrfsic_submit_bio(bio);
-               btrfs_advance_sb_log(device, i);
+ 
+               if (btrfs_advance_sb_log(device, i))
+                       errors++;
         }
         return errors < i ? 0 : -1;
   }
@@@ -4221,7 -4231,7 +4231,7 @@@ void btrfs_drop_and_free_fs_root(struc
                 drop_ref = true;
         spin_unlock(&fs_info->fs_roots_radix_lock);
   
-       if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+       if (BTRFS_FS_ERROR(fs_info)) {
                 ASSERT(root->log_root == NULL);
                 if (root->reloc_root) {
                         btrfs_put_root(root->reloc_root);
@@@ -4372,8 -4382,7 +4382,7 @@@ void __cold close_ctree(struct btrfs_fs
                         btrfs_err(fs_info, "commit super ret %d", ret);
         }
   
-       if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state) ||
-           test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state))
+       if (BTRFS_FS_ERROR(fs_info))
                 btrfs_error_commit_super(fs_info);
   
         kthread_stop(fs_info->transaction_kthread);
@@@ -4470,7 -4479,7 +4479,7 @@@ void btrfs_mark_buffer_dirty(struct ext
         if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags)))
                 return;
   #endif
-       btrfs_assert_tree_locked(buf);
+       btrfs_assert_tree_write_locked(buf);
         if (transid != fs_info->generation)
                 WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n",
                         buf->start, transid, fs_info->generation);
diff --combined fs/btrfs/inode.c

index 954b53a,5fec009..b8c911a
--- 1/fs/btrfs/inode.c
--- 2/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@@ -6,7 -6,6 +6,7 @@@
   #include <crypto/hash.h>
   #include <linux/kernel.h>
   #include <linux/bio.h>
+ +#include <linux/blk-cgroup.h>
   #include <linux/file.h>
   #include <linux/fs.h>
   #include <linux/pagemap.h>
@@@ -288,9 -287,8 +288,9 @@@ static int insert_inline_extent(struct 
                         cur_size = min_t(unsigned long, compressed_size,
                                        PAGE_SIZE);
   
- -                      kaddr = page_address(cpage);
+ +                      kaddr = kmap_atomic(cpage);
                         write_extent_buffer(leaf, kaddr, ptr, cur_size);
+ +                      kunmap_atomic(kaddr);
   
                         i++;
                         ptr += cur_size;
@@@ -457,11 -455,10 +457,10 @@@ struct async_chunk 
         struct list_head extents;
         struct cgroup_subsys_state *blkcg_css;
         struct btrfs_work work;
-       atomic_t *pending;
+       struct async_cow *async_cow;
   };
   
   struct async_cow {
-       /* Number of chunks in flight; must be first in the structure */
         atomic_t num_chunks;
         struct async_chunk chunks[];
   };
@@@ -492,9 -489,6 +491,6 @@@ static noinline int add_async_extent(st
    */
   static inline bool inode_can_compress(struct btrfs_inode *inode)
   {
-       /* Subpage doesn't support compression yet */
-       if (inode->root->fs_info->sectorsize < PAGE_SIZE)
-               return false;
         if (inode->flags & BTRFS_INODE_NODATACOW ||
             inode->flags & BTRFS_INODE_NODATASUM)
                 return false;
@@@ -516,6 -510,38 +512,38 @@@ static inline int inode_need_compress(s
                         btrfs_ino(inode));
                 return 0;
         }
+       /*
+        * Special check for subpage.
+        *
+        * We lock the full page then run each delalloc range in the page, thus
+        * for the following case, we will hit some subpage specific corner case:
+        *
+        * 0            32K             64K
+        * |    |///////|       |///////|
+        *              \- A            \- B
+        *
+        * In the above case, both range A and range B will try to unlock the
+        * full page [0, 64K), so the one finishing later will find the page
+        * already unlocked, triggering various page lock requirement BUG_ON()s.
+        *
+        * So here we add an artificial limit that subpage compression can only
+        * happen if the range is fully page aligned.
+        *
+        * In theory we only need to ensure the first page is fully covered, but
+        * the trailing partial page will be locked until the full compression
+        * finishes, delaying the write of other ranges.
+        *
+        * TODO: Make btrfs_run_delalloc_range() lock all delalloc ranges
+        * first to prevent any submitted async extent from unlocking the full
+        * page.  That way, for the subpage case, only the last async_cow will
+        * unlock the full page.
+        */
+       if (fs_info->sectorsize < PAGE_SIZE) {
+               if (!IS_ALIGNED(start, PAGE_SIZE) ||
+                   !IS_ALIGNED(end + 1, PAGE_SIZE))
+                       return 0;
+       }
+ 
         /* force compress */
         if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
                 return 1;
@@@ -617,13 -643,24 +645,24 @@@ again
         total_compressed = actual_end - start;
   
         /*
-        * skip compression for a small file range(<=blocksize) that
+        * Skip compression for a small file range(<=blocksize) that
          * isn't an inline extent, since it doesn't save disk space at all.
          */
         if (total_compressed <= blocksize &&
            (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
                 goto cleanup_and_bail_uncompressed;
   
+       /*
+        * For subpage case, we require full page alignment for the sector
+        * aligned range.
+        * Thus we must also check against @actual_end, not just @end.
+        */
+       if (blocksize < PAGE_SIZE) {
+               if (!IS_ALIGNED(start, PAGE_SIZE) ||
+                   !IS_ALIGNED(round_up(actual_end, blocksize), PAGE_SIZE))
+                       goto cleanup_and_bail_uncompressed;
+       }
+ 
         total_compressed = min_t(unsigned long, total_compressed,
                         BTRFS_MAX_UNCOMPRESSED);
         total_in = 0;
@@@ -761,7 -798,7 +800,7 @@@ cont
                  * win, compare the page count read with the blocks on disk,
                  * compression must free at least one sector size
                  */
-               total_in = ALIGN(total_in, PAGE_SIZE);
+               total_in = round_up(total_in, fs_info->sectorsize);
                 if (total_compressed + blocksize <= total_in) {
                         compressed_extents++;
   
@@@ -842,166 -879,148 +881,148 @@@ static void free_async_extent_pages(str
         async_extent->pages = NULL;
   }
   
- /*
-  * phase two of compressed writeback.  This is the ordered portion
-  * of the code, which only gets called in the order the work was
-  * queued.  We walk all the async extents created by compress_file_range
-  * and send them down to the disk.
-  */
- static noinline void submit_compressed_extents(struct async_chunk *async_chunk)
+ static int submit_uncompressed_range(struct btrfs_inode *inode,
+                                    struct async_extent *async_extent,
+                                    struct page *locked_page)
   {
-       struct btrfs_inode *inode = BTRFS_I(async_chunk->inode);
-       struct btrfs_fs_info *fs_info = inode->root->fs_info;
-       struct async_extent *async_extent;
-       u64 alloc_hint = 0;
-       struct btrfs_key ins;
-       struct extent_map *em;
-       struct btrfs_root *root = inode->root;
-       struct extent_io_tree *io_tree = &inode->io_tree;
-       int ret = 0;
- 
- again:
-       while (!list_empty(&async_chunk->extents)) {
-               async_extent = list_entry(async_chunk->extents.next,
-                                         struct async_extent, list);
-               list_del(&async_extent->list);
- 
- retry:
-               lock_extent(io_tree, async_extent->start,
-                           async_extent->start + async_extent->ram_size - 1);
-               /* did the compression code fall back to uncompressed IO? */
-               if (!async_extent->pages) {
-                       int page_started = 0;
-                       unsigned long nr_written = 0;
+       u64 start = async_extent->start;
+       u64 end = async_extent->start + async_extent->ram_size - 1;
+       unsigned long nr_written = 0;
+       int page_started = 0;
+       int ret;
   
-                       /* allocate blocks */
-                       ret = cow_file_range(inode, async_chunk->locked_page,
-                                            async_extent->start,
-                                            async_extent->start +
-                                            async_extent->ram_size - 1,
-                                            &page_started, &nr_written, 0);
+       /*
+        * Call cow_file_range() to run the delalloc range directly, since we
+        * won't go to NOCOW or async path again.
+        *
+        * Also we call cow_file_range() with @unlock_page == 0, so that we
+        * can directly submit them without interruption.
+        */
+       ret = cow_file_range(inode, locked_page, start, end, &page_started,
+                            &nr_written, 0);
+       /* Inline extent inserted, page gets unlocked and everything is done */
+       if (page_started) {
+               ret = 0;
+               goto out;
+       }
+       if (ret < 0) {
+               if (locked_page)
+                       unlock_page(locked_page);
+               goto out;
+       }
   
-                       /* JDM XXX */
+       ret = extent_write_locked_range(&inode->vfs_inode, start, end);
+       /* All pages will be unlocked, including @locked_page */
+ out:
+       kfree(async_extent);
+       return ret;
+ }
   
-                       /*
-                        * if page_started, cow_file_range inserted an
-                        * inline extent and took care of all the unlocking
-                        * and IO for us.  Otherwise, we need to submit
-                        * all those pages down to the drive.
-                        */
-                       if (!page_started && !ret)
-                               extent_write_locked_range(&inode->vfs_inode,
-                                                 async_extent->start,
-                                                 async_extent->start +
-                                                 async_extent->ram_size - 1,
-                                                 WB_SYNC_ALL);
-                       else if (ret && async_chunk->locked_page)
-                               unlock_page(async_chunk->locked_page);
-                       kfree(async_extent);
-                       cond_resched();
-                       continue;
-               }
+ static int submit_one_async_extent(struct btrfs_inode *inode,
+                                  struct async_chunk *async_chunk,
+                                  struct async_extent *async_extent,
+                                  u64 *alloc_hint)
+ {
+       struct extent_io_tree *io_tree = &inode->io_tree;
+       struct btrfs_root *root = inode->root;
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct btrfs_key ins;
+       struct page *locked_page = NULL;
+       struct extent_map *em;
+       int ret = 0;
+       u64 start = async_extent->start;
+       u64 end = async_extent->start + async_extent->ram_size - 1;
   
-               ret = btrfs_reserve_extent(root, async_extent->ram_size,
-                                          async_extent->compressed_size,
-                                          async_extent->compressed_size,
-                                          0, alloc_hint, &ins, 1, 1);
-               if (ret) {
-                       free_async_extent_pages(async_extent);
+       /*
+        * If async_chunk->locked_page is in the async_extent range, we need to
+        * handle it.
+        */
+       if (async_chunk->locked_page) {
+               u64 locked_page_start = page_offset(async_chunk->locked_page);
+               u64 locked_page_end = locked_page_start + PAGE_SIZE - 1;
   
-                       if (ret == -ENOSPC) {
-                               unlock_extent(io_tree, async_extent->start,
-                                             async_extent->start +
-                                             async_extent->ram_size - 1);
+               if (!(start >= locked_page_end || end <= locked_page_start))
+                       locked_page = async_chunk->locked_page;
+       }
+       lock_extent(io_tree, start, end);
   
-                               /*
-                                * we need to redirty the pages if we decide to
-                                * fallback to uncompressed IO, otherwise we
-                                * will not submit these pages down to lower
-                                * layers.
-                                */
-                               extent_range_redirty_for_io(&inode->vfs_inode,
-                                               async_extent->start,
-                                               async_extent->start +
-                                               async_extent->ram_size - 1);
+       /* We have fallen back to uncompressed write */
+       if (!async_extent->pages)
+               return submit_uncompressed_range(inode, async_extent, locked_page);
   
-                               goto retry;
-                       }
-                       goto out_free;
-               }
+       ret = btrfs_reserve_extent(root, async_extent->ram_size,
+                                  async_extent->compressed_size,
+                                  async_extent->compressed_size,
+                                  0, *alloc_hint, &ins, 1, 1);
+       if (ret) {
+               free_async_extent_pages(async_extent);
                 /*
-                * here we're doing allocation and writeback of the
-                * compressed pages
+                * Here we used to try again by going back to non-compressed
+                * path for ENOSPC.  But if we can't reserve space even for
+                * the compressed size, how could it work for the uncompressed
+                * size, which requires more space?  So here we go directly
+                * to the error path.
                  */
-               em = create_io_em(inode, async_extent->start,
-                                 async_extent->ram_size, /* len */
-                                 async_extent->start, /* orig_start */
-                                 ins.objectid, /* block_start */
-                                 ins.offset, /* block_len */
-                                 ins.offset, /* orig_block_len */
-                                 async_extent->ram_size, /* ram_bytes */
-                                 async_extent->compress_type,
-                                 BTRFS_ORDERED_COMPRESSED);
-               if (IS_ERR(em))
-                       /* ret value is not necessary due to void function */
-                       goto out_free_reserve;
-               free_extent_map(em);
- 
-               ret = btrfs_add_ordered_extent_compress(inode,
-                                               async_extent->start,
-                                               ins.objectid,
-                                               async_extent->ram_size,
-                                               ins.offset,
-                                               async_extent->compress_type);
-               if (ret) {
-                       btrfs_drop_extent_cache(inode, async_extent->start,
-                                               async_extent->start +
-                                               async_extent->ram_size - 1, 0);
-                       goto out_free_reserve;
-               }
-               btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+               goto out_free;
+       }
+ 
+       /* Here we're doing allocation and writeback of the compressed pages */
+       em = create_io_em(inode, start,
+                         async_extent->ram_size,       /* len */
+                         start,                        /* orig_start */
+                         ins.objectid,                 /* block_start */
+                         ins.offset,                   /* block_len */
+                         ins.offset,                   /* orig_block_len */
+                         async_extent->ram_size,       /* ram_bytes */
+                         async_extent->compress_type,
+                         BTRFS_ORDERED_COMPRESSED);
+       if (IS_ERR(em)) {
+               ret = PTR_ERR(em);
+               goto out_free_reserve;
+       }
+       free_extent_map(em);
   
-               /*
-                * clear dirty, set writeback and unlock the pages.
-                */
-               extent_clear_unlock_delalloc(inode, async_extent->start,
-                               async_extent->start +
-                               async_extent->ram_size - 1,
-                               NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
-                               PAGE_UNLOCK | PAGE_START_WRITEBACK);
-               if (btrfs_submit_compressed_write(inode, async_extent->start,
-                                   async_extent->ram_size,
-                                   ins.objectid,
-                                   ins.offset, async_extent->pages,
-                                   async_extent->nr_pages,
-                                   async_chunk->write_flags,
-                                   async_chunk->blkcg_css)) {
-                       struct page *p = async_extent->pages[0];
-                       const u64 start = async_extent->start;
-                       const u64 end = start + async_extent->ram_size - 1;
- 
-                       p->mapping = inode->vfs_inode.i_mapping;
-                       btrfs_writepage_endio_finish_ordered(inode, p, start,
-                                                            end, false);
- 
-                       p->mapping = NULL;
-                       extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
-                                                    PAGE_END_WRITEBACK |
-                                                    PAGE_SET_ERROR);
-                       free_async_extent_pages(async_extent);
-               }
-               alloc_hint = ins.objectid + ins.offset;
-               kfree(async_extent);
-               cond_resched();
+       ret = btrfs_add_ordered_extent_compress(inode, start,   /* file_offset */
+                                       ins.objectid,           /* disk_bytenr */
+                                       async_extent->ram_size, /* num_bytes */
+                                       ins.offset,             /* disk_num_bytes */
+                                       async_extent->compress_type);
+       if (ret) {
+               btrfs_drop_extent_cache(inode, start, end, 0);
+               goto out_free_reserve;
         }
-       return;
+       btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+ 
+       /* Clear dirty, set writeback and unlock the pages. */
+       extent_clear_unlock_delalloc(inode, start, end,
+                       NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
+                       PAGE_UNLOCK | PAGE_START_WRITEBACK);
+       if (btrfs_submit_compressed_write(inode, start, /* file_offset */
+                           async_extent->ram_size,     /* num_bytes */
+                           ins.objectid,               /* disk_bytenr */
+                           ins.offset,                 /* compressed_len */
+                           async_extent->pages,        /* compressed_pages */
+                           async_extent->nr_pages,
+                           async_chunk->write_flags,
+                           async_chunk->blkcg_css)) {
+               const u64 start = async_extent->start;
+               const u64 end = start + async_extent->ram_size - 1;
+ 
+               btrfs_writepage_endio_finish_ordered(inode, NULL, start, end, 0);
+ 
+               extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
+                                            PAGE_END_WRITEBACK | PAGE_SET_ERROR);
+               free_async_extent_pages(async_extent);
+       }
+       *alloc_hint = ins.objectid + ins.offset;
+       kfree(async_extent);
+       return ret;
+ 
   out_free_reserve:
         btrfs_dec_block_group_reservations(fs_info, ins.objectid);
         btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
   out_free:
-       extent_clear_unlock_delalloc(inode, async_extent->start,
-                                    async_extent->start +
-                                    async_extent->ram_size - 1,
+       extent_clear_unlock_delalloc(inode, start, end,
                                      NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
                                      EXTENT_DELALLOC_NEW |
                                      EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
@@@ -1009,7 -1028,39 +1030,39 @@@
                                      PAGE_END_WRITEBACK | PAGE_SET_ERROR);
         free_async_extent_pages(async_extent);
         kfree(async_extent);
-       goto again;
+       return ret;
+ }
+ 
+ /*
+  * Phase two of compressed writeback.  This is the ordered portion of the code,
+  * which only gets called in the order the work was queued.  We walk all the
+  * async extents created by compress_file_range and send them down to the disk.
+  */
+ static noinline void submit_compressed_extents(struct async_chunk *async_chunk)
+ {
+       struct btrfs_inode *inode = BTRFS_I(async_chunk->inode);
+       struct btrfs_fs_info *fs_info = inode->root->fs_info;
+       struct async_extent *async_extent;
+       u64 alloc_hint = 0;
+       int ret = 0;
+ 
+       while (!list_empty(&async_chunk->extents)) {
+               u64 extent_start;
+               u64 ram_size;
+ 
+               async_extent = list_entry(async_chunk->extents.next,
+                                         struct async_extent, list);
+               list_del(&async_extent->list);
+               extent_start = async_extent->start;
+               ram_size = async_extent->ram_size;
+ 
+               ret = submit_one_async_extent(inode, async_chunk, async_extent,
+                                             &alloc_hint);
+               btrfs_debug(fs_info,
+ "async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d",
+                           inode->root->root_key.objectid,
+                           btrfs_ino(inode), extent_start, ram_size, ret);
+       }
   }
   
   static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
@@@ -1152,7 -1203,7 +1205,7 @@@ static noinline int cow_file_range(stru
          * fails during the stage where it updates the bytenr of file extent
          * items.
          */
-       if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+       if (btrfs_is_data_reloc_root(root))
                 min_alloc_size = num_bytes;
         else
                 min_alloc_size = fs_info->sectorsize;
@@@ -1188,8 -1239,7 +1241,7 @@@
                 if (ret)
                         goto out_drop_extent_cache;
   
-               if (root->root_key.objectid ==
-                   BTRFS_DATA_RELOC_TREE_OBJECTID) {
+               if (btrfs_is_data_reloc_root(root)) {
                         ret = btrfs_reloc_clone_csums(inode, start,
                                                       cur_alloc_size);
                         /*
@@@ -1327,18 -1377,17 +1379,17 @@@ static noinline void async_cow_submit(s
   static noinline void async_cow_free(struct btrfs_work *work)
   {
         struct async_chunk *async_chunk;
+       struct async_cow *async_cow;
   
         async_chunk = container_of(work, struct async_chunk, work);
         if (async_chunk->inode)
                 btrfs_add_delayed_iput(async_chunk->inode);
         if (async_chunk->blkcg_css)
                 css_put(async_chunk->blkcg_css);
-       /*
-        * Since the pointer to 'pending' is at the beginning of the array of
-        * async_chunk's, freeing it ensures the whole array has been freed.
-        */
-       if (atomic_dec_and_test(async_chunk->pending))
-               kvfree(async_chunk->pending);
+ 
+       async_cow = async_chunk->async_cow;
+       if (atomic_dec_and_test(&async_cow->num_chunks))
+               kvfree(async_cow);
   }
   
   static int cow_file_range_async(struct btrfs_inode *inode,
@@@ -1399,7 -1448,7 +1450,7 @@@
                  * lightweight reference for the callback lifetime
                  */
                 ihold(&inode->vfs_inode);
-               async_chunk[i].pending = &ctx->num_chunks;
+               async_chunk[i].async_cow = ctx;
                 async_chunk[i].inode = &inode->vfs_inode;
                 async_chunk[i].start = start;
                 async_chunk[i].end = cur_end;
@@@ -1472,7 -1521,7 +1523,7 @@@ static noinline int run_delalloc_zoned(
   
         __set_page_dirty_nobuffers(locked_page);
         account_page_redirty(locked_page);
-       extent_write_locked_range(&inode->vfs_inode, start, end, WB_SYNC_ALL);
+       extent_write_locked_range(&inode->vfs_inode, start, end);
         *page_started = 1;
   
         return 0;
@@@ -1505,8 -1554,7 +1556,7 @@@ static int fallback_to_cow(struct btrfs
                            int *page_started, unsigned long *nr_written)
   {
         const bool is_space_ino = btrfs_is_free_space_inode(inode);
-       const bool is_reloc_ino = (inode->root->root_key.objectid ==
-                                  BTRFS_DATA_RELOC_TREE_OBJECTID);
+       const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root);
         const u64 range_bytes = end + 1 - start;
         struct extent_io_tree *io_tree = &inode->io_tree;
         u64 range_start = start;
@@@ -1868,8 -1916,7 +1918,7 @@@ out_check
                         btrfs_dec_nocow_writers(fs_info, disk_bytenr);
                 nocow = false;
   
-               if (root->root_key.objectid ==
-                   BTRFS_DATA_RELOC_TREE_OBJECTID)
+               if (btrfs_is_data_reloc_root(root))
                         /*
                          * Error handled later, as we must prevent
                          * extent_clear_unlock_delalloc() in error handler
@@@ -1948,8 -1995,23 +1997,23 @@@ int btrfs_run_delalloc_range(struct btr
         int ret;
         const bool zoned = btrfs_is_zoned(inode->root->fs_info);
   
+       /*
+        * The range must cover part of the @locked_page, or the returned
+        * @page_started can confuse the caller.
+        */
+       ASSERT(!(end <= page_offset(locked_page) ||
+                start >= page_offset(locked_page) + PAGE_SIZE));
+ 
         if (should_nocow(inode, start, end)) {
-               ASSERT(!zoned);
+               /*
+                * Normally on a zoned device we're only doing COW writes, but
+                * in case of relocation on a zoned filesystem we have taken
+                * precaution, that we're only writing sequentially. It's safe
+                * to use run_delalloc_nocow() here, like for regular
+                * preallocated inodes.
+                */
+               ASSERT(!zoned ||
+                      (zoned && btrfs_is_data_reloc_root(inode->root)));
                 ret = run_delalloc_nocow(inode, locked_page, start, end,
                                          page_started, nr_written);
         } else if (!inode_can_compress(inode) ||
@@@ -2208,7 -2270,7 +2272,7 @@@ void btrfs_clear_delalloc_extent(struc
                 if (btrfs_is_testing(fs_info))
                         return;
   
-               if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID &&
+               if (!btrfs_is_data_reloc_root(root) &&
                     do_list && !(state->state & EXTENT_NORESERVE) &&
                     (*bits & EXTENT_CLEAR_DATA_RESV))
                         btrfs_free_reserved_data_space_noquota(fs_info, len);
@@@ -2236,48 -2298,6 +2300,6 @@@
   }
   
   /*
-  * btrfs_bio_fits_in_stripe - Checks whether the size of the given bio will fit
-  * in a chunk's stripe. This function ensures that bios do not span a
-  * stripe/chunk
-  *
-  * @page - The page we are about to add to the bio
-  * @size - size we want to add to the bio
-  * @bio - bio we want to ensure is smaller than a stripe
-  * @bio_flags - flags of the bio
-  *
-  * return 1 if page cannot be added to the bio
-  * return 0 if page can be added to the bio
-  * return error otherwise
-  */
- int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
-                            unsigned long bio_flags)
- {
-       struct inode *inode = page->mapping->host;
-       struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-       u64 logical = bio->bi_iter.bi_sector << 9;
-       u32 bio_len = bio->bi_iter.bi_size;
-       struct extent_map *em;
-       int ret = 0;
-       struct btrfs_io_geometry geom;
- 
-       if (bio_flags & EXTENT_BIO_COMPRESSED)
-               return 0;
- 
-       em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize);
-       if (IS_ERR(em))
-               return PTR_ERR(em);
-       ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), logical, &geom);
-       if (ret < 0)
-               goto out;
- 
-       if (geom.len < bio_len + size)
-               ret = 1;
- out:
-       free_extent_map(em);
-       return ret;
- }
- 
- /*
    * in order to insert checksums into the metadata in large chunks,
    * we wait until bio submission time.   All the pages in the bio are
    * checksummed and sums are attached onto the ordered extent record.
@@@ -2533,7 -2553,7 +2555,7 @@@ blk_status_t btrfs_submit_data_bio(stru
                 goto mapit;
         } else if (async && !skip_sum) {
                 /* csum items have already been cloned */
-               if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+               if (btrfs_is_data_reloc_root(root))
                         goto mapit;
                 /* we're doing a write, do the async checksumming */
                 ret = btrfs_wq_submit_bio(inode, bio, mirror_num, bio_flags,
@@@ -2766,7 -2786,7 +2788,7 @@@ out_page
                 clear_page_dirty_for_io(page);
                 SetPageError(page);
         }
-       ClearPageChecked(page);
+       btrfs_page_clear_checked(inode->root->fs_info, page, page_start, PAGE_SIZE);
         unlock_page(page);
         put_page(page);
         kfree(fixup);
@@@ -2821,7 -2841,7 +2843,7 @@@ int btrfs_writepage_cow_fixup(struct pa
          * page->mapping outside of the page lock.
          */
         ihold(inode);
-       SetPageChecked(page);
+       btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE);
         get_page(page);
         btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
         fixup->page = page;
@@@ -3012,8 -3032,12 +3034,12 @@@ static int btrfs_finish_ordered_io(stru
                 goto out;
         }
   
-       if (ordered_extent->bdev)
+       /* A valid bdev implies a write on a sequential zone */
+       if (ordered_extent->bdev) {
                 btrfs_rewrite_logical_zoned(ordered_extent);
+               btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
+                                       ordered_extent->disk_num_bytes);
+       }
   
         btrfs_free_io_failure_record(inode, start, end);
   
@@@ -3210,7 -3234,7 +3236,7 @@@ void btrfs_writepage_endio_finish_order
    *
    * The length of such check is always one sector size.
    */
- static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio,
+ static int check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
                            u32 bio_offset, struct page *page, u32 pgoff,
                            u64 start)
   {
@@@ -3226,7 -3250,7 +3252,7 @@@
         ASSERT(pgoff + len <= PAGE_SIZE);
   
         offset_sectors = bio_offset >> fs_info->sectorsize_bits;
-       csum_expected = ((u8 *)io_bio->csum) + offset_sectors * csum_size;
+       csum_expected = ((u8 *)bbio->csum) + offset_sectors * csum_size;
   
         kaddr = kmap_atomic(page);
         shash->tfm = fs_info->csum_shash;
@@@ -3240,9 -3264,9 +3266,9 @@@
         return 0;
   zeroit:
         btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
-                                   io_bio->mirror_num);
-       if (io_bio->device)
-               btrfs_dev_stat_inc_and_print(io_bio->device,
+                                   bbio->mirror_num);
+       if (bbio->device)
+               btrfs_dev_stat_inc_and_print(bbio->device,
                                              BTRFS_DEV_STAT_CORRUPTION_ERRS);
         memset(kaddr + pgoff, 1, len);
         flush_dcache_page(page);
@@@ -3262,33 -3286,29 +3288,29 @@@
    * Return a bitmap where bit set means a csum mismatch, and bit not set means
    * csum match.
    */
- unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
-                                   struct page *page, u64 start, u64 end)
+ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
+                                   u32 bio_offset, struct page *page,
+                                   u64 start, u64 end)
   {
         struct inode *inode = page->mapping->host;
+       struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
         struct btrfs_root *root = BTRFS_I(inode)->root;
         const u32 sectorsize = root->fs_info->sectorsize;
         u32 pg_off;
         unsigned int result = 0;
   
-       if (PageChecked(page)) {
-               ClearPageChecked(page);
+       if (btrfs_page_test_checked(fs_info, page, start, end + 1 - start)) {
+               btrfs_page_clear_checked(fs_info, page, start, end + 1 - start);
                 return 0;
         }
   
         /*
-        * For subpage case, above PageChecked is not safe as it's not subpage
-        * compatible.
-        * But for now only cow fixup and compressed read utilize PageChecked
-        * flag, while in this context we can easily use io_bio->csum to
-        * determine if we really need to do csum verification.
-        *
-        * So for now, just exit if io_bio->csum is NULL, as it means it's
-        * compressed read, and its compressed data csum has already been
-        * verified.
+        * This only happens for NODATASUM or compressed read.
+        * Normally this should be covered by above check for compressed read
+        * or the next check for NODATASUM.  Just do a quicker exit here.
          */
-       if (io_bio->csum == NULL)
+       if (bbio->csum == NULL)
                 return 0;
   
         if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
@@@ -3305,7 -3325,7 +3327,7 @@@
                 u64 file_offset = pg_off + page_offset(page);
                 int ret;
   
-               if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
+               if (btrfs_is_data_reloc_root(root) &&
                     test_range_bit(io_tree, file_offset,
                                    file_offset + sectorsize - 1,
                                    EXTENT_NODATASUM, 1, NULL)) {
@@@ -3315,7 -3335,7 +3337,7 @@@
                                           EXTENT_NODATASUM);
                         continue;
                 }
-               ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off,
+               ret = check_data_csum(inode, bbio, bio_offset, page, pg_off,
                                       page_offset(page) + pg_off);
                 if (ret < 0) {
                         const int nr_bit = (pg_off - offset_in_page(start)) >>
@@@ -4006,7 -4026,7 +4028,7 @@@ noinline int btrfs_update_inode(struct 
          * without delay
          */
         if (!btrfs_is_free_space_inode(inode)
-           && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
+           && !btrfs_is_data_reloc_root(root)
             && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
                 btrfs_update_root_times(trans, root);
   
@@@ -4036,11 -4056,11 +4058,11 @@@ int btrfs_update_inode_fallback(struct 
    * also drops the back refs in the inode to the directory
    */
   static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
-                               struct btrfs_root *root,
                                 struct btrfs_inode *dir,
                                 struct btrfs_inode *inode,
                                 const char *name, int name_len)
   {
+       struct btrfs_root *root = dir->root;
         struct btrfs_fs_info *fs_info = root->fs_info;
         struct btrfs_path *path;
         int ret = 0;
@@@ -4100,19 -4120,9 +4122,9 @@@ skip_backref
                 goto err;
         }
   
-       ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode,
-                       dir_ino);
-       if (ret != 0 && ret != -ENOENT) {
-               btrfs_abort_transaction(trans, ret);
-               goto err;
-       }
- 
-       ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir,
-                       index);
-       if (ret == -ENOENT)
-               ret = 0;
-       else if (ret)
-               btrfs_abort_transaction(trans, ret);
+       btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode,
+                                  dir_ino);
+       btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, index);
   
         /*
          * If we have a pending delayed iput we could end up with the final iput
@@@ -4140,15 -4150,14 +4152,14 @@@ out
   }
   
   int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
-                      struct btrfs_root *root,
                        struct btrfs_inode *dir, struct btrfs_inode *inode,
                        const char *name, int name_len)
   {
         int ret;
-       ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
+       ret = __btrfs_unlink_inode(trans, dir, inode, name, name_len);
         if (!ret) {
                 drop_nlink(&inode->vfs_inode);
-               ret = btrfs_update_inode(trans, root, inode);
+               ret = btrfs_update_inode(trans, inode->root, inode);
         }
         return ret;
   }
@@@ -4177,7 -4186,6 +4188,6 @@@ static struct btrfs_trans_handle *__unl
   
   static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
   {
-       struct btrfs_root *root = BTRFS_I(dir)->root;
         struct btrfs_trans_handle *trans;
         struct inode *inode = d_inode(dentry);
         int ret;
@@@ -4189,7 -4197,7 +4199,7 @@@
         btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
                         0);
   
-       ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
+       ret = btrfs_unlink_inode(trans, BTRFS_I(dir),
                         BTRFS_I(d_inode(dentry)), dentry->d_name.name,
                         dentry->d_name.len);
         if (ret)
@@@ -4203,7 -4211,7 +4213,7 @@@
   
   out:
         btrfs_end_transaction(trans);
-       btrfs_btree_balance_dirty(root->fs_info);
+       btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info);
         return ret;
   }
   
@@@ -4370,7 -4378,7 +4380,7 @@@ static void btrfs_prune_dentries(struc
         struct inode *inode;
         u64 objectid = 0;
   
-       if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+       if (!BTRFS_FS_ERROR(fs_info))
                 WARN_ON(btrfs_root_refs(&root->root_item) != 0);
   
         spin_lock(&root->inode_lock);
@@@ -4554,7 -4562,6 +4564,6 @@@ static int btrfs_rmdir(struct inode *di
   {
         struct inode *inode = d_inode(dentry);
         int err = 0;
-       struct btrfs_root *root = BTRFS_I(dir)->root;
         struct btrfs_trans_handle *trans;
         u64 last_unlink_trans;
   
@@@ -4579,7 -4586,7 +4588,7 @@@
         last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
   
         /* now the directory is empty */
-       err = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
+       err = btrfs_unlink_inode(trans, BTRFS_I(dir),
                         BTRFS_I(d_inode(dentry)), dentry->d_name.name,
                         dentry->d_name.len);
         if (!err) {
@@@ -4600,7 -4607,7 +4609,7 @@@
         }
   out:
         btrfs_end_transaction(trans);
-       btrfs_btree_balance_dirty(root->fs_info);
+       btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info);
   
         return err;
   }
@@@ -4909,9 -4916,9 +4918,9 @@@ delete
   
                         btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF,
                                         extent_start, extent_num_bytes, 0);
-                       ref.real_root = root->root_key.objectid;
                         btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
-                                       ino, extent_offset);
+                                       ino, extent_offset,
+                                       root->root_key.objectid, false);
                         ret = btrfs_free_extent(trans, &ref);
                         if (ret) {
                                 btrfs_abort_transaction(trans, ret);
@@@ -5107,7 -5114,8 +5116,8 @@@ again
                                      len);
                 flush_dcache_page(page);
         }
-       ClearPageChecked(page);
+       btrfs_page_clear_checked(fs_info, page, block_start,
+                                block_end + 1 - block_start);
         btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start);
         unlock_extent_cached(io_tree, block_start, block_end, &cached_state);
   
@@@ -6437,7 -6445,7 +6447,7 @@@ static struct inode *btrfs_new_inode(st
         struct btrfs_inode_ref *ref;
         struct btrfs_key key[2];
         u32 sizes[2];
-       int nitems = name ? 2 : 1;
+       struct btrfs_item_batch batch;
         unsigned long ptr;
         unsigned int nofs_flag;
         int ret;
@@@ -6529,7 -6537,11 +6539,11 @@@
                 goto fail;
         }
   
-       ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);
+       batch.keys = &key[0];
+       batch.data_sizes = &sizes[0];
+       batch.total_data_size = sizes[0] + (name ? sizes[1] : 0);
+       batch.nr = name ? 2 : 1;
+       ret = btrfs_insert_empty_items(trans, root, path, &batch);
         if (ret != 0)
                 goto fail_unlock;
   
@@@ -7963,7 -7975,7 +7977,7 @@@ static int btrfs_dio_iomap_begin(struc
                 iomap->type = IOMAP_MAPPED;
         }
         iomap->offset = start;
-       iomap->bdev = fs_info->fs_devices->latest_bdev;
+       iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
         iomap->length = len;
   
         if (write && btrfs_use_zone_append(BTRFS_I(inode), em->block_start))
@@@ -8040,13 -8052,13 +8054,13 @@@ static void btrfs_dio_private_put(struc
   
         if (btrfs_op(dip->dio_bio) == BTRFS_MAP_WRITE) {
                 __endio_write_update_ordered(BTRFS_I(dip->inode),
-                                            dip->logical_offset,
+                                            dip->file_offset,
                                              dip->bytes,
                                              !dip->dio_bio->bi_status);
         } else {
                 unlock_extent(&BTRFS_I(dip->inode)->io_tree,
-                             dip->logical_offset,
-                             dip->logical_offset + dip->bytes - 1);
+                             dip->file_offset,
+                             dip->file_offset + dip->bytes - 1);
         }
   
         bio_endio(dip->dio_bio);
@@@ -8074,10 -8086,11 +8088,11 @@@ static blk_status_t submit_dio_repair_b
         return ret;
   }
   
- static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
-                                            struct btrfs_io_bio *io_bio,
+ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
+                                            struct btrfs_bio *bbio,
                                              const bool uptodate)
   {
+       struct inode *inode = dip->inode;
         struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
         const u32 sectorsize = fs_info->sectorsize;
         struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
@@@ -8085,11 -8098,12 +8100,12 @@@
         const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
         struct bio_vec bvec;
         struct bvec_iter iter;
-       u64 start = io_bio->logical;
+       const u64 orig_file_offset = dip->file_offset;
+       u64 start = orig_file_offset;
         u32 bio_offset = 0;
         blk_status_t err = BLK_STS_OK;
   
-       __bio_for_each_segment(bvec, &io_bio->bio, iter, io_bio->iter) {
+       __bio_for_each_segment(bvec, &bbio->bio, iter, bbio->iter) {
                 unsigned int i, nr_sectors, pgoff;
   
                 nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len);
@@@ -8097,7 -8111,7 +8113,7 @@@
                 for (i = 0; i < nr_sectors; i++) {
                         ASSERT(pgoff < PAGE_SIZE);
                         if (uptodate &&
-                           (!csum || !check_data_csum(inode, io_bio,
+                           (!csum || !check_data_csum(inode, bbio,
                                                        bio_offset, bvec.bv_page,
                                                        pgoff, start))) {
                                 clean_io_failure(fs_info, failure_tree, io_tree,
@@@ -8107,12 -8121,12 +8123,12 @@@
                         } else {
                                 int ret;
   
-                               ASSERT((start - io_bio->logical) < UINT_MAX);
+                               ASSERT((start - orig_file_offset) < UINT_MAX);
                                 ret = btrfs_repair_one_sector(inode,
-                                               &io_bio->bio,
-                                               start - io_bio->logical,
+                                               &bbio->bio,
+                                               start - orig_file_offset,
                                                 bvec.bv_page, pgoff,
-                                               start, io_bio->mirror_num,
+                                               start, bbio->mirror_num,
                                                 submit_dio_repair_bio);
                                 if (ret)
                                         err = errno_to_blk_status(ret);
@@@ -8153,15 -8167,13 +8169,13 @@@ static void btrfs_end_dio_bio(struct bi
                            bio->bi_opf, bio->bi_iter.bi_sector,
                            bio->bi_iter.bi_size, err);
   
-       if (bio_op(bio) == REQ_OP_READ) {
-               err = btrfs_check_read_dio_bio(dip->inode, btrfs_io_bio(bio),
-                                              !err);
-       }
+       if (bio_op(bio) == REQ_OP_READ)
+               err = btrfs_check_read_dio_bio(dip, btrfs_bio(bio), !err);
   
         if (err)
                 dip->dio_bio->bi_status = err;
   
-       btrfs_record_physical_zoned(dip->inode, dip->logical_offset, bio);
+       btrfs_record_physical_zoned(dip->inode, dip->file_offset, bio);
   
         bio_put(bio);
         btrfs_dio_private_put(dip);
@@@ -8203,10 -8215,10 +8217,10 @@@ static inline blk_status_t btrfs_submit
         } else {
                 u64 csum_offset;
   
-               csum_offset = file_offset - dip->logical_offset;
+               csum_offset = file_offset - dip->file_offset;
                 csum_offset >>= fs_info->sectorsize_bits;
                 csum_offset *= fs_info->csum_size;
-               btrfs_io_bio(bio)->csum = dip->csums + csum_offset;
+               btrfs_bio(bio)->csum = dip->csums + csum_offset;
         }
   map:
         ret = btrfs_map_bio(fs_info, bio, 0);
@@@ -8241,7 -8253,7 +8255,7 @@@ static struct btrfs_dio_private *btrfs_
                 return NULL;
   
         dip->inode = inode;
-       dip->logical_offset = file_offset;
+       dip->file_offset = file_offset;
         dip->bytes = dio_bio->bi_iter.bi_size;
         dip->disk_bytenr = dio_bio->bi_iter.bi_sector << 9;
         dip->dio_bio = dio_bio;
@@@ -8249,7 -8261,7 +8263,7 @@@
         return dip;
   }
   
- -static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter,
+ +static void btrfs_submit_direct(const struct iomap_iter *iter,
                 struct bio *dio_bio, loff_t file_offset)
   {
         struct inode *inode = iter->inode;
@@@ -8279,7 -8291,7 +8293,7 @@@
                 }
                 dio_bio->bi_status = BLK_STS_RESOURCE;
                 bio_endio(dio_bio);
- -              return BLK_QC_T_NONE;
+ +              return;
         }
   
         if (!write) {
@@@ -8322,7 -8334,6 +8336,6 @@@
                 bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len);
                 bio->bi_private = dip;
                 bio->bi_end_io = btrfs_end_dio_bio;
-               btrfs_io_bio(bio)->logical = file_offset;
   
                 if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
                         status = extract_ordered_extent(BTRFS_I(inode), bio,
@@@ -8373,13 -8384,15 +8386,13 @@@
   
                 free_extent_map(em);
         } while (submit_len > 0);
- -      return BLK_QC_T_NONE;
+ +      return;
   
   out_err_em:
         free_extent_map(em);
   out_err:
         dip->dio_bio->bi_status = status;
         btrfs_dio_private_put(dip);
- -
- -      return BLK_QC_T_NONE;
   }
   
   const struct iomap_ops btrfs_dio_iomap_ops = {
@@@ -8696,9 -8709,9 +8709,9 @@@ next
          * did something wrong.
          */
         ASSERT(!PageOrdered(page));
+       btrfs_page_clear_checked(fs_info, page, page_offset(page), PAGE_SIZE);
         if (!inode_evicting)
                 __btrfs_releasepage(page, GFP_NOFS);
-       ClearPageChecked(page);
         clear_page_extent_mapped(page);
   }
   
@@@ -8842,7 -8855,7 +8855,7 @@@ again
                 memzero_page(page, zero_start, PAGE_SIZE - zero_start);
                 flush_dcache_page(page);
         }
-       ClearPageChecked(page);
+       btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
         btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start);
         btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start);
   
@@@ -9152,8 -9165,10 +9165,10 @@@ void btrfs_destroy_inode(struct inode *
         WARN_ON(inode->block_rsv.reserved);
         WARN_ON(inode->block_rsv.size);
         WARN_ON(inode->outstanding_extents);
-       WARN_ON(inode->delalloc_bytes);
-       WARN_ON(inode->new_delalloc_bytes);
+       if (!S_ISDIR(vfs_inode->i_mode)) {
+               WARN_ON(inode->delalloc_bytes);
+               WARN_ON(inode->new_delalloc_bytes);
+       }
         WARN_ON(inode->csum_bytes);
         WARN_ON(inode->defrag_bytes);
   
@@@ -9450,7 -9465,7 +9465,7 @@@ static int btrfs_rename_exchange(struc
         if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
                 ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
         } else { /* src is an inode */
-               ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
+               ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
                                            BTRFS_I(old_dentry->d_inode),
                                            old_dentry->d_name.name,
                                            old_dentry->d_name.len);
@@@ -9466,7 -9481,7 +9481,7 @@@
         if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
                 ret = btrfs_unlink_subvol(trans, new_dir, new_dentry);
         } else { /* dest is an inode */
-               ret = __btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir),
+               ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir),
                                            BTRFS_I(new_dentry->d_inode),
                                            new_dentry->d_name.name,
                                            new_dentry->d_name.len);
@@@ -9741,7 -9756,7 +9756,7 @@@ static int btrfs_rename(struct user_nam
                  */
                 btrfs_pin_log_trans(root);
                 log_pinned = true;
-               ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
+               ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
                                         BTRFS_I(d_inode(old_dentry)),
                                         old_dentry->d_name.name,
                                         old_dentry->d_name.len);
@@@ -9761,7 -9776,7 +9776,7 @@@
                         ret = btrfs_unlink_subvol(trans, new_dir, new_dentry);
                         BUG_ON(new_inode->i_nlink == 0);
                 } else {
-                       ret = btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir),
+                       ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir),
                                                  BTRFS_I(d_inode(new_dentry)),
                                                  new_dentry->d_name.name,
                                                  new_dentry->d_name.len);
@@@ -9979,7 -9994,7 +9994,7 @@@ int btrfs_start_delalloc_snapshot(struc
         };
         struct btrfs_fs_info *fs_info = root->fs_info;
   
-       if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+       if (BTRFS_FS_ERROR(fs_info))
                 return -EROFS;
   
         return start_delalloc_inodes(root, &wbc, true, in_reclaim_context);
@@@ -9998,7 -10013,7 +10013,7 @@@ int btrfs_start_delalloc_roots(struct b
         struct list_head splice;
         int ret;
   
-       if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+       if (BTRFS_FS_ERROR(fs_info))
                 return -EROFS;
   
         INIT_LIST_HEAD(&splice);
diff --combined fs/btrfs/ioctl.c

index 36ff713,92424a2..02ff085
--- 1/fs/btrfs/ioctl.c
--- 2/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@@ -48,6 -48,7 +48,7 @@@
   #include "space-info.h"
   #include "delalloc-space.h"
   #include "block-group.h"
+ #include "subpage.h"
   
   #ifdef CONFIG_64BIT
   /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
@@@ -81,7 -82,8 +82,8 @@@ struct btrfs_ioctl_send_args_32 
         compat_uptr_t clone_sources;    /* in */
         __u64 parent_root;              /* in */
         __u64 flags;                    /* in */
-       __u64 reserved[4];              /* in */
+       __u32 version;                  /* in */
+       __u8  reserved[28];             /* in */
   } __attribute__ ((__packed__));
   
   #define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \
@@@ -985,129 -987,32 +987,32 @@@ out
         return ret;
   }
   
- /*
-  * When we're defragging a range, we don't want to kick it off again
-  * if it is really just waiting for delalloc to send it down.
-  * If we find a nice big extent or delalloc range for the bytes in the
-  * file you want to defrag, we return 0 to let you know to skip this
-  * part of the file
-  */
- static int check_defrag_in_cache(struct inode *inode, u64 offset, u32 thresh)
- {
-       struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
-       struct extent_map *em = NULL;
-       struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
-       u64 end;
- 
-       read_lock(&em_tree->lock);
-       em = lookup_extent_mapping(em_tree, offset, PAGE_SIZE);
-       read_unlock(&em_tree->lock);
- 
-       if (em) {
-               end = extent_map_end(em);
-               free_extent_map(em);
-               if (end - offset > thresh)
-                       return 0;
-       }
-       /* if we already have a nice delalloc here, just stop */
-       thresh /= 2;
-       end = count_range_bits(io_tree, &offset, offset + thresh,
-                              thresh, EXTENT_DELALLOC, 1);
-       if (end >= thresh)
-               return 0;
-       return 1;
- }
- 
- /*
-  * helper function to walk through a file and find extents
-  * newer than a specific transid, and smaller than thresh.
-  *
-  * This is used by the defragging code to find new and small
-  * extents
-  */
- static int find_new_extents(struct btrfs_root *root,
-                           struct inode *inode, u64 newer_than,
-                           u64 *off, u32 thresh)
- {
-       struct btrfs_path *path;
-       struct btrfs_key min_key;
-       struct extent_buffer *leaf;
-       struct btrfs_file_extent_item *extent;
-       int type;
-       int ret;
-       u64 ino = btrfs_ino(BTRFS_I(inode));
- 
-       path = btrfs_alloc_path();
-       if (!path)
-               return -ENOMEM;
- 
-       min_key.objectid = ino;
-       min_key.type = BTRFS_EXTENT_DATA_KEY;
-       min_key.offset = *off;
- 
-       while (1) {
-               ret = btrfs_search_forward(root, &min_key, path, newer_than);
-               if (ret != 0)
-                       goto none;
- process_slot:
-               if (min_key.objectid != ino)
-                       goto none;
-               if (min_key.type != BTRFS_EXTENT_DATA_KEY)
-                       goto none;
- 
-               leaf = path->nodes[0];
-               extent = btrfs_item_ptr(leaf, path->slots[0],
-                                       struct btrfs_file_extent_item);
- 
-               type = btrfs_file_extent_type(leaf, extent);
-               if (type == BTRFS_FILE_EXTENT_REG &&
-                   btrfs_file_extent_num_bytes(leaf, extent) < thresh &&
-                   check_defrag_in_cache(inode, min_key.offset, thresh)) {
-                       *off = min_key.offset;
-                       btrfs_free_path(path);
-                       return 0;
-               }
- 
-               path->slots[0]++;
-               if (path->slots[0] < btrfs_header_nritems(leaf)) {
-                       btrfs_item_key_to_cpu(leaf, &min_key, path->slots[0]);
-                       goto process_slot;
-               }
- 
-               if (min_key.offset == (u64)-1)
-                       goto none;
- 
-               min_key.offset++;
-               btrfs_release_path(path);
-       }
- none:
-       btrfs_free_path(path);
-       return -ENOENT;
- }
- 
- static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
+ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
+                                              bool locked)
   {
         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
         struct extent_map *em;
-       u64 len = PAGE_SIZE;
+       const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize;
   
         /*
          * hopefully we have this extent in the tree already, try without
          * the full extent lock
          */
         read_lock(&em_tree->lock);
-       em = lookup_extent_mapping(em_tree, start, len);
+       em = lookup_extent_mapping(em_tree, start, sectorsize);
         read_unlock(&em_tree->lock);
   
         if (!em) {
                 struct extent_state *cached = NULL;
-               u64 end = start + len - 1;
+               u64 end = start + sectorsize - 1;
   
                 /* get the big lock and read metadata off disk */
-               lock_extent_bits(io_tree, start, end, &cached);
-               em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
-               unlock_extent_cached(io_tree, start, end, &cached);
+               if (!locked)
+                       lock_extent_bits(io_tree, start, end, &cached);
+               em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, sectorsize);
+               if (!locked)
+                       unlock_extent_cached(io_tree, start, end, &cached);
   
                 if (IS_ERR(em))
                         return NULL;
@@@ -1116,7 -1021,8 +1021,8 @@@
         return em;
   }
   
- static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
+ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
+                                    bool locked)
   {
         struct extent_map *next;
         bool ret = true;
@@@ -1125,7 -1031,7 +1031,7 @@@
         if (em->start + em->len >= i_size_read(inode))
                 return false;
   
-       next = defrag_lookup_extent(inode, em->start + em->len);
+       next = defrag_lookup_extent(inode, em->start + em->len, locked);
         if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
                 ret = false;
         else if ((em->block_start + em->block_len == next->block_start) &&
@@@ -1136,297 -1042,435 +1042,435 @@@
         return ret;
   }
   
- static int should_defrag_range(struct inode *inode, u64 start, u32 thresh,
-                              u64 *last_len, u64 *skip, u64 *defrag_end,
-                              int compress)
+ /*
+  * Prepare one page to be defragged.
+  *
+  * This will ensure:
+  *
+  * - Returned page is locked and has been set up properly.
+  * - No ordered extent exists in the page.
+  * - The page is uptodate.
+  *
+  * NOTE: Caller should also wait for page writeback after the cluster is
+  * prepared; this function does not wait for writeback on each page.
+  */
+ static struct page *defrag_prepare_one_page(struct btrfs_inode *inode,
+                                           pgoff_t index)
   {
-       struct extent_map *em;
-       int ret = 1;
-       bool next_mergeable = true;
-       bool prev_mergeable = true;
+       struct address_space *mapping = inode->vfs_inode.i_mapping;
+       gfp_t mask = btrfs_alloc_write_mask(mapping);
+       u64 page_start = (u64)index << PAGE_SHIFT;
+       u64 page_end = page_start + PAGE_SIZE - 1;
+       struct extent_state *cached_state = NULL;
+       struct page *page;
+       int ret;
+ 
+ again:
+       page = find_or_create_page(mapping, index, mask);
+       if (!page)
+               return ERR_PTR(-ENOMEM);
   
         /*
-        * make sure that once we start defragging an extent, we keep on
-        * defragging it
+        * Since we can defragment files opened read-only, we can encounter
+        * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We
+        * can't do I/O using huge pages yet, so return an error for now.
+        * Filesystem transparent huge pages are typically only used for
+        * executables that explicitly enable them, so this isn't very
+        * restrictive.
          */
-       if (start < *defrag_end)
-               return 1;
+       if (PageCompound(page)) {
+               unlock_page(page);
+               put_page(page);
+               return ERR_PTR(-ETXTBSY);
+       }
   
-       *skip = 0;
+       ret = set_page_extent_mapped(page);
+       if (ret < 0) {
+               unlock_page(page);
+               put_page(page);
+               return ERR_PTR(ret);
+       }
   
-       em = defrag_lookup_extent(inode, start);
-       if (!em)
-               return 0;
+       /* Wait for any existing ordered extent in the range */
+       while (1) {
+               struct btrfs_ordered_extent *ordered;
   
-       /* this will cover holes, and inline extents */
-       if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
-               ret = 0;
-               goto out;
-       }
+               lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state);
+               ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
+               unlock_extent_cached(&inode->io_tree, page_start, page_end,
+                                    &cached_state);
+               if (!ordered)
+                       break;
   
-       if (!*defrag_end)
-               prev_mergeable = false;
+               unlock_page(page);
+               btrfs_start_ordered_extent(ordered, 1);
+               btrfs_put_ordered_extent(ordered);
+               lock_page(page);
+               /*
+                * We unlocked the page above, so we need to check whether it
+                * was released.
+                */
+               if (page->mapping != mapping || !PagePrivate(page)) {
+                       unlock_page(page);
+                       put_page(page);
+                       goto again;
+               }
+       }
   
-       next_mergeable = defrag_check_next_extent(inode, em);
-       /*
-        * we hit a real extent, if it is big or the next extent is not a
-        * real extent, don't bother defragging it
-        */
-       if (!compress && (*last_len == 0 || *last_len >= thresh) &&
-           (em->len >= thresh || (!next_mergeable && !prev_mergeable)))
-               ret = 0;
- out:
         /*
-        * last_len ends up being a counter of how many bytes we've defragged.
-        * every time we choose not to defrag an extent, we reset *last_len
-        * so that the next tiny extent will force a defrag.
-        *
-        * The end result of this is that tiny extents before a single big
-        * extent will force at least part of that big extent to be defragged.
+        * Now the page range has no ordered extent any more.  Read the page to
+        * make it uptodate.
          */
-       if (ret) {
-               *defrag_end = extent_map_end(em);
-       } else {
-               *last_len = 0;
-               *skip = extent_map_end(em);
-               *defrag_end = 0;
+       if (!PageUptodate(page)) {
+               btrfs_readpage(NULL, page);
+               lock_page(page);
+               if (page->mapping != mapping || !PagePrivate(page)) {
+                       unlock_page(page);
+                       put_page(page);
+                       goto again;
+               }
+               if (!PageUptodate(page)) {
+                       unlock_page(page);
+                       put_page(page);
+                       return ERR_PTR(-EIO);
+               }
         }
- 
-       free_extent_map(em);
-       return ret;
+       return page;
   }
   
+ struct defrag_target_range {
+       struct list_head list;
+       u64 start;
+       u64 len;
+ };
+ 
   /*
-  * it doesn't do much good to defrag one or two pages
-  * at a time.  This pulls in a nice chunk of pages
-  * to COW and defrag.
-  *
-  * It also makes sure the delalloc code has enough
-  * dirty data to avoid making new small extents as part
-  * of the defrag
+  * Collect all valid target extents.
    *
-  * It's a good idea to start RA on this range
-  * before calling this.
+  * @start:       file offset to lookup
+  * @len:         length to lookup
+  * @extent_thresh: file extent size threshold, any extent size >= this value
+  *               will be ignored
+  * @newer_than:    only defrag extents newer than this value
+  * @do_compress:   whether the defrag is doing compression
+  *               if true, @extent_thresh will be ignored and all regular
+  *               file extents meeting @newer_than will be targets.
+  * @locked:      whether the extent lock is already held for the range
+  * @target_list:   list of target file extents
    */
- static int cluster_pages_for_defrag(struct inode *inode,
-                                   struct page **pages,
-                                   unsigned long start_index,
-                                   unsigned long num_pages)
+ static int defrag_collect_targets(struct btrfs_inode *inode,
+                                 u64 start, u64 len, u32 extent_thresh,
+                                 u64 newer_than, bool do_compress,
+                                 bool locked, struct list_head *target_list)
   {
-       unsigned long file_end;
-       u64 isize = i_size_read(inode);
-       u64 page_start;
-       u64 page_end;
-       u64 page_cnt;
-       u64 start = (u64)start_index << PAGE_SHIFT;
-       u64 search_start;
-       int ret;
-       int i;
-       int i_done;
-       struct btrfs_ordered_extent *ordered;
-       struct extent_state *cached_state = NULL;
-       struct extent_io_tree *tree;
-       struct extent_changeset *data_reserved = NULL;
-       gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
+       u64 cur = start;
+       int ret = 0;
   
-       file_end = (isize - 1) >> PAGE_SHIFT;
-       if (!isize || start_index > file_end)
-               return 0;
+       while (cur < start + len) {
+               struct extent_map *em;
+               struct defrag_target_range *new;
+               bool next_mergeable = true;
+               u64 range_len;
   
-       page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
+               em = defrag_lookup_extent(&inode->vfs_inode, cur, locked);
+               if (!em)
+                       break;
   
-       ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
-                       start, page_cnt << PAGE_SHIFT);
-       if (ret)
-               return ret;
-       i_done = 0;
-       tree = &BTRFS_I(inode)->io_tree;
+               /* Skip hole/inline/preallocated extents */
+               if (em->block_start >= EXTENT_MAP_LAST_BYTE ||
+                   test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+                       goto next;
   
-       /* step one, lock all the pages */
-       for (i = 0; i < page_cnt; i++) {
-               struct page *page;
- again:
-               page = find_or_create_page(inode->i_mapping,
-                                          start_index + i, mask);
-               if (!page)
-                       break;
+               /* Skip older extent */
+               if (em->generation < newer_than)
+                       goto next;
   
-               ret = set_page_extent_mapped(page);
-               if (ret < 0) {
-                       unlock_page(page);
-                       put_page(page);
-                       break;
+               /*
+                * For do_compress case, we want to compress all valid file
+                * extents, thus no @extent_thresh or mergeable check.
+                */
+               if (do_compress)
+                       goto add;
+ 
+               /* Skip too large extent */
+               if (em->len >= extent_thresh)
+                       goto next;
+ 
+               next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
+                                                         locked);
+               if (!next_mergeable) {
+                       struct defrag_target_range *last;
+ 
+                       /* Empty target list, no way to merge with last entry */
+                       if (list_empty(target_list))
+                               goto next;
+                       last = list_entry(target_list->prev,
+                                         struct defrag_target_range, list);
+                       /* Not mergeable with last entry */
+                       if (last->start + last->len != cur)
+                               goto next;
+ 
+                       /* Mergeable, fall through to add it to @target_list. */
                 }
   
-               page_start = page_offset(page);
-               page_end = page_start + PAGE_SIZE - 1;
-               while (1) {
-                       lock_extent_bits(tree, page_start, page_end,
-                                        &cached_state);
-                       ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode),
-                                                             page_start);
-                       unlock_extent_cached(tree, page_start, page_end,
-                                            &cached_state);
-                       if (!ordered)
-                               break;
- 
-                       unlock_page(page);
-                       btrfs_start_ordered_extent(ordered, 1);
-                       btrfs_put_ordered_extent(ordered);
-                       lock_page(page);
-                       /*
-                        * we unlocked the page above, so we need check if
-                        * it was released or not.
-                        */
-                       if (page->mapping != inode->i_mapping) {
-                               unlock_page(page);
-                               put_page(page);
-                               goto again;
+ add:
+               range_len = min(extent_map_end(em), start + len) - cur;
+               /*
+                * This one is a good target, check if it can be merged into
+                * last range of the target list.
+                */
+               if (!list_empty(target_list)) {
+                       struct defrag_target_range *last;
+ 
+                       last = list_entry(target_list->prev,
+                                         struct defrag_target_range, list);
+                       ASSERT(last->start + last->len <= cur);
+                       if (last->start + last->len == cur) {
+                               /* Mergeable, enlarge the last entry */
+                               last->len += range_len;
+                               goto next;
                         }
+                       /* Fall through to allocate a new entry */
                 }
   
-               if (!PageUptodate(page)) {
-                       btrfs_readpage(NULL, page);
-                       lock_page(page);
-                       if (!PageUptodate(page)) {
-                               unlock_page(page);
-                               put_page(page);
-                               ret = -EIO;
-                               break;
-                       }
+               /* Allocate new defrag_target_range */
+               new = kmalloc(sizeof(*new), GFP_NOFS);
+               if (!new) {
+                       free_extent_map(em);
+                       ret = -ENOMEM;
+                       break;
                 }
+               new->start = cur;
+               new->len = range_len;
+               list_add_tail(&new->list, target_list);
   
-               if (page->mapping != inode->i_mapping) {
-                       unlock_page(page);
-                       put_page(page);
-                       goto again;
+ next:
+               cur = extent_map_end(em);
+               free_extent_map(em);
+       }
+       if (ret < 0) {
+               struct defrag_target_range *entry;
+               struct defrag_target_range *tmp;
+ 
+               list_for_each_entry_safe(entry, tmp, target_list, list) {
+                       list_del_init(&entry->list);
+                       kfree(entry);
                 }
+       }
+       return ret;
+ }
+ 
+ #define CLUSTER_SIZE  (SZ_256K)
+ 
+ /*
+  * Defrag one contiguous target range.
+  *
+  * @inode:    target inode
+  * @target:   target range to defrag
+  * @pages:    locked pages covering the defrag range
+  * @nr_pages: number of locked pages
+  *
+  * Caller should ensure:
+  *
+  * - Pages are prepared
+  *   Pages should be locked, no ordered extent in the pages range,
+  *   no writeback.
+  *
+  * - Extent bits are locked
+  */
+ static int defrag_one_locked_target(struct btrfs_inode *inode,
+                                   struct defrag_target_range *target,
+                                   struct page **pages, int nr_pages,
+                                   struct extent_state **cached_state)
+ {
+       struct btrfs_fs_info *fs_info = inode->root->fs_info;
+       struct extent_changeset *data_reserved = NULL;
+       const u64 start = target->start;
+       const u64 len = target->len;
+       unsigned long last_index = (start + len - 1) >> PAGE_SHIFT;
+       unsigned long start_index = start >> PAGE_SHIFT;
+       unsigned long first_index = page_index(pages[0]);
+       int ret = 0;
+       int i;
+ 
+       ASSERT(last_index - first_index + 1 <= nr_pages);
+ 
+       ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
+       if (ret < 0)
+               return ret;
+       clear_extent_bit(&inode->io_tree, start, start + len - 1,
+                        EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+                        EXTENT_DEFRAG, 0, 0, cached_state);
+       set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state);
   
-               pages[i] = page;
-               i_done++;
+       /* Update the page status */
+       for (i = start_index - first_index; i <= last_index - first_index; i++) {
+               ClearPageChecked(pages[i]);
+               btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len);
         }
-       if (!i_done || ret)
-               goto out;
+       btrfs_delalloc_release_extents(inode, len);
+       extent_changeset_free(data_reserved);
   
-       if (!(inode->i_sb->s_flags & SB_ACTIVE))
-               goto out;
+       return ret;
+ }
   
-       /*
-        * so now we have a nice long stream of locked
-        * and up to date pages, lets wait on them
-        */
-       for (i = 0; i < i_done; i++)
-               wait_on_page_writeback(pages[i]);
+ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
+                           u32 extent_thresh, u64 newer_than, bool do_compress)
+ {
+       struct extent_state *cached_state = NULL;
+       struct defrag_target_range *entry;
+       struct defrag_target_range *tmp;
+       LIST_HEAD(target_list);
+       struct page **pages;
+       const u32 sectorsize = inode->root->fs_info->sectorsize;
+       u64 last_index = (start + len - 1) >> PAGE_SHIFT;
+       u64 start_index = start >> PAGE_SHIFT;
+       unsigned int nr_pages = last_index - start_index + 1;
+       int ret = 0;
+       int i;
   
-       page_start = page_offset(pages[0]);
-       page_end = page_offset(pages[i_done - 1]) + PAGE_SIZE;
+       ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE);
+       ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize));
   
-       lock_extent_bits(&BTRFS_I(inode)->io_tree,
-                        page_start, page_end - 1, &cached_state);
+       pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
+       if (!pages)
+               return -ENOMEM;
   
+       /* Prepare all pages */
+       for (i = 0; i < nr_pages; i++) {
+               pages[i] = defrag_prepare_one_page(inode, start_index + i);
+               if (IS_ERR(pages[i])) {
+                       ret = PTR_ERR(pages[i]);
+                       pages[i] = NULL;
+                       goto free_pages;
+               }
+       }
+       for (i = 0; i < nr_pages; i++)
+               wait_on_page_writeback(pages[i]);
+ 
+       /* Lock the pages range */
+       lock_extent_bits(&inode->io_tree, start_index << PAGE_SHIFT,
+                        (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
+                        &cached_state);
         /*
-        * When defragmenting we skip ranges that have holes or inline extents,
-        * (check should_defrag_range()), to avoid unnecessary IO and wasting
-        * space. At btrfs_defrag_file(), we check if a range should be defragged
-        * before locking the inode and then, if it should, we trigger a sync
-        * page cache readahead - we lock the inode only after that to avoid
-        * blocking for too long other tasks that possibly want to operate on
-        * other file ranges. But before we were able to get the inode lock,
-        * some other task may have punched a hole in the range, or we may have
-        * now an inline extent, in which case we should not defrag. So check
-        * for that here, where we have the inode and the range locked, and bail
-        * out if that happened.
+        * Now we have a consistent view about the extent map, re-check
+        * which range really needs to be defragged.
+        *
+        * And this time we have extent locked already, pass @locked = true
+        * so that we won't relock the extent range and cause deadlock.
          */
-       search_start = page_start;
-       while (search_start < page_end) {
-               struct extent_map *em;
+       ret = defrag_collect_targets(inode, start, len, extent_thresh,
+                                    newer_than, do_compress, true,
+                                    &target_list);
+       if (ret < 0)
+               goto unlock_extent;
   
-               em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, search_start,
-                                     page_end - search_start);
-               if (IS_ERR(em)) {
-                       ret = PTR_ERR(em);
-                       goto out_unlock_range;
-               }
-               if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
-                       free_extent_map(em);
-                       /* Ok, 0 means we did not defrag anything */
-                       ret = 0;
-                       goto out_unlock_range;
+       list_for_each_entry(entry, &target_list, list) {
+               ret = defrag_one_locked_target(inode, entry, pages, nr_pages,
+                                              &cached_state);
+               if (ret < 0)
+                       break;
+       }
+ 
+       list_for_each_entry_safe(entry, tmp, &target_list, list) {
+               list_del_init(&entry->list);
+               kfree(entry);
+       }
+ unlock_extent:
+       unlock_extent_cached(&inode->io_tree, start_index << PAGE_SHIFT,
+                            (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
+                            &cached_state);
+ free_pages:
+       for (i = 0; i < nr_pages; i++) {
+               if (pages[i]) {
+                       unlock_page(pages[i]);
+                       put_page(pages[i]);
                 }
-               search_start = extent_map_end(em);
-               free_extent_map(em);
         }
+       kfree(pages);
+       return ret;
+ }
   
-       clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
-                         page_end - 1, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
-                         EXTENT_DEFRAG, 0, 0, &cached_state);
+ static int defrag_one_cluster(struct btrfs_inode *inode,
+                             struct file_ra_state *ra,
+                             u64 start, u32 len, u32 extent_thresh,
+                             u64 newer_than, bool do_compress,
+                             unsigned long *sectors_defragged,
+                             unsigned long max_sectors)
+ {
+       const u32 sectorsize = inode->root->fs_info->sectorsize;
+       struct defrag_target_range *entry;
+       struct defrag_target_range *tmp;
+       LIST_HEAD(target_list);
+       int ret;
   
-       if (i_done != page_cnt) {
-               spin_lock(&BTRFS_I(inode)->lock);
-               btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
-               spin_unlock(&BTRFS_I(inode)->lock);
-               btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
-                               start, (page_cnt - i_done) << PAGE_SHIFT, true);
-       }
+       BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
+       ret = defrag_collect_targets(inode, start, len, extent_thresh,
+                                    newer_than, do_compress, false,
+                                    &target_list);
+       if (ret < 0)
+               goto out;
   
+       list_for_each_entry(entry, &target_list, list) {
+               u32 range_len = entry->len;
   
-       set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1,
-                         &cached_state);
+               /* Reached the limit */
+               if (max_sectors && max_sectors == *sectors_defragged)
+                       break;
   
-       unlock_extent_cached(&BTRFS_I(inode)->io_tree,
-                            page_start, page_end - 1, &cached_state);
+               if (max_sectors)
+                       range_len = min_t(u32, range_len,
+                               (max_sectors - *sectors_defragged) * sectorsize);
   
-       for (i = 0; i < i_done; i++) {
-               clear_page_dirty_for_io(pages[i]);
-               ClearPageChecked(pages[i]);
-               set_page_dirty(pages[i]);
-               unlock_page(pages[i]);
-               put_page(pages[i]);
+               if (ra)
+                       page_cache_sync_readahead(inode->vfs_inode.i_mapping,
+                               ra, NULL, entry->start >> PAGE_SHIFT,
+                               ((entry->start + range_len - 1) >> PAGE_SHIFT) -
+                               (entry->start >> PAGE_SHIFT) + 1);
+               /*
+                * Here we may not defrag any range if holes are punched before
+                * we locked the pages.
+                * But that's fine, it only affects the @sectors_defragged
+                * accounting.
+                */
+               ret = defrag_one_range(inode, entry->start, range_len,
+                                      extent_thresh, newer_than, do_compress);
+               if (ret < 0)
+                       break;
+               *sectors_defragged += range_len;
         }
-       btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
-       extent_changeset_free(data_reserved);
-       return i_done;
- 
- out_unlock_range:
-       unlock_extent_cached(&BTRFS_I(inode)->io_tree,
-                            page_start, page_end - 1, &cached_state);
   out:
-       for (i = 0; i < i_done; i++) {
-               unlock_page(pages[i]);
-               put_page(pages[i]);
+       list_for_each_entry_safe(entry, tmp, &target_list, list) {
+               list_del_init(&entry->list);
+               kfree(entry);
         }
-       btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
-                       start, page_cnt << PAGE_SHIFT, true);
-       btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
-       extent_changeset_free(data_reserved);
         return ret;
- 
   }
   
- int btrfs_defrag_file(struct inode *inode, struct file *file,
+ /*
+  * Entry point to file defragmentation.
+  *
+  * @inode:       inode to be defragged
+  * @ra:                  readahead state (can be NULL)
+  * @range:       defrag options including range and flags
+  * @newer_than:          minimum transid to defrag
+  * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode
+  *               will be defragged.
+  */
+ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
                       struct btrfs_ioctl_defrag_range_args *range,
                       u64 newer_than, unsigned long max_to_defrag)
   {
         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-       struct btrfs_root *root = BTRFS_I(inode)->root;
-       struct file_ra_state *ra = NULL;
-       unsigned long last_index;
+       unsigned long sectors_defragged = 0;
         u64 isize = i_size_read(inode);
-       u64 last_len = 0;
-       u64 skip = 0;
-       u64 defrag_end = 0;
-       u64 newer_off = range->start;
-       unsigned long i;
-       unsigned long ra_index = 0;
-       int ret;
-       int defrag_count = 0;
+       u64 cur;
+       u64 last_byte;
+       bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS;
+       bool ra_allocated = false;
         int compress_type = BTRFS_COMPRESS_ZLIB;
+       int ret = 0;
         u32 extent_thresh = range->extent_thresh;
-       unsigned long max_cluster = SZ_256K >> PAGE_SHIFT;
-       unsigned long cluster = max_cluster;
-       u64 new_align = ~((u64)SZ_128K - 1);
-       struct page **pages = NULL;
-       bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS;
   
         if (isize == 0)
                 return 0;
@@@ -1444,172 -1488,87 +1488,87 @@@
         if (extent_thresh == 0)
                 extent_thresh = SZ_256K;
   
+       if (range->start + range->len > range->start) {
+               /* Got a specific range */
+               last_byte = min(isize, range->start + range->len) - 1;
+       } else {
+               /* Defrag until file end */
+               last_byte = isize - 1;
+       }
+ 
         /*
-        * If we were not given a file, allocate a readahead context. As
+        * If we were not given a ra, allocate a readahead context. As
          * readahead is just an optimization, defrag will work without it so
          * we don't error out.
          */
-       if (!file) {
+       if (!ra) {
+               ra_allocated = true;
                 ra = kzalloc(sizeof(*ra), GFP_KERNEL);
                 if (ra)
                         file_ra_state_init(ra, inode->i_mapping);
-       } else {
-               ra = &file->f_ra;
-       }
- 
-       pages = kmalloc_array(max_cluster, sizeof(struct page *), GFP_KERNEL);
-       if (!pages) {
-               ret = -ENOMEM;
-               goto out_ra;
-       }
- 
-       /* find the last page to defrag */
-       if (range->start + range->len > range->start) {
-               last_index = min_t(u64, isize - 1,
-                        range->start + range->len - 1) >> PAGE_SHIFT;
-       } else {
-               last_index = (isize - 1) >> PAGE_SHIFT;
-       }
- 
-       if (newer_than) {
-               ret = find_new_extents(root, inode, newer_than,
-                                      &newer_off, SZ_64K);
-               if (!ret) {
-                       range->start = newer_off;
-                       /*
-                        * we always align our defrag to help keep
-                        * the extents in the file evenly spaced
-                        */
-                       i = (newer_off & new_align) >> PAGE_SHIFT;
-               } else
-                       goto out_ra;
-       } else {
-               i = range->start >> PAGE_SHIFT;
         }
-       if (!max_to_defrag)
-               max_to_defrag = last_index - i + 1;
   
-       /*
-        * make writeback starts from i, so the defrag range can be
-        * written sequentially.
-        */
-       if (i < inode->i_mapping->writeback_index)
-               inode->i_mapping->writeback_index = i;
- 
-       while (i <= last_index && defrag_count < max_to_defrag &&
-              (i < DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE))) {
-               /*
-                * make sure we stop running if someone unmounts
-                * the FS
-                */
-               if (!(inode->i_sb->s_flags & SB_ACTIVE))
-                       break;
- 
-               if (btrfs_defrag_cancelled(fs_info)) {
-                       btrfs_debug(fs_info, "defrag_file cancelled");
-                       ret = -EAGAIN;
-                       goto error;
-               }
+       /* Align the range */
+       cur = round_down(range->start, fs_info->sectorsize);
+       last_byte = round_up(last_byte, fs_info->sectorsize) - 1;
   
-               if (!should_defrag_range(inode, (u64)i << PAGE_SHIFT,
-                                        extent_thresh, &last_len, &skip,
-                                        &defrag_end, do_compress)){
-                       unsigned long next;
-                       /*
-                        * the should_defrag function tells us how much to skip
-                        * bump our counter by the suggested amount
-                        */
-                       next = DIV_ROUND_UP(skip, PAGE_SIZE);
-                       i = max(i + 1, next);
-                       continue;
-               }
+       while (cur < last_byte) {
+               u64 cluster_end;
   
-               if (!newer_than) {
-                       cluster = (PAGE_ALIGN(defrag_end) >>
-                                  PAGE_SHIFT) - i;
-                       cluster = min(cluster, max_cluster);
-               } else {
-                       cluster = max_cluster;
-               }
+               /* The cluster size 256K should always be page aligned */
+               BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
   
-               if (i + cluster > ra_index) {
-                       ra_index = max(i, ra_index);
-                       if (ra)
-                               page_cache_sync_readahead(inode->i_mapping, ra,
-                                               file, ra_index, cluster);
-                       ra_index += cluster;
-               }
+               /* We want the cluster end at page boundary when possible */
+               cluster_end = (((cur >> PAGE_SHIFT) +
+                              (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
+               cluster_end = min(cluster_end, last_byte);
   
                 btrfs_inode_lock(inode, 0);
                 if (IS_SWAPFILE(inode)) {
                         ret = -ETXTBSY;
-               } else {
-                       if (do_compress)
-                               BTRFS_I(inode)->defrag_compress = compress_type;
-                       ret = cluster_pages_for_defrag(inode, pages, i, cluster);
+                       btrfs_inode_unlock(inode, 0);
+                       break;
                 }
-               if (ret < 0) {
+               if (!(inode->i_sb->s_flags & SB_ACTIVE)) {
                         btrfs_inode_unlock(inode, 0);
-                       goto out_ra;
+                       break;
                 }
- 
-               defrag_count += ret;
-               balance_dirty_pages_ratelimited(inode->i_mapping);
+               if (do_compress)
+                       BTRFS_I(inode)->defrag_compress = compress_type;
+               ret = defrag_one_cluster(BTRFS_I(inode), ra, cur,
+                               cluster_end + 1 - cur, extent_thresh,
+                               newer_than, do_compress,
+                               &sectors_defragged, max_to_defrag);
                 btrfs_inode_unlock(inode, 0);
- 
-               if (newer_than) {
-                       if (newer_off == (u64)-1)
-                               break;
- 
-                       if (ret > 0)
-                               i += ret;
- 
-                       newer_off = max(newer_off + 1,
-                                       (u64)i << PAGE_SHIFT);
- 
-                       ret = find_new_extents(root, inode, newer_than,
-                                              &newer_off, SZ_64K);
-                       if (!ret) {
-                               range->start = newer_off;
-                               i = (newer_off & new_align) >> PAGE_SHIFT;
-                       } else {
-                               break;
-                       }
-               } else {
-                       if (ret > 0) {
-                               i += ret;
-                               last_len += ret << PAGE_SHIFT;
-                       } else {
-                               i++;
-                               last_len = 0;
-                       }
-               }
+               if (ret < 0)
+                       break;
+               cur = cluster_end + 1;
         }
   
-       ret = defrag_count;
- error:
-       if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) {
-               filemap_flush(inode->i_mapping);
-               if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
-                            &BTRFS_I(inode)->runtime_flags))
+       if (ra_allocated)
+               kfree(ra);
+       if (sectors_defragged) {
+               /*
+                * We have defragged some sectors, for compression case they
+                * need to be written back immediately.
+                */
+               if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) {
                         filemap_flush(inode->i_mapping);
+                       if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+                                    &BTRFS_I(inode)->runtime_flags))
+                               filemap_flush(inode->i_mapping);
+               }
+               if (range->compress_type == BTRFS_COMPRESS_LZO)
+                       btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
+               else if (range->compress_type == BTRFS_COMPRESS_ZSTD)
+                       btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
+               ret = sectors_defragged;
         }
- 
-       if (range->compress_type == BTRFS_COMPRESS_LZO) {
-               btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
-       } else if (range->compress_type == BTRFS_COMPRESS_ZSTD) {
-               btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
-       }
- 
- out_ra:
         if (do_compress) {
                 btrfs_inode_lock(inode, 0);
                 BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
                 btrfs_inode_unlock(inode, 0);
         }
-       if (!file)
-               kfree(ra);
-       kfree(pages);
         return ret;
   }
   
@@@ -1658,6 -1617,7 +1617,7 @@@ static int exclop_start_or_cancel_reloc
   static noinline int btrfs_ioctl_resize(struct file *file,
                                         void __user *arg)
   {
+       BTRFS_DEV_LOOKUP_ARGS(args);
         struct inode *inode = file_inode(file);
         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
         u64 new_size;
@@@ -1713,7 -1673,8 +1673,8 @@@
                 btrfs_info(fs_info, "resizing devid %llu", devid);
         }
   
-       device = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
+       args.devid = devid;
+       device = btrfs_find_device(fs_info->fs_devices, &args);
         if (!device) {
                 btrfs_info(fs_info, "resizer unable to find device %llu",
                            devid);
@@@ -1730,7 -1691,7 +1691,7 @@@
         }
   
         if (!strcmp(sizestr, "max"))
- -              new_size = device->bdev->bd_inode->i_size;
+ +              new_size = bdev_nr_bytes(device->bdev);
         else {
                 if (sizestr[0] == '-') {
                         mod = -1;
@@@ -1771,7 -1732,7 +1732,7 @@@
                 ret = -EINVAL;
                 goto out_finish;
         }
- -      if (new_size > device->bdev->bd_inode->i_size) {
+ +      if (new_size > bdev_nr_bytes(device->bdev)) {
                 ret = -EFBIG;
                 goto out_finish;
         }
@@@ -3136,12 -3097,6 +3097,6 @@@ static int btrfs_ioctl_defrag(struct fi
                 goto out;
         }
   
-       /* Subpage defrag will be supported in later commits */
-       if (root->fs_info->sectorsize < PAGE_SIZE) {
-               ret = -ENOTTY;
-               goto out;
-       }
- 
         switch (inode->i_mode & S_IFMT) {
         case S_IFDIR:
                 if (!capable(CAP_SYS_ADMIN)) {
@@@ -3176,7 -3131,7 +3131,7 @@@
                         /* the rest are all set to zero by kzalloc */
                         range.len = (u64)-1;
                 }
-               ret = btrfs_defrag_file(file_inode(file), file,
+               ret = btrfs_defrag_file(file_inode(file), &file->f_ra,
                                         &range, BTRFS_OLDEST_GENERATION, 0);
                 if (ret > 0)
                         ret = 0;
@@@ -3220,6 -3175,7 +3175,7 @@@ out
   
   static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
   {
+       BTRFS_DEV_LOOKUP_ARGS(args);
         struct inode *inode = file_inode(file);
         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
         struct btrfs_ioctl_vol_args_v2 *vol_args;
@@@ -3231,35 -3187,39 +3187,39 @@@
         if (!capable(CAP_SYS_ADMIN))
                 return -EPERM;
   
-       ret = mnt_want_write_file(file);
-       if (ret)
-               return ret;
- 
         vol_args = memdup_user(arg, sizeof(*vol_args));
         if (IS_ERR(vol_args)) {
                 ret = PTR_ERR(vol_args);
-               goto err_drop;
+               goto out;
         }
   
         if (vol_args->flags & ~BTRFS_DEVICE_REMOVE_ARGS_MASK) {
                 ret = -EOPNOTSUPP;
                 goto out;
         }
+ 
         vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
-       if (!(vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) &&
-           strcmp("cancel", vol_args->name) == 0)
+       if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) {
+               args.devid = vol_args->devid;
+       } else if (!strcmp("cancel", vol_args->name)) {
                 cancel = true;
+       } else {
+               ret = btrfs_get_dev_args_from_path(fs_info, &args, vol_args->name);
+               if (ret)
+                       goto out;
+       }
+ 
+       ret = mnt_want_write_file(file);
+       if (ret)
+               goto out;
   
         ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE,
                                            cancel);
         if (ret)
-               goto out;
-       /* Exclusive operation is now claimed */
+               goto err_drop;
   
-       if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID)
-               ret = btrfs_rm_device(fs_info, NULL, vol_args->devid, &bdev, &mode);
-       else
-               ret = btrfs_rm_device(fs_info, vol_args->name, 0, &bdev, &mode);
+       /* Exclusive operation is now claimed */
+       ret = btrfs_rm_device(fs_info, &args, &bdev, &mode);
   
         btrfs_exclop_finish(fs_info);
   
@@@ -3271,17 -3231,19 +3231,19 @@@
                         btrfs_info(fs_info, "device deleted: %s",
                                         vol_args->name);
         }
- out:
-       kfree(vol_args);
   err_drop:
         mnt_drop_write_file(file);
         if (bdev)
                 blkdev_put(bdev, mode);
+ out:
+       btrfs_put_dev_args_from_path(&args);
+       kfree(vol_args);
         return ret;
   }
   
   static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
   {
+       BTRFS_DEV_LOOKUP_ARGS(args);
         struct inode *inode = file_inode(file);
         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
         struct btrfs_ioctl_vol_args *vol_args;
@@@ -3293,32 -3255,38 +3255,38 @@@
         if (!capable(CAP_SYS_ADMIN))
                 return -EPERM;
   
-       ret = mnt_want_write_file(file);
-       if (ret)
-               return ret;
- 
         vol_args = memdup_user(arg, sizeof(*vol_args));
-       if (IS_ERR(vol_args)) {
-               ret = PTR_ERR(vol_args);
-               goto out_drop_write;
-       }
+       if (IS_ERR(vol_args))
+               return PTR_ERR(vol_args);
+ 
         vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
-       cancel = (strcmp("cancel", vol_args->name) == 0);
+       if (!strcmp("cancel", vol_args->name)) {
+               cancel = true;
+       } else {
+               ret = btrfs_get_dev_args_from_path(fs_info, &args, vol_args->name);
+               if (ret)
+                       goto out;
+       }
+ 
+       ret = mnt_want_write_file(file);
+       if (ret)
+               goto out;
   
         ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE,
                                            cancel);
         if (ret == 0) {
-               ret = btrfs_rm_device(fs_info, vol_args->name, 0, &bdev, &mode);
+               ret = btrfs_rm_device(fs_info, &args, &bdev, &mode);
                 if (!ret)
                         btrfs_info(fs_info, "disk deleted %s", vol_args->name);
                 btrfs_exclop_finish(fs_info);
         }
   
-       kfree(vol_args);
- out_drop_write:
         mnt_drop_write_file(file);
         if (bdev)
                 blkdev_put(bdev, mode);
+ out:
+       btrfs_put_dev_args_from_path(&args);
+       kfree(vol_args);
         return ret;
   }
   
@@@ -3379,22 -3347,21 +3347,21 @@@ static long btrfs_ioctl_fs_info(struct 
   static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
                                  void __user *arg)
   {
+       BTRFS_DEV_LOOKUP_ARGS(args);
         struct btrfs_ioctl_dev_info_args *di_args;
         struct btrfs_device *dev;
         int ret = 0;
-       char *s_uuid = NULL;
   
         di_args = memdup_user(arg, sizeof(*di_args));
         if (IS_ERR(di_args))
                 return PTR_ERR(di_args);
   
+       args.devid = di_args->devid;
         if (!btrfs_is_empty_uuid(di_args->uuid))
-               s_uuid = di_args->uuid;
+               args.uuid = di_args->uuid;
   
         rcu_read_lock();
-       dev = btrfs_find_device(fs_info->fs_devices, di_args->devid, s_uuid,
-                               NULL);
- 
+       dev = btrfs_find_device(fs_info->fs_devices, &args);
         if (!dev) {
                 ret = -ENODEV;
                 goto out;
@@@ -4430,7 -4397,6 +4397,6 @@@ static long btrfs_ioctl_quota_rescan_st
                                                 void __user *arg)
   {
         struct btrfs_ioctl_quota_rescan_args qsa = {0};
-       int ret = 0;
   
         if (!capable(CAP_SYS_ADMIN))
                 return -EPERM;
@@@ -4441,9 -4407,9 +4407,9 @@@
         }
   
         if (copy_to_user(arg, &qsa, sizeof(qsa)))
-               ret = -EFAULT;
+               return -EFAULT;
   
-       return ret;
+       return 0;
   }
   
   static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info,
diff --combined fs/btrfs/lzo.c

index 295bbc1,00cffc1..65cb076
--- 1/fs/btrfs/lzo.c
--- 2/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@@ -32,19 -32,19 +32,19 @@@
    *     payload.
    *     One regular LZO compressed extent can have one or more segments.
    *     For inlined LZO compressed extent, only one segment is allowed.
-  *     One segment represents at most one page of uncompressed data.
+  *     One segment represents at most one sector of uncompressed data.
    *
    * 2.1 Segment header
    *     Fixed size. LZO_LEN (4) bytes long, LE32.
    *     Records the total size of the segment (not including the header).
-  *     Segment header never crosses page boundary, thus it's possible to
-  *     have at most 3 padding zeros at the end of the page.
+  *     Segment header never crosses sector boundary, thus it's possible to
+  *     have at most 3 padding zeros at the end of the sector.
    *
    * 2.2 Data Payload
-  *     Variable size. Size up limit should be lzo1x_worst_compress(PAGE_SIZE)
-  *     which is 4419 for a 4KiB page.
+  *     Variable size. Size up limit should be lzo1x_worst_compress(sectorsize)
+  *     which is 4419 for a 4KiB sectorsize.
    *
-  * Example:
+  * Example with 4K sectorsize:
    * Page 1:
    *          0     0x2   0x4   0x6   0x8   0xa   0xc   0xe     0x10
    * 0x0000   |  Header   | SegHdr 01 | Data payload 01 ...     |
@@@ -112,170 -112,161 +112,174 @@@ static inline size_t read_compress_leng
         return le32_to_cpu(dlen);
   }
   
- -      write_compress_length(page_address(cur_page) + offset_in_page(*cur_out),
+ /*
+  * Will do:
+  *
+  * - Write a segment header into the destination
+  * - Copy the compressed buffer into the destination
+  * - Make sure we have enough space in the last sector to fit a segment header
+  *   If not, we will pad at most (LZO_LEN (4)) - 1 bytes of zeros.
+  *
+  * Will allocate new pages when needed.
+  */
+ static int copy_compressed_data_to_page(char *compressed_data,
+                                       size_t compressed_size,
+                                       struct page **out_pages,
+                                       u32 *cur_out,
+                                       const u32 sectorsize)
+ {
+       u32 sector_bytes_left;
+       u32 orig_out;
+       struct page *cur_page;
++      char *kaddr;
+ 
+       /*
+        * We never allow a segment header crossing sector boundary, previous
+        * run should ensure we have enough space left inside the sector.
+        */
+       ASSERT((*cur_out / sectorsize) == (*cur_out + LZO_LEN - 1) / sectorsize);
+ 
+       cur_page = out_pages[*cur_out / PAGE_SIZE];
+       /* Allocate a new page */
+       if (!cur_page) {
+               cur_page = alloc_page(GFP_NOFS);
+               if (!cur_page)
+                       return -ENOMEM;
+               out_pages[*cur_out / PAGE_SIZE] = cur_page;
+       }
+ 
- -              memcpy(page_address(cur_page) + offset_in_page(*cur_out),
++      kaddr = kmap(cur_page);
++      write_compress_length(kaddr + offset_in_page(*cur_out),
+                             compressed_size);
+       *cur_out += LZO_LEN;
+ 
+       orig_out = *cur_out;
+ 
+       /* Copy compressed data */
+       while (*cur_out - orig_out < compressed_size) {
+               u32 copy_len = min_t(u32, sectorsize - *cur_out % sectorsize,
+                                    orig_out + compressed_size - *cur_out);
+ 
++              kunmap(cur_page);
+               cur_page = out_pages[*cur_out / PAGE_SIZE];
+               /* Allocate a new page */
+               if (!cur_page) {
+                       cur_page = alloc_page(GFP_NOFS);
+                       if (!cur_page)
+                               return -ENOMEM;
+                       out_pages[*cur_out / PAGE_SIZE] = cur_page;
+               }
++              kaddr = kmap(cur_page);
+ 
- -              return 0;
++              memcpy(kaddr + offset_in_page(*cur_out),
+                      compressed_data + *cur_out - orig_out, copy_len);
+ 
+               *cur_out += copy_len;
+       }
+ 
+       /*
+        * Check if we can fit the next segment header into the remaining space
+        * of the sector.
+        */
+       sector_bytes_left = round_up(*cur_out, sectorsize) - *cur_out;
+       if (sector_bytes_left >= LZO_LEN || sector_bytes_left == 0)
- -      memset(page_address(cur_page) + offset_in_page(*cur_out), 0,
++              goto out;
+ 
+       /* The remaining size is not enough, pad it with zeros */
++      memset(kaddr + offset_in_page(*cur_out), 0,
+              sector_bytes_left);
+       *cur_out += sector_bytes_left;
++
++out:
++      kunmap(cur_page);
+       return 0;
+ }
+ 
   int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
                 u64 start, struct page **pages, unsigned long *out_pages,
                 unsigned long *total_in, unsigned long *total_out)
   {
         struct workspace *workspace = list_entry(ws, struct workspace, list);
+       const u32 sectorsize = btrfs_sb(mapping->host->i_sb)->sectorsize;
+       struct page *page_in = NULL;
++      char *sizes_ptr;
         int ret = 0;
-       char *data_in;
-       char *cpage_out, *sizes_ptr;
-       int nr_pages = 0;
-       struct page *in_page = NULL;
-       struct page *out_page = NULL;
-       unsigned long bytes_left;
-       unsigned long len = *total_out;
-       unsigned long nr_dest_pages = *out_pages;
-       const unsigned long max_out = nr_dest_pages * PAGE_SIZE;
-       size_t in_len;
-       size_t out_len;
-       char *buf;
-       unsigned long tot_in = 0;
-       unsigned long tot_out = 0;
-       unsigned long pg_bytes_left;
-       unsigned long out_offset;
-       unsigned long bytes;
+       /* Points to the file offset of input data */
+       u64 cur_in = start;
+       /* Points to the current output byte */
+       u32 cur_out = 0;
+       u32 len = *total_out;
   
         *out_pages = 0;
         *total_out = 0;
         *total_in = 0;
   
-       in_page = find_get_page(mapping, start >> PAGE_SHIFT);
-       data_in = kmap(in_page);
- 
         /*
-        * store the size of all chunks of compressed data in
-        * the first 4 bytes
+        * Skip the header for now, we will later come back and write the total
+        * compressed size
          */
-       out_page = alloc_page(GFP_NOFS);
-       if (out_page == NULL) {
-               ret = -ENOMEM;
-               goto out;
-       }
-       cpage_out = kmap(out_page);
-       out_offset = LZO_LEN;
-       tot_out = LZO_LEN;
-       pages[0] = out_page;
-       nr_pages = 1;
-       pg_bytes_left = PAGE_SIZE - LZO_LEN;
- 
-       /* compress at most one page of data each time */
-       in_len = min(len, PAGE_SIZE);
-       while (tot_in < len) {
-               ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf,
-                                      &out_len, workspace->mem);
-               if (ret != LZO_E_OK) {
-                       pr_debug("BTRFS: lzo in loop returned %d\n",
-                              ret);
+       cur_out += LZO_LEN;
+       while (cur_in < start + len) {
++              char *data_in;
+               const u32 sectorsize_mask = sectorsize - 1;
+               u32 sector_off = (cur_in - start) & sectorsize_mask;
+               u32 in_len;
+               size_t out_len;
+ 
+               /* Get the input page first */
+               if (!page_in) {
+                       page_in = find_get_page(mapping, cur_in >> PAGE_SHIFT);
+                       ASSERT(page_in);
+               }
+ 
+               /* Compress at most one sector of data each time */
+               in_len = min_t(u32, start + len - cur_in, sectorsize - sector_off);
+               ASSERT(in_len);
- -              ret = lzo1x_1_compress(page_address(page_in) +
++              data_in = kmap(page_in);
++              ret = lzo1x_1_compress(data_in +
+                                      offset_in_page(cur_in), in_len,
+                                      workspace->cbuf, &out_len,
+                                      workspace->mem);
++              kunmap(page_in);
+               if (ret < 0) {
+                       pr_debug("BTRFS: lzo in loop returned %d\n", ret);
                         ret = -EIO;
                         goto out;
                 }
   
-               /* store the size of this chunk of compressed data */
-               write_compress_length(cpage_out + out_offset, out_len);
-               tot_out += LZO_LEN;
-               out_offset += LZO_LEN;
-               pg_bytes_left -= LZO_LEN;
- 
-               tot_in += in_len;
-               tot_out += out_len;
- 
-               /* copy bytes from the working buffer into the pages */
-               buf = workspace->cbuf;
-               while (out_len) {
-                       bytes = min_t(unsigned long, pg_bytes_left, out_len);
- 
-                       memcpy(cpage_out + out_offset, buf, bytes);
- 
-                       out_len -= bytes;
-                       pg_bytes_left -= bytes;
-                       buf += bytes;
-                       out_offset += bytes;
- 
-                       /*
-                        * we need another page for writing out.
-                        *
-                        * Note if there's less than 4 bytes left, we just
-                        * skip to a new page.
-                        */
-                       if ((out_len == 0 && pg_bytes_left < LZO_LEN) ||
-                           pg_bytes_left == 0) {
-                               if (pg_bytes_left) {
-                                       memset(cpage_out + out_offset, 0,
-                                              pg_bytes_left);
-                                       tot_out += pg_bytes_left;
-                               }
- 
-                               /* we're done, don't allocate new page */
-                               if (out_len == 0 && tot_in >= len)
-                                       break;
- 
-                               kunmap(out_page);
-                               if (nr_pages == nr_dest_pages) {
-                                       out_page = NULL;
-                                       ret = -E2BIG;
-                                       goto out;
-                               }
- 
-                               out_page = alloc_page(GFP_NOFS);
-                               if (out_page == NULL) {
-                                       ret = -ENOMEM;
-                                       goto out;
-                               }
-                               cpage_out = kmap(out_page);
-                               pages[nr_pages++] = out_page;
- 
-                               pg_bytes_left = PAGE_SIZE;
-                               out_offset = 0;
-                       }
-               }
+               ret = copy_compressed_data_to_page(workspace->cbuf, out_len,
+                                                  pages, &cur_out, sectorsize);
+               if (ret < 0)
+                       goto out;
   
-               /* we're making it bigger, give up */
-               if (tot_in > 8192 && tot_in < tot_out) {
+               cur_in += in_len;
+ 
+               /*
+                * Check if we're making it bigger after two sectors.  And if
+                * it is so, give up.
+                */
+               if (cur_in - start > sectorsize * 2 && cur_in - start < cur_out) {
                         ret = -E2BIG;
                         goto out;
                 }
   
-               /* we're all done */
-               if (tot_in >= len)
-                       break;
- 
-               if (tot_out > max_out)
-                       break;
- 
-               bytes_left = len - tot_in;
-               kunmap(in_page);
-               put_page(in_page);
- 
-               start += PAGE_SIZE;
-               in_page = find_get_page(mapping, start >> PAGE_SHIFT);
-               data_in = kmap(in_page);
-               in_len = min(bytes_left, PAGE_SIZE);
-       }
- 
-       if (tot_out >= tot_in) {
-               ret = -E2BIG;
-               goto out;
+               /* Check if we have reached page boundary */
+               if (IS_ALIGNED(cur_in, PAGE_SIZE)) {
+                       put_page(page_in);
+                       page_in = NULL;
+               }
         }
   
-       /* store the size of all chunks of compressed data */
+       /* Store the size of all chunks of compressed data */
- -      write_compress_length(page_address(pages[0]), cur_out);
+ +      sizes_ptr = kmap_local_page(pages[0]);
-       write_compress_length(sizes_ptr, tot_out);
++      write_compress_length(sizes_ptr, cur_out);
+ +      kunmap_local(sizes_ptr);
   
         ret = 0;
-       *total_out = tot_out;
-       *total_in = tot_in;
+       *total_out = cur_out;
+       *total_in = cur_in - start;
   out:
-       *out_pages = nr_pages;
-       if (out_page)
-               kunmap(out_page);
- 
-       if (in_page) {
-               kunmap(in_page);
-               put_page(in_page);
-       }
- 
+       *out_pages = DIV_ROUND_UP(cur_out, PAGE_SIZE);
         return ret;
   }
   
@@@ -290,7 -281,6 +294,7 @@@ static void copy_compressed_segment(str
         u32 orig_in = *cur_in;
   
         while (*cur_in < orig_in + len) {
+ +              char *kaddr;
                 struct page *cur_page;
                 u32 copy_len = min_t(u32, PAGE_SIZE - offset_in_page(*cur_in),
                                           orig_in + len - *cur_in);
@@@ -298,11 -288,9 +302,11 @@@
                 ASSERT(copy_len);
                 cur_page = cb->compressed_pages[*cur_in / PAGE_SIZE];
   
+ +              kaddr = kmap(cur_page);
                 memcpy(dest + *cur_in - orig_in,
- -                      page_address(cur_page) + offset_in_page(*cur_in),
+ +                      kaddr + offset_in_page(*cur_in),
                         copy_len);
+ +              kunmap(cur_page);
   
                 *cur_in += copy_len;
         }
@@@ -313,7 -301,6 +317,7 @@@ int lzo_decompress_bio(struct list_hea
         struct workspace *workspace = list_entry(ws, struct workspace, list);
         const struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
         const u32 sectorsize = fs_info->sectorsize;
+ +      char *kaddr;
         int ret;
         /* Compressed data length, can be unaligned */
         u32 len_in;
@@@ -322,9 -309,7 +326,9 @@@
         /* Bytes decompressed so far */
         u32 cur_out = 0;
   
- -      len_in = read_compress_length(page_address(cb->compressed_pages[0]));
+ +      kaddr = kmap(cb->compressed_pages[0]);
+ +      len_in = read_compress_length(kaddr);
+ +      kunmap(cb->compressed_pages[0]);
         cur_in += LZO_LEN;
   
         /*
@@@ -358,9 -343,8 +362,9 @@@
                        (cur_in + LZO_LEN - 1) / sectorsize);
                 cur_page = cb->compressed_pages[cur_in / PAGE_SIZE];
                 ASSERT(cur_page);
- -              seg_len = read_compress_length(page_address(cur_page) +
- -                                             offset_in_page(cur_in));
+ +              kaddr = kmap(cur_page);
+ +              seg_len = read_compress_length(kaddr + offset_in_page(cur_in));
+ +              kunmap(cur_page);
                 cur_in += LZO_LEN;
   
                 /* Copy the compressed segment payload into workspace */
@@@ -445,7 -429,7 +449,7 @@@ int lzo_decompress(struct list_head *ws
         destlen = min_t(unsigned long, destlen, PAGE_SIZE);
         bytes = min_t(unsigned long, destlen, out_len - start_byte);
   
- -      kaddr = page_address(dest_page);
+ +      kaddr = kmap_local_page(dest_page);
         memcpy(kaddr, workspace->buf + start_byte, bytes);
   
         /*
@@@ -455,7 -439,6 +459,7 @@@
          */
         if (bytes < destlen)
                 memset(kaddr+bytes, 0, destlen-bytes);
+ +      kunmap_local(kaddr);
   out:
         return ret;
   }
diff --combined fs/btrfs/volumes.c

index 9533f35,546bf11..61ac57b
--- 1/fs/btrfs/volumes.c
--- 2/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@@ -14,6 -14,7 +14,7 @@@
   #include <linux/semaphore.h>
   #include <linux/uuid.h>
   #include <linux/list_sort.h>
+ #include <linux/namei.h>
   #include "misc.h"
   #include "ctree.h"
   #include "extent_map.h"
@@@ -250,7 -251,7 +251,7 @@@ static void btrfs_dev_stat_print_on_loa
   static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
                              enum btrfs_map_op op,
                              u64 logical, u64 *length,
-                            struct btrfs_bio **bbio_ret,
+                            struct btrfs_io_context **bioc_ret,
                              int mirror_num, int need_raid_map);
   
   /*
@@@ -508,7 -509,7 +509,7 @@@ btrfs_get_bdev_and_sb(const char *devic
         }
   
         if (flush)
- -              filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
+ +              sync_blockdev(*bdev);
         ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
         if (ret) {
                 blkdev_put(*bdev, flags);
@@@ -812,9 -813,13 +813,13 @@@ static noinline struct btrfs_device *de
   
                 device = NULL;
         } else {
+               struct btrfs_dev_lookup_args args = {
+                       .devid = devid,
+                       .uuid = disk_super->dev_item.uuid,
+               };
+ 
                 mutex_lock(&fs_devices->device_list_mutex);
-               device = btrfs_find_device(fs_devices, devid,
-                               disk_super->dev_item.uuid, NULL);
+               device = btrfs_find_device(fs_devices, &args);
   
                 /*
                  * If this disk has been pulled into an fs devices created by
@@@ -1091,7 -1096,7 +1096,7 @@@ void btrfs_free_extra_devids(struct btr
         list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
                 __btrfs_free_extra_devids(seed_dev, &latest_dev);
   
-       fs_devices->latest_bdev = latest_dev->bdev;
+       fs_devices->latest_dev = latest_dev;
   
         mutex_unlock(&uuid_mutex);
   }
@@@ -1122,8 -1127,10 +1127,10 @@@ static void btrfs_close_one_device(stru
         if (device->devid == BTRFS_DEV_REPLACE_DEVID)
                 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
   
-       if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
+       if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
+               clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
                 fs_devices->missing_devices--;
+       }
   
         btrfs_close_bdev(device);
         if (device->bdev) {
@@@ -1222,7 -1229,7 +1229,7 @@@ static int open_fs_devices(struct btrfs
                 return -EINVAL;
   
         fs_devices->opened = 1;
-       fs_devices->latest_bdev = latest_dev->bdev;
+       fs_devices->latest_dev = latest_dev;
         fs_devices->total_rw_bytes = 0;
         fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
         fs_devices->read_policy = BTRFS_READ_POLICY_PID;
@@@ -1286,7 -1293,7 +1293,7 @@@ static struct btrfs_super_block *btrfs_
         pgoff_t index;
   
         /* make sure our super fits in the device */
- -      if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
+ +      if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev))
                 return ERR_PTR(-EINVAL);
   
         /* make sure our super fits in the page */
@@@ -1843,8 -1850,10 +1850,10 @@@ static int btrfs_add_dev_item(struct bt
         key.type = BTRFS_DEV_ITEM_KEY;
         key.offset = device->devid;
   
+       btrfs_reserve_chunk_metadata(trans, true);
         ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
                                       &key, sizeof(*dev_item));
+       btrfs_trans_release_chunk_metadata(trans);
         if (ret)
                 goto out;
   
@@@ -1882,18 -1891,22 +1891,22 @@@ out
   /*
    * Function to update ctime/mtime for a given device path.
    * Mainly used for ctime/mtime based probe like libblkid.
+  *
+  * We don't care about errors here, this is just to be kind to userspace.
    */
- static void update_dev_time(struct block_device *bdev)
+ static void update_dev_time(const char *device_path)
   {
-       struct inode *inode = bdev->bd_inode;
+       struct path path;
         struct timespec64 now;
+       int ret;
   
-       /* Shouldn't happen but just in case. */
-       if (!inode)
+       ret = kern_path(device_path, LOOKUP_FOLLOW, &path);
+       if (ret)
                 return;
   
-       now = current_time(inode);
-       generic_update_time(inode, &now, S_MTIME | S_CTIME);
+       now = current_time(d_inode(path.dentry));
+       inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME);
+       path_put(&path);
   }
   
   static int btrfs_rm_dev_item(struct btrfs_device *device)
@@@ -1917,7 -1930,9 +1930,9 @@@
         key.type = BTRFS_DEV_ITEM_KEY;
         key.offset = device->devid;
   
+       btrfs_reserve_chunk_metadata(trans, false);
         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+       btrfs_trans_release_chunk_metadata(trans);
         if (ret) {
                 if (ret > 0)
                         ret = -ENOENT;
@@@ -1986,7 -2001,7 +2001,7 @@@ static struct btrfs_device * btrfs_find
   }
   
   /*
-  * Helper function to check if the given device is part of s_bdev / latest_bdev
+  * Helper function to check if the given device is part of s_bdev / latest_dev
    * and replace it with the provided or the next active device, in the context
    * where this function called, there should be always be another device (or
    * this_dev) which is active.
@@@ -2005,8 -2020,8 +2020,8 @@@ void __cold btrfs_assign_next_active_de
                         (fs_info->sb->s_bdev == device->bdev))
                 fs_info->sb->s_bdev = next_device->bdev;
   
-       if (fs_info->fs_devices->latest_bdev == device->bdev)
-               fs_info->fs_devices->latest_bdev = next_device->bdev;
+       if (fs_info->fs_devices->latest_dev->bdev == device->bdev)
+               fs_info->fs_devices->latest_dev = next_device;
   }
   
   /*
@@@ -2069,11 -2084,12 +2084,12 @@@ void btrfs_scratch_superblocks(struct b
         btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
   
         /* Update ctime/mtime for device path for libblkid */
-       update_dev_time(bdev);
+       update_dev_time(device_path);
   }
   
- int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
-                   u64 devid, struct block_device **bdev, fmode_t *mode)
+ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
+                   struct btrfs_dev_lookup_args *args,
+                   struct block_device **bdev, fmode_t *mode)
   {
         struct btrfs_device *device;
         struct btrfs_fs_devices *cur_devices;
@@@ -2081,22 -2097,23 +2097,23 @@@
         u64 num_devices;
         int ret = 0;
   
-       mutex_lock(&uuid_mutex);
- 
+       /*
+        * The device list in fs_devices is accessed without locks (neither
+        * uuid_mutex nor device_list_mutex) as it won't change on a mounted
+        * filesystem and another device rm cannot run.
+        */
         num_devices = btrfs_num_devices(fs_info);
   
         ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
         if (ret)
                 goto out;
   
-       device = btrfs_find_device_by_devspec(fs_info, devid, device_path);
- 
-       if (IS_ERR(device)) {
-               if (PTR_ERR(device) == -ENOENT &&
-                   device_path && strcmp(device_path, "missing") == 0)
+       device = btrfs_find_device(fs_info->fs_devices, args);
+       if (!device) {
+               if (args->missing)
                         ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
                 else
-                       ret = PTR_ERR(device);
+                       ret = -ENOENT;
                 goto out;
         }
   
@@@ -2126,11 -2143,9 +2143,9 @@@
                 mutex_unlock(&fs_info->chunk_mutex);
         }
   
-       mutex_unlock(&uuid_mutex);
         ret = btrfs_shrink_device(device, 0);
         if (!ret)
                 btrfs_reada_remove_dev(device);
-       mutex_lock(&uuid_mutex);
         if (ret)
                 goto error_undo;
   
@@@ -2159,7 -2174,7 +2174,7 @@@
         /*
          * In normal cases the cur_devices == fs_devices. But in case
          * of deleting a seed device, the cur_devices should point to
-        * its own fs_devices listed under the fs_devices->seed.
+        * its own fs_devices listed under the fs_devices->seed_list.
          */
         cur_devices = device->fs_devices;
         mutex_lock(&fs_devices->device_list_mutex);
@@@ -2210,14 -2225,21 +2225,21 @@@
         synchronize_rcu();
         btrfs_free_device(device);
   
-       if (cur_devices->open_devices == 0) {
+       /*
+        * This can happen if cur_devices is the private seed devices list.  We
+        * cannot call close_fs_devices() here because it expects the uuid_mutex
+        * to be held, but in fact we don't need that for the private
+        * seed_devices, we can simply decrement cur_devices->opened and then
+        * remove it from our list and free the fs_devices.
+        */
+       if (cur_devices->num_devices == 0) {
                 list_del_init(&cur_devices->seed_list);
-               close_fs_devices(cur_devices);
+               ASSERT(cur_devices->opened == 1);
+               cur_devices->opened--;
                 free_fs_devices(cur_devices);
         }
   
   out:
-       mutex_unlock(&uuid_mutex);
         return ret;
   
   error_undo:
@@@ -2305,13 -2327,6 +2327,6 @@@ void btrfs_destroy_dev_replace_tgtdev(s
   
         mutex_unlock(&fs_devices->device_list_mutex);
   
-       /*
-        * The update_dev_time() with in btrfs_scratch_superblocks()
-        * may lead to a call to btrfs_show_devname() which will try
-        * to hold device_list_mutex. And here this device
-        * is already out of device list, so we don't have to hold
-        * the device_list_mutex lock.
-        */
         btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
                                   tgtdev->name->str);
   
@@@ -2320,69 -2335,98 +2335,98 @@@
         btrfs_free_device(tgtdev);
   }
   
- static struct btrfs_device *btrfs_find_device_by_path(
-               struct btrfs_fs_info *fs_info, const char *device_path)
+ /**
+  * Populate args from device at path
+  *
+  * @fs_info:  the filesystem
+  * @args:     the args to populate
+  * @path:     the path to the device
+  *
+  * This will read the super block of the device at @path and populate @args with
+  * the devid, fsid, and uuid.  This is meant to be used for ioctls that need to
+  * look up a device to operate on, but need to do it before we take any locks.
+  * This properly handles the special case of "missing" that a user may pass in,
+  * and does some basic sanity checks.  The caller must make sure that @path is
+  * properly NUL terminated before calling in, and must call
+  * btrfs_put_dev_args_from_path() in order to free up the temporary fsid and
+  * uuid buffers.
+  *
+  * Return: 0 for success, -errno for failure
+  */
+ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
+                                struct btrfs_dev_lookup_args *args,
+                                const char *path)
   {
-       int ret = 0;
         struct btrfs_super_block *disk_super;
-       u64 devid;
-       u8 *dev_uuid;
         struct block_device *bdev;
-       struct btrfs_device *device;
+       int ret;
   
-       ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
-                                   fs_info->bdev_holder, 0, &bdev, &disk_super);
-       if (ret)
-               return ERR_PTR(ret);
+       if (!path || !path[0])
+               return -EINVAL;
+       if (!strcmp(path, "missing")) {
+               args->missing = true;
+               return 0;
+       }
   
-       devid = btrfs_stack_device_id(&disk_super->dev_item);
-       dev_uuid = disk_super->dev_item.uuid;
+       args->uuid = kzalloc(BTRFS_UUID_SIZE, GFP_KERNEL);
+       args->fsid = kzalloc(BTRFS_FSID_SIZE, GFP_KERNEL);
+       if (!args->uuid || !args->fsid) {
+               btrfs_put_dev_args_from_path(args);
+               return -ENOMEM;
+       }
+ 
+       ret = btrfs_get_bdev_and_sb(path, FMODE_READ, fs_info->bdev_holder, 0,
+                                   &bdev, &disk_super);
+       if (ret)
+               return ret;
+       args->devid = btrfs_stack_device_id(&disk_super->dev_item);
+       memcpy(args->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE);
         if (btrfs_fs_incompat(fs_info, METADATA_UUID))
-               device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
-                                          disk_super->metadata_uuid);
+               memcpy(args->fsid, disk_super->metadata_uuid, BTRFS_FSID_SIZE);
         else
-               device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
-                                          disk_super->fsid);
- 
+               memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
         btrfs_release_disk_super(disk_super);
-       if (!device)
-               device = ERR_PTR(-ENOENT);
         blkdev_put(bdev, FMODE_READ);
-       return device;
+       return 0;
   }
   
   /*
-  * Lookup a device given by device id, or the path if the id is 0.
+  * Only use this jointly with btrfs_get_dev_args_from_path(), because only that
+  * function allocates the ->uuid and ->fsid buffers freed here; everybody else
+  * uses local variables that don't need to be freed.
    */
+ void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args)
+ {
+       kfree(args->uuid);
+       kfree(args->fsid);
+       args->uuid = NULL;
+       args->fsid = NULL;
+ }
+ 
   struct btrfs_device *btrfs_find_device_by_devspec(
                 struct btrfs_fs_info *fs_info, u64 devid,
                 const char *device_path)
   {
+       BTRFS_DEV_LOOKUP_ARGS(args);
         struct btrfs_device *device;
+       int ret;
   
         if (devid) {
-               device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
-                                          NULL);
+               args.devid = devid;
+               device = btrfs_find_device(fs_info->fs_devices, &args);
                 if (!device)
                         return ERR_PTR(-ENOENT);
                 return device;
         }
   
-       if (!device_path || !device_path[0])
-               return ERR_PTR(-EINVAL);
- 
-       if (strcmp(device_path, "missing") == 0) {
-               /* Find first missing device */
-               list_for_each_entry(device, &fs_info->fs_devices->devices,
-                                   dev_list) {
-                       if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
-                                    &device->dev_state) && !device->bdev)
-                               return device;
-               }
+       ret = btrfs_get_dev_args_from_path(fs_info, &args, device_path);
+       if (ret)
+               return ERR_PTR(ret);
+       device = btrfs_find_device(fs_info->fs_devices, &args);
+       btrfs_put_dev_args_from_path(&args);
+       if (!device)
                 return ERR_PTR(-ENOENT);
-       }
- 
-       return btrfs_find_device_by_path(fs_info, device_path);
+       return device;
   }
   
   /*
@@@ -2459,6 -2503,7 +2503,7 @@@ static int btrfs_prepare_sprout(struct 
    */
   static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
   {
+       BTRFS_DEV_LOOKUP_ARGS(args);
         struct btrfs_fs_info *fs_info = trans->fs_info;
         struct btrfs_root *root = fs_info->chunk_root;
         struct btrfs_path *path;
@@@ -2468,7 -2513,6 +2513,6 @@@
         struct btrfs_key key;
         u8 fs_uuid[BTRFS_FSID_SIZE];
         u8 dev_uuid[BTRFS_UUID_SIZE];
-       u64 devid;
         int ret;
   
         path = btrfs_alloc_path();
@@@ -2480,7 -2524,9 +2524,9 @@@
         key.type = BTRFS_DEV_ITEM_KEY;
   
         while (1) {
+               btrfs_reserve_chunk_metadata(trans, false);
                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+               btrfs_trans_release_chunk_metadata(trans);
                 if (ret < 0)
                         goto error;
   
@@@ -2505,13 -2551,14 +2551,14 @@@ next_slot
   
                 dev_item = btrfs_item_ptr(leaf, path->slots[0],
                                           struct btrfs_dev_item);
-               devid = btrfs_device_id(leaf, dev_item);
+               args.devid = btrfs_device_id(leaf, dev_item);
                 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
                                    BTRFS_UUID_SIZE);
                 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
                                    BTRFS_FSID_SIZE);
-               device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
-                                          fs_uuid);
+               args.uuid = dev_uuid;
+               args.fsid = fs_uuid;
+               device = btrfs_find_device(fs_info->fs_devices, &args);
                 BUG_ON(!device); /* Logic error */
   
                 if (device->fs_devices->seeding) {
@@@ -2610,8 -2657,8 +2657,8 @@@ int btrfs_init_new_device(struct btrfs_
         device->io_width = fs_info->sectorsize;
         device->io_align = fs_info->sectorsize;
         device->sector_size = fs_info->sectorsize;
- -      device->total_bytes = round_down(i_size_read(bdev->bd_inode),
- -                                       fs_info->sectorsize);
+ +      device->total_bytes =
+ +              round_down(bdev_nr_bytes(bdev), fs_info->sectorsize);
         device->disk_total_bytes = device->total_bytes;
         device->commit_total_bytes = device->total_bytes;
         set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
@@@ -2627,6 -2674,8 +2674,8 @@@
                         btrfs_abort_transaction(trans, ret);
                         goto error_trans;
                 }
+               btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev,
+                                               device);
         }
   
         device->fs_devices = fs_devices;
@@@ -2733,7 -2782,7 +2782,7 @@@
         btrfs_forget_devices(device_path);
   
         /* Update ctime/mtime for blkid or udev */
-       update_dev_time(bdev);
+       update_dev_time(device_path);
   
         return ret;
   
@@@ -2826,6 -2875,7 +2875,7 @@@ int btrfs_grow_device(struct btrfs_tran
         struct btrfs_super_block *super_copy = fs_info->super_copy;
         u64 old_total;
         u64 diff;
+       int ret;
   
         if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
                 return -EACCES;
@@@ -2854,7 -2904,11 +2904,11 @@@
                               &trans->transaction->dev_update_list);
         mutex_unlock(&fs_info->chunk_mutex);
   
-       return btrfs_update_device(trans, device);
+       btrfs_reserve_chunk_metadata(trans, false);
+       ret = btrfs_update_device(trans, device);
+       btrfs_trans_release_chunk_metadata(trans);
+ 
+       return ret;
   }
   
   static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
@@@ -3096,7 -3150,7 +3150,7 @@@ int btrfs_remove_chunk(struct btrfs_tra
                 const u64 sys_flags = btrfs_system_alloc_profile(fs_info);
                 struct btrfs_block_group *sys_bg;
   
-               sys_bg = btrfs_alloc_chunk(trans, sys_flags);
+               sys_bg = btrfs_create_chunk(trans, sys_flags);
                 if (IS_ERR(sys_bg)) {
                         ret = PTR_ERR(sys_bg);
                         btrfs_abort_transaction(trans, ret);
@@@ -4889,8 -4943,10 +4943,10 @@@ again
                         round_down(old_total - diff, fs_info->sectorsize));
         mutex_unlock(&fs_info->chunk_mutex);
   
+       btrfs_reserve_chunk_metadata(trans, false);
         /* Now btrfs_update_device() will change the on-disk size. */
         ret = btrfs_update_device(trans, device);
+       btrfs_trans_release_chunk_metadata(trans);
         if (ret < 0) {
                 btrfs_abort_transaction(trans, ret);
                 btrfs_end_transaction(trans);
@@@ -4973,7 -5029,7 +5029,7 @@@ static void check_raid1c34_incompat_fla
   }
   
   /*
-  * Structure used internally for __btrfs_alloc_chunk() function.
+  * Structure used internally for btrfs_create_chunk() function.
    * Wraps needed parameters.
    */
   struct alloc_chunk_ctl {
@@@ -5377,7 -5433,7 +5433,7 @@@ error_del_extent
         return block_group;
   }
   
- struct btrfs_block_group *btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
                                             u64 type)
   {
         struct btrfs_fs_info *info = trans->fs_info;
@@@ -5578,12 -5634,12 +5634,12 @@@ static noinline int init_first_rw_devic
          */
   
         alloc_profile = btrfs_metadata_alloc_profile(fs_info);
-       meta_bg = btrfs_alloc_chunk(trans, alloc_profile);
+       meta_bg = btrfs_create_chunk(trans, alloc_profile);
         if (IS_ERR(meta_bg))
                 return PTR_ERR(meta_bg);
   
         alloc_profile = btrfs_system_alloc_profile(fs_info);
-       sys_bg = btrfs_alloc_chunk(trans, alloc_profile);
+       sys_bg = btrfs_create_chunk(trans, alloc_profile);
         if (IS_ERR(sys_bg))
                 return PTR_ERR(sys_bg);
   
@@@ -5597,17 -5653,17 +5653,17 @@@ static inline int btrfs_chunk_max_error
         return btrfs_raid_array[index].tolerated_failures;
   }
   
- int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
+ bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset)
   {
         struct extent_map *em;
         struct map_lookup *map;
-       int readonly = 0;
         int miss_ndevs = 0;
         int i;
+       bool ret = true;
   
         em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
         if (IS_ERR(em))
-               return 1;
+               return false;
   
         map = em->map_lookup;
         for (i = 0; i < map->num_stripes; i++) {
@@@ -5618,21 -5674,20 +5674,20 @@@
                 }
                 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
                                         &map->stripes[i].dev->dev_state)) {
-                       readonly = 1;
+                       ret = false;
                         goto end;
                 }
         }
   
         /*
-        * If the number of missing devices is larger than max errors,
-        * we can not write the data into that chunk successfully, so
-        * set it readonly.
+        * If the number of missing devices is larger than max errors, we can
+        * not write the data into that chunk successfully.
          */
         if (miss_ndevs > btrfs_chunk_max_errors(map))
-               readonly = 1;
+               ret = false;
   end:
         free_extent_map(em);
-       return readonly;
+       return ret;
   }
   
   void btrfs_mapping_tree_free(struct extent_map_tree *tree)
@@@ -5795,7 -5850,7 +5850,7 @@@ static int find_live_mirror(struct btrf
   }
   
   /* Bubble-sort the stripe set to put the parity/syndrome stripes last */
- static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
+ static void sort_parity_stripes(struct btrfs_io_context *bioc, int num_stripes)
   {
         int i;
         int again = 1;
@@@ -5804,52 -5859,55 +5859,55 @@@
                 again = 0;
                 for (i = 0; i < num_stripes - 1; i++) {
                         /* Swap if parity is on a smaller index */
-                       if (bbio->raid_map[i] > bbio->raid_map[i + 1]) {
-                               swap(bbio->stripes[i], bbio->stripes[i + 1]);
-                               swap(bbio->raid_map[i], bbio->raid_map[i + 1]);
+                       if (bioc->raid_map[i] > bioc->raid_map[i + 1]) {
+                               swap(bioc->stripes[i], bioc->stripes[i + 1]);
+                               swap(bioc->raid_map[i], bioc->raid_map[i + 1]);
                                 again = 1;
                         }
                 }
         }
   }
   
- static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
+ static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
+                                                      int total_stripes,
+                                                      int real_stripes)
   {
-       struct btrfs_bio *bbio = kzalloc(
-                /* the size of the btrfs_bio */
-               sizeof(struct btrfs_bio) +
-               /* plus the variable array for the stripes */
-               sizeof(struct btrfs_bio_stripe) * (total_stripes) +
-               /* plus the variable array for the tgt dev */
+       struct btrfs_io_context *bioc = kzalloc(
+                /* The size of btrfs_io_context */
+               sizeof(struct btrfs_io_context) +
+               /* Plus the variable array for the stripes */
+               sizeof(struct btrfs_io_stripe) * (total_stripes) +
+               /* Plus the variable array for the tgt dev */
                 sizeof(int) * (real_stripes) +
                 /*
-                * plus the raid_map, which includes both the tgt dev
-                * and the stripes
+                * Plus the raid_map, which includes both the tgt dev
+                * and the stripes.
                  */
                 sizeof(u64) * (total_stripes),
                 GFP_NOFS|__GFP_NOFAIL);
   
-       atomic_set(&bbio->error, 0);
-       refcount_set(&bbio->refs, 1);
+       atomic_set(&bioc->error, 0);
+       refcount_set(&bioc->refs, 1);
   
-       bbio->tgtdev_map = (int *)(bbio->stripes + total_stripes);
-       bbio->raid_map = (u64 *)(bbio->tgtdev_map + real_stripes);
+       bioc->fs_info = fs_info;
+       bioc->tgtdev_map = (int *)(bioc->stripes + total_stripes);
+       bioc->raid_map = (u64 *)(bioc->tgtdev_map + real_stripes);
   
-       return bbio;
+       return bioc;
   }
   
- void btrfs_get_bbio(struct btrfs_bio *bbio)
+ void btrfs_get_bioc(struct btrfs_io_context *bioc)
   {
-       WARN_ON(!refcount_read(&bbio->refs));
-       refcount_inc(&bbio->refs);
+       WARN_ON(!refcount_read(&bioc->refs));
+       refcount_inc(&bioc->refs);
   }
   
- void btrfs_put_bbio(struct btrfs_bio *bbio)
+ void btrfs_put_bioc(struct btrfs_io_context *bioc)
   {
-       if (!bbio)
+       if (!bioc)
                 return;
-       if (refcount_dec_and_test(&bbio->refs))
-               kfree(bbio);
+       if (refcount_dec_and_test(&bioc->refs))
+               kfree(bioc);
   }
   
   /* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */
@@@ -5859,11 -5917,11 +5917,11 @@@
    */
   static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
                                          u64 logical, u64 *length_ret,
-                                        struct btrfs_bio **bbio_ret)
+                                        struct btrfs_io_context **bioc_ret)
   {
         struct extent_map *em;
         struct map_lookup *map;
-       struct btrfs_bio *bbio;
+       struct btrfs_io_context *bioc;
         u64 length = *length_ret;
         u64 offset;
         u64 stripe_nr;
@@@ -5882,8 -5940,8 +5940,8 @@@
         int ret = 0;
         int i;
   
-       /* discard always return a bbio */
-       ASSERT(bbio_ret);
+       /* Discard always returns a bioc. */
+       ASSERT(bioc_ret);
   
         em = btrfs_get_chunk_map(fs_info, logical, length);
         if (IS_ERR(em))
@@@ -5946,26 -6004,25 +6004,25 @@@
                                         &stripe_index);
         }
   
-       bbio = alloc_btrfs_bio(num_stripes, 0);
-       if (!bbio) {
+       bioc = alloc_btrfs_io_context(fs_info, num_stripes, 0);
+       if (!bioc) {
                 ret = -ENOMEM;
                 goto out;
         }
   
         for (i = 0; i < num_stripes; i++) {
-               bbio->stripes[i].physical =
+               bioc->stripes[i].physical =
                         map->stripes[stripe_index].physical +
                         stripe_offset + stripe_nr * map->stripe_len;
-               bbio->stripes[i].dev = map->stripes[stripe_index].dev;
+               bioc->stripes[i].dev = map->stripes[stripe_index].dev;
   
                 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
                                  BTRFS_BLOCK_GROUP_RAID10)) {
-                       bbio->stripes[i].length = stripes_per_dev *
+                       bioc->stripes[i].length = stripes_per_dev *
                                 map->stripe_len;
   
                         if (i / sub_stripes < remaining_stripes)
-                               bbio->stripes[i].length +=
-                                       map->stripe_len;
+                               bioc->stripes[i].length += map->stripe_len;
   
                         /*
                          * Special for the first stripe and
@@@ -5976,19 -6033,17 +6033,17 @@@
                          *    off     end_off
                          */
                         if (i < sub_stripes)
-                               bbio->stripes[i].length -=
-                                       stripe_offset;
+                               bioc->stripes[i].length -= stripe_offset;
   
                         if (stripe_index >= last_stripe &&
                             stripe_index <= (last_stripe +
                                              sub_stripes - 1))
-                               bbio->stripes[i].length -=
-                                       stripe_end_offset;
+                               bioc->stripes[i].length -= stripe_end_offset;
   
                         if (i == sub_stripes - 1)
                                 stripe_offset = 0;
                 } else {
-                       bbio->stripes[i].length = length;
+                       bioc->stripes[i].length = length;
                 }
   
                 stripe_index++;
@@@ -5998,9 -6053,9 +6053,9 @@@
                 }
         }
   
-       *bbio_ret = bbio;
-       bbio->map_type = map->type;
-       bbio->num_stripes = num_stripes;
+       *bioc_ret = bioc;
+       bioc->map_type = map->type;
+       bioc->num_stripes = num_stripes;
   out:
         free_extent_map(em);
         return ret;
@@@ -6024,7 -6079,7 +6079,7 @@@ static int get_extra_mirror_from_replac
                                          u64 srcdev_devid, int *mirror_num,
                                          u64 *physical)
   {
-       struct btrfs_bio *bbio = NULL;
+       struct btrfs_io_context *bioc = NULL;
         int num_stripes;
         int index_srcdev = 0;
         int found = 0;
@@@ -6033,20 -6088,20 +6088,20 @@@
         int ret = 0;
   
         ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
-                               logical, &length, &bbio, 0, 0);
+                               logical, &length, &bioc, 0, 0);
         if (ret) {
-               ASSERT(bbio == NULL);
+               ASSERT(bioc == NULL);
                 return ret;
         }
   
-       num_stripes = bbio->num_stripes;
+       num_stripes = bioc->num_stripes;
         if (*mirror_num > num_stripes) {
                 /*
                  * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
                  * that means that the requested area is not left of the left
                  * cursor
                  */
-               btrfs_put_bbio(bbio);
+               btrfs_put_bioc(bioc);
                 return -EIO;
         }
   
@@@ -6056,7 -6111,7 +6111,7 @@@
          * pointer to the one of the target drive.
          */
         for (i = 0; i < num_stripes; i++) {
-               if (bbio->stripes[i].dev->devid != srcdev_devid)
+               if (bioc->stripes[i].dev->devid != srcdev_devid)
                         continue;
   
                 /*
@@@ -6064,15 -6119,15 +6119,15 @@@
                  * mirror with the lowest physical address
                  */
                 if (found &&
-                   physical_of_found <= bbio->stripes[i].physical)
+                   physical_of_found <= bioc->stripes[i].physical)
                         continue;
   
                 index_srcdev = i;
                 found = 1;
-               physical_of_found = bbio->stripes[i].physical;
+               physical_of_found = bioc->stripes[i].physical;
         }
   
-       btrfs_put_bbio(bbio);
+       btrfs_put_bioc(bioc);
   
         ASSERT(found);
         if (!found)
@@@ -6103,12 -6158,12 +6158,12 @@@ static bool is_block_group_to_copy(stru
   }
   
   static void handle_ops_on_dev_replace(enum btrfs_map_op op,
-                                     struct btrfs_bio **bbio_ret,
+                                     struct btrfs_io_context **bioc_ret,
                                       struct btrfs_dev_replace *dev_replace,
                                       u64 logical,
                                       int *num_stripes_ret, int *max_errors_ret)
   {
-       struct btrfs_bio *bbio = *bbio_ret;
+       struct btrfs_io_context *bioc = *bioc_ret;
         u64 srcdev_devid = dev_replace->srcdev->devid;
         int tgtdev_indexes = 0;
         int num_stripes = *num_stripes_ret;
@@@ -6138,17 -6193,17 +6193,17 @@@
                  */
                 index_where_to_add = num_stripes;
                 for (i = 0; i < num_stripes; i++) {
-                       if (bbio->stripes[i].dev->devid == srcdev_devid) {
+                       if (bioc->stripes[i].dev->devid == srcdev_devid) {
                                 /* write to new disk, too */
-                               struct btrfs_bio_stripe *new =
-                                       bbio->stripes + index_where_to_add;
-                               struct btrfs_bio_stripe *old =
-                                       bbio->stripes + i;
+                               struct btrfs_io_stripe *new =
+                                       bioc->stripes + index_where_to_add;
+                               struct btrfs_io_stripe *old =
+                                       bioc->stripes + i;
   
                                 new->physical = old->physical;
                                 new->length = old->length;
                                 new->dev = dev_replace->tgtdev;
-                               bbio->tgtdev_map[i] = index_where_to_add;
+                               bioc->tgtdev_map[i] = index_where_to_add;
                                 index_where_to_add++;
                                 max_errors++;
                                 tgtdev_indexes++;
@@@ -6168,30 -6223,29 +6223,29 @@@
                  * full copy of the source drive.
                  */
                 for (i = 0; i < num_stripes; i++) {
-                       if (bbio->stripes[i].dev->devid == srcdev_devid) {
+                       if (bioc->stripes[i].dev->devid == srcdev_devid) {
                                 /*
                                  * In case of DUP, in order to keep it simple,
                                  * only add the mirror with the lowest physical
                                  * address
                                  */
                                 if (found &&
-                                   physical_of_found <=
-                                    bbio->stripes[i].physical)
+                                   physical_of_found <= bioc->stripes[i].physical)
                                         continue;
                                 index_srcdev = i;
                                 found = 1;
-                               physical_of_found = bbio->stripes[i].physical;
+                               physical_of_found = bioc->stripes[i].physical;
                         }
                 }
                 if (found) {
-                       struct btrfs_bio_stripe *tgtdev_stripe =
-                               bbio->stripes + num_stripes;
+                       struct btrfs_io_stripe *tgtdev_stripe =
+                               bioc->stripes + num_stripes;
   
                         tgtdev_stripe->physical = physical_of_found;
                         tgtdev_stripe->length =
-                               bbio->stripes[index_srcdev].length;
+                               bioc->stripes[index_srcdev].length;
                         tgtdev_stripe->dev = dev_replace->tgtdev;
-                       bbio->tgtdev_map[index_srcdev] = num_stripes;
+                       bioc->tgtdev_map[index_srcdev] = num_stripes;
   
                         tgtdev_indexes++;
                         num_stripes++;
@@@ -6200,8 -6254,8 +6254,8 @@@
   
         *num_stripes_ret = num_stripes;
         *max_errors_ret = max_errors;
-       bbio->num_tgtdevs = tgtdev_indexes;
-       *bbio_ret = bbio;
+       bioc->num_tgtdevs = tgtdev_indexes;
+       *bioc_ret = bioc;
   }
   
   static bool need_full_stripe(enum btrfs_map_op op)
@@@ -6304,7 -6358,7 +6358,7 @@@ int btrfs_get_io_geometry(struct btrfs_
   static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
                              enum btrfs_map_op op,
                              u64 logical, u64 *length,
-                            struct btrfs_bio **bbio_ret,
+                            struct btrfs_io_context **bioc_ret,
                              int mirror_num, int need_raid_map)
   {
         struct extent_map *em;
@@@ -6319,7 -6373,7 +6373,7 @@@
         int num_stripes;
         int max_errors = 0;
         int tgtdev_indexes = 0;
-       struct btrfs_bio *bbio = NULL;
+       struct btrfs_io_context *bioc = NULL;
         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
         int dev_replace_is_ongoing = 0;
         int num_alloc_stripes;
@@@ -6328,7 -6382,7 +6382,7 @@@
         u64 raid56_full_stripe_start = (u64)-1;
         struct btrfs_io_geometry geom;
   
-       ASSERT(bbio_ret);
+       ASSERT(bioc_ret);
         ASSERT(op != BTRFS_MAP_DISCARD);
   
         em = btrfs_get_chunk_map(fs_info, logical, *length);
@@@ -6472,20 -6526,20 +6526,20 @@@
                 tgtdev_indexes = num_stripes;
         }
   
-       bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
-       if (!bbio) {
+       bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes, tgtdev_indexes);
+       if (!bioc) {
                 ret = -ENOMEM;
                 goto out;
         }
   
         for (i = 0; i < num_stripes; i++) {
-               bbio->stripes[i].physical = map->stripes[stripe_index].physical +
+               bioc->stripes[i].physical = map->stripes[stripe_index].physical +
                         stripe_offset + stripe_nr * map->stripe_len;
-               bbio->stripes[i].dev = map->stripes[stripe_index].dev;
+               bioc->stripes[i].dev = map->stripes[stripe_index].dev;
                 stripe_index++;
         }
   
-       /* build raid_map */
+       /* Build raid_map */
         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
             (need_full_stripe(op) || mirror_num > 1)) {
                 u64 tmp;
@@@ -6497,15 -6551,15 +6551,15 @@@
                 /* Fill in the logical address of each stripe */
                 tmp = stripe_nr * data_stripes;
                 for (i = 0; i < data_stripes; i++)
-                       bbio->raid_map[(i+rot) % num_stripes] =
+                       bioc->raid_map[(i + rot) % num_stripes] =
                                 em->start + (tmp + i) * map->stripe_len;
   
-               bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
+               bioc->raid_map[(i + rot) % map->num_stripes] = RAID5_P_STRIPE;
                 if (map->type & BTRFS_BLOCK_GROUP_RAID6)
-                       bbio->raid_map[(i+rot+1) % num_stripes] =
+                       bioc->raid_map[(i + rot + 1) % num_stripes] =
                                 RAID6_Q_STRIPE;
   
-               sort_parity_stripes(bbio, num_stripes);
+               sort_parity_stripes(bioc, num_stripes);
         }
   
         if (need_full_stripe(op))
@@@ -6513,15 -6567,15 +6567,15 @@@
   
         if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
             need_full_stripe(op)) {
-               handle_ops_on_dev_replace(op, &bbio, dev_replace, logical,
+               handle_ops_on_dev_replace(op, &bioc, dev_replace, logical,
                                           &num_stripes, &max_errors);
         }
   
-       *bbio_ret = bbio;
-       bbio->map_type = map->type;
-       bbio->num_stripes = num_stripes;
-       bbio->max_errors = max_errors;
-       bbio->mirror_num = mirror_num;
+       *bioc_ret = bioc;
+       bioc->map_type = map->type;
+       bioc->num_stripes = num_stripes;
+       bioc->max_errors = max_errors;
+       bioc->mirror_num = mirror_num;
   
         /*
          * this is the case that REQ_READ && dev_replace_is_ongoing &&
@@@ -6530,9 -6584,9 +6584,9 @@@
          */
         if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
                 WARN_ON(num_stripes > 1);
-               bbio->stripes[0].dev = dev_replace->tgtdev;
-               bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
-               bbio->mirror_num = map->num_stripes + 1;
+               bioc->stripes[0].dev = dev_replace->tgtdev;
+               bioc->stripes[0].physical = physical_to_patch_in_first_stripe;
+               bioc->mirror_num = map->num_stripes + 1;
         }
   out:
         if (dev_replace_is_ongoing) {
@@@ -6546,43 -6600,43 +6600,43 @@@
   
   int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
                       u64 logical, u64 *length,
-                     struct btrfs_bio **bbio_ret, int mirror_num)
+                     struct btrfs_io_context **bioc_ret, int mirror_num)
   {
         if (op == BTRFS_MAP_DISCARD)
                 return __btrfs_map_block_for_discard(fs_info, logical,
-                                                    length, bbio_ret);
+                                                    length, bioc_ret);
   
-       return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
+       return __btrfs_map_block(fs_info, op, logical, length, bioc_ret,
                                  mirror_num, 0);
   }
   
   /* For Scrub/replace */
   int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
                      u64 logical, u64 *length,
-                    struct btrfs_bio **bbio_ret)
+                    struct btrfs_io_context **bioc_ret)
   {
-       return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
+       return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 0, 1);
   }
   
- static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio)
+ static inline void btrfs_end_bioc(struct btrfs_io_context *bioc, struct bio *bio)
   {
-       bio->bi_private = bbio->private;
-       bio->bi_end_io = bbio->end_io;
+       bio->bi_private = bioc->private;
+       bio->bi_end_io = bioc->end_io;
         bio_endio(bio);
   
-       btrfs_put_bbio(bbio);
+       btrfs_put_bioc(bioc);
   }
   
   static void btrfs_end_bio(struct bio *bio)
   {
-       struct btrfs_bio *bbio = bio->bi_private;
+       struct btrfs_io_context *bioc = bio->bi_private;
         int is_orig_bio = 0;
   
         if (bio->bi_status) {
-               atomic_inc(&bbio->error);
+               atomic_inc(&bioc->error);
                 if (bio->bi_status == BLK_STS_IOERR ||
                     bio->bi_status == BLK_STS_TARGET) {
-                       struct btrfs_device *dev = btrfs_io_bio(bio)->device;
+                       struct btrfs_device *dev = btrfs_bio(bio)->device;
   
                         ASSERT(dev->bdev);
                         if (btrfs_op(bio) == BTRFS_MAP_WRITE)
@@@ -6597,22 -6651,22 +6651,22 @@@
                 }
         }
   
-       if (bio == bbio->orig_bio)
+       if (bio == bioc->orig_bio)
                 is_orig_bio = 1;
   
-       btrfs_bio_counter_dec(bbio->fs_info);
+       btrfs_bio_counter_dec(bioc->fs_info);
   
-       if (atomic_dec_and_test(&bbio->stripes_pending)) {
+       if (atomic_dec_and_test(&bioc->stripes_pending)) {
                 if (!is_orig_bio) {
                         bio_put(bio);
-                       bio = bbio->orig_bio;
+                       bio = bioc->orig_bio;
                 }
   
-               btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
+               btrfs_bio(bio)->mirror_num = bioc->mirror_num;
                 /* only send an error to the higher layers if it is
                  * beyond the tolerance of the btrfs bio
                  */
-               if (atomic_read(&bbio->error) > bbio->max_errors) {
+               if (atomic_read(&bioc->error) > bioc->max_errors) {
                         bio->bi_status = BLK_STS_IOERR;
                 } else {
                         /*
@@@ -6622,19 -6676,19 +6676,19 @@@
                         bio->bi_status = BLK_STS_OK;
                 }
   
-               btrfs_end_bbio(bbio, bio);
+               btrfs_end_bioc(bioc, bio);
         } else if (!is_orig_bio) {
                 bio_put(bio);
         }
   }
   
- static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
+ static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio,
                               u64 physical, struct btrfs_device *dev)
   {
-       struct btrfs_fs_info *fs_info = bbio->fs_info;
+       struct btrfs_fs_info *fs_info = bioc->fs_info;
   
-       bio->bi_private = bbio;
-       btrfs_io_bio(bio)->device = dev;
+       bio->bi_private = bioc;
+       btrfs_bio(bio)->device = dev;
         bio->bi_end_io = btrfs_end_bio;
         bio->bi_iter.bi_sector = physical >> 9;
         /*
@@@ -6663,20 -6717,20 +6717,20 @@@
         btrfsic_submit_bio(bio);
   }
   
- static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
+ static void bioc_error(struct btrfs_io_context *bioc, struct bio *bio, u64 logical)
   {
-       atomic_inc(&bbio->error);
-       if (atomic_dec_and_test(&bbio->stripes_pending)) {
+       atomic_inc(&bioc->error);
+       if (atomic_dec_and_test(&bioc->stripes_pending)) {
                 /* Should be the original bio. */
-               WARN_ON(bio != bbio->orig_bio);
+               WARN_ON(bio != bioc->orig_bio);
   
-               btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
+               btrfs_bio(bio)->mirror_num = bioc->mirror_num;
                 bio->bi_iter.bi_sector = logical >> 9;
-               if (atomic_read(&bbio->error) > bbio->max_errors)
+               if (atomic_read(&bioc->error) > bioc->max_errors)
                         bio->bi_status = BLK_STS_IOERR;
                 else
                         bio->bi_status = BLK_STS_OK;
-               btrfs_end_bbio(bbio, bio);
+               btrfs_end_bioc(bioc, bio);
         }
   }
   
@@@ -6691,36 -6745,34 +6745,34 @@@ blk_status_t btrfs_map_bio(struct btrfs
         int ret;
         int dev_nr;
         int total_devs;
-       struct btrfs_bio *bbio = NULL;
+       struct btrfs_io_context *bioc = NULL;
   
         length = bio->bi_iter.bi_size;
         map_length = length;
   
         btrfs_bio_counter_inc_blocked(fs_info);
         ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
-                               &map_length, &bbio, mirror_num, 1);
+                               &map_length, &bioc, mirror_num, 1);
         if (ret) {
                 btrfs_bio_counter_dec(fs_info);
                 return errno_to_blk_status(ret);
         }
   
-       total_devs = bbio->num_stripes;
-       bbio->orig_bio = first_bio;
-       bbio->private = first_bio->bi_private;
-       bbio->end_io = first_bio->bi_end_io;
-       bbio->fs_info = fs_info;
-       atomic_set(&bbio->stripes_pending, bbio->num_stripes);
+       total_devs = bioc->num_stripes;
+       bioc->orig_bio = first_bio;
+       bioc->private = first_bio->bi_private;
+       bioc->end_io = first_bio->bi_end_io;
+       atomic_set(&bioc->stripes_pending, bioc->num_stripes);
   
-       if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
+       if ((bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
             ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) {
                 /* In this case, map_length has been set to the length of
                    a single stripe; not the whole write */
                 if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
-                       ret = raid56_parity_write(fs_info, bio, bbio,
-                                                 map_length);
+                       ret = raid56_parity_write(bio, bioc, map_length);
                 } else {
-                       ret = raid56_parity_recover(fs_info, bio, bbio,
-                                                   map_length, mirror_num, 1);
+                       ret = raid56_parity_recover(bio, bioc, map_length,
+                                                   mirror_num, 1);
                 }
   
                 btrfs_bio_counter_dec(fs_info);
@@@ -6735,12 -6787,12 +6787,12 @@@
         }
   
         for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
-               dev = bbio->stripes[dev_nr].dev;
+               dev = bioc->stripes[dev_nr].dev;
                 if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING,
                                                    &dev->dev_state) ||
                     (btrfs_op(first_bio) == BTRFS_MAP_WRITE &&
                     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
-                       bbio_error(bbio, first_bio, logical);
+                       bioc_error(bioc, first_bio, logical);
                         continue;
                 }
   
@@@ -6749,12 -6801,39 +6801,39 @@@
                 else
                         bio = first_bio;
   
-               submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, dev);
+               submit_stripe_bio(bioc, bio, bioc->stripes[dev_nr].physical, dev);
         }
         btrfs_bio_counter_dec(fs_info);
         return BLK_STS_OK;
   }
   
+ static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args,
+                                     const struct btrfs_fs_devices *fs_devices)
+ {
+       if (args->fsid == NULL)
+               return true;
+       if (memcmp(fs_devices->metadata_uuid, args->fsid, BTRFS_FSID_SIZE) == 0)
+               return true;
+       return false;
+ }
+ 
+ static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args,
+                                 const struct btrfs_device *device)
+ {
+       ASSERT((args->devid != (u64)-1) || args->missing);
+ 
+       if ((args->devid != (u64)-1) && device->devid != args->devid)
+               return false;
+       if (args->uuid && memcmp(device->uuid, args->uuid, BTRFS_UUID_SIZE) != 0)
+               return false;
+       if (!args->missing)
+               return true;
+       if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) &&
+           !device->bdev)
+               return true;
+       return false;
+ }
+ 
   /*
    * Find a device specified by @devid or @uuid in the list of @fs_devices, or
    * return NULL.
@@@ -6762,31 -6841,25 +6841,25 @@@
    * If devid and uuid are both specified, the match must be exact, otherwise
    * only devid is used.
    */
- struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
-                                      u64 devid, u8 *uuid, u8 *fsid)
+ struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices,
+                                      const struct btrfs_dev_lookup_args *args)
   {
         struct btrfs_device *device;
         struct btrfs_fs_devices *seed_devs;
   
-       if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
+       if (dev_args_match_fs_devices(args, fs_devices)) {
                 list_for_each_entry(device, &fs_devices->devices, dev_list) {
-                       if (device->devid == devid &&
-                           (!uuid || memcmp(device->uuid, uuid,
-                                            BTRFS_UUID_SIZE) == 0))
+                       if (dev_args_match_device(args, device))
                                 return device;
                 }
         }
   
         list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
-               if (!fsid ||
-                   !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
-                       list_for_each_entry(device, &seed_devs->devices,
-                                           dev_list) {
-                               if (device->devid == devid &&
-                                   (!uuid || memcmp(device->uuid, uuid,
-                                                    BTRFS_UUID_SIZE) == 0))
-                                       return device;
-                       }
+               if (!dev_args_match_fs_devices(args, seed_devs))
+                       continue;
+               list_for_each_entry(device, &seed_devs->devices, dev_list) {
+                       if (dev_args_match_device(args, device))
+                               return device;
                 }
         }
   
@@@ -6952,6 -7025,7 +7025,7 @@@ static void warn_32bit_meta_chunk(struc
   static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
                           struct btrfs_chunk *chunk)
   {
+       BTRFS_DEV_LOOKUP_ARGS(args);
         struct btrfs_fs_info *fs_info = leaf->fs_info;
         struct extent_map_tree *map_tree = &fs_info->mapping_tree;
         struct map_lookup *map;
@@@ -7029,11 -7103,12 +7103,12 @@@
                 map->stripes[i].physical =
                         btrfs_stripe_offset_nr(leaf, chunk, i);
                 devid = btrfs_stripe_devid_nr(leaf, chunk, i);
+               args.devid = devid;
                 read_extent_buffer(leaf, uuid, (unsigned long)
                                    btrfs_stripe_dev_uuid_nr(chunk, i),
                                    BTRFS_UUID_SIZE);
-               map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices,
-                                                       devid, uuid, NULL);
+               args.uuid = uuid;
+               map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, &args);
                 if (!map->stripes[i].dev &&
                     !btrfs_test_opt(fs_info, DEGRADED)) {
                         free_extent_map(em);
@@@ -7151,6 -7226,7 +7226,7 @@@ static struct btrfs_fs_devices *open_se
   static int read_one_dev(struct extent_buffer *leaf,
                         struct btrfs_dev_item *dev_item)
   {
+       BTRFS_DEV_LOOKUP_ARGS(args);
         struct btrfs_fs_info *fs_info = leaf->fs_info;
         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
         struct btrfs_device *device;
@@@ -7159,11 -7235,13 +7235,13 @@@
         u8 fs_uuid[BTRFS_FSID_SIZE];
         u8 dev_uuid[BTRFS_UUID_SIZE];
   
-       devid = btrfs_device_id(leaf, dev_item);
+       devid = args.devid = btrfs_device_id(leaf, dev_item);
         read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
                            BTRFS_UUID_SIZE);
         read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
                            BTRFS_FSID_SIZE);
+       args.uuid = dev_uuid;
+       args.fsid = fs_uuid;
   
         if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
                 fs_devices = open_seed_devices(fs_info, fs_uuid);
@@@ -7171,8 -7249,7 +7249,7 @@@
                         return PTR_ERR(fs_devices);
         }
   
-       device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
-                                  fs_uuid);
+       device = btrfs_find_device(fs_info->fs_devices, &args);
         if (!device) {
                 if (!btrfs_test_opt(fs_info, DEGRADED)) {
                         btrfs_report_missing_device(fs_info, devid,
@@@ -7236,7 -7313,7 +7313,7 @@@
   
         fill_device_from_item(leaf, dev_item, device);
         if (device->bdev) {
- -              u64 max_total_bytes = i_size_read(device->bdev->bd_inode);
+ +              u64 max_total_bytes = bdev_nr_bytes(device->bdev);
   
                 if (device->total_bytes > max_total_bytes) {
                         btrfs_err(fs_info,
@@@ -7841,12 -7918,14 +7918,14 @@@ static void btrfs_dev_stat_print_on_loa
   int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
                         struct btrfs_ioctl_get_dev_stats *stats)
   {
+       BTRFS_DEV_LOOKUP_ARGS(args);
         struct btrfs_device *dev;
         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
         int i;
   
         mutex_lock(&fs_devices->device_list_mutex);
-       dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL);
+       args.devid = stats->devid;
+       dev = btrfs_find_device(fs_info->fs_devices, &args);
         mutex_unlock(&fs_devices->device_list_mutex);
   
         if (!dev) {
@@@ -7922,6 -8001,7 +8001,7 @@@ static int verify_one_dev_extent(struc
                                  u64 chunk_offset, u64 devid,
                                  u64 physical_offset, u64 physical_len)
   {
+       struct btrfs_dev_lookup_args args = { .devid = devid };
         struct extent_map_tree *em_tree = &fs_info->mapping_tree;
         struct extent_map *em;
         struct map_lookup *map;
@@@ -7977,7 -8057,7 +8057,7 @@@
         }
   
         /* Make sure no dev extent is beyond device boundary */
-       dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
+       dev = btrfs_find_device(fs_info->fs_devices, &args);
         if (!dev) {
                 btrfs_err(fs_info, "failed to find devid %llu", devid);
                 ret = -EUCLEAN;
diff --combined include/linux/fs.h

index 0dcb902,56eba72..f3cfca5
--- 1/include/linux/fs.h
--- 2/include/linux/fs.h
+++ b/include/linux/fs.h
@@@ -48,7 -48,6 +48,7 @@@
   struct backing_dev_info;
   struct bdi_writeback;
   struct bio;
+ +struct io_comp_batch;
   struct export_operations;
   struct fiemap_extent_info;
   struct hd_geometry;
@@@ -330,12 -329,16 +330,12 @@@ struct kiocb 
         randomized_struct_fields_start
   
         loff_t                  ki_pos;
- -      void (*ki_complete)(struct kiocb *iocb, long ret, long ret2);
+ +      void (*ki_complete)(struct kiocb *iocb, long ret);
         void                    *private;
         int                     ki_flags;
         u16                     ki_hint;
         u16                     ki_ioprio; /* See linux/ioprio.h */
- -      union {
- -              unsigned int            ki_cookie; /* for ->iopoll */
- -              struct wait_page_queue  *ki_waitq; /* for async buffered IO */
- -      };
- -
+ +      struct wait_page_queue  *ki_waitq; /* for async buffered IO */
         randomized_struct_fields_end
   };
   
@@@ -2072,8 -2075,7 +2072,8 @@@ struct file_operations 
         ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
         ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
         ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
- -      int (*iopoll)(struct kiocb *kiocb, bool spin);
+ +      int (*iopoll)(struct kiocb *kiocb, struct io_comp_batch *,
+ +                      unsigned int flags);
         int (*iterate) (struct file *, struct dir_context *);
         int (*iterate_shared) (struct file *, struct dir_context *);
         __poll_t (*poll) (struct file *, struct poll_table_struct *);
@@@ -2496,6 -2498,8 +2496,8 @@@ enum file_time_flags 
   
   extern bool atime_needs_update(const struct path *, struct inode *);
   extern void touch_atime(const struct path *);
+ int inode_update_time(struct inode *inode, struct timespec64 *time, int flags);
+ 
   static inline void file_accessed(struct file *file)
   {
         if (!(file->f_flags & O_NOATIME))
author	Linus Torvalds <torvalds@linux-foundation.org>
	Mon, 1 Nov 2021 19:48:25 +0000 (12:48 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Mon, 1 Nov 2021 19:48:25 +0000 (12:48 -0700)
		1	2
fs/btrfs/compression.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/btrfs/ctree.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/btrfs/dev-replace.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/btrfs/disk-io.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/btrfs/inode.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/btrfs/ioctl.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/btrfs/lzo.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/btrfs/volumes.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/fs.h	patch \|	diff1 \|	diff2 \|	blob \| history