From: Linus Torvalds
Date: Mon, 1 Nov 2021 19:48:25 +0000 (-0700)
Subject: Merge tag 'for-5.16-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
X-Git-Tag: v6.1-rc5~2812

Merge tag 'for-5.16-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs updates from David Sterba:
 "The updates this time are more under the hood and enhancing existing
  features (subpage with compression and zoned namespaces).

  Performance related:

   - misc small inode logging improvements (+3% throughput, -11%
     latency on sample dbench workload)

   - more efficient directory logging: bulk item insertion, less tree
     searches and locking

   - speed up bulk insertion of items into a b-tree, which is used when
     logging directories, when running delayed items for directories
     (fsync and transaction commits) and when running the slow path
     (full sync) of an fsync (bulk creation run time -4%, deletion -12%)

  Core:

   - continued subpage support
      - make defragmentation work
      - make compression write work

   - zoned mode
      - support ZNS (zoned namespaces), zone capacity is number of
        usable blocks in each zone
      - add dedicated block group (zoned) for relocation, to prevent
        out of order writes in some cases
      - greedy block group reclaim, pick the ones with least usable
        space first

   - preparatory work for send protocol updates

   - error handling improvements

   - cleanups and refactoring

  Fixes:

   - lockdep warnings
      - in show_devname callback, on seeding device
      - device delete on loop device due to conversions to workqueues

   - fix deadlock between chunk allocation and chunk btree modifications

   - fix tracking of missing device count and status"

* tag 'for-5.16-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (140 commits)
  btrfs: remove root argument from check_item_in_log()
  btrfs: remove root argument from add_link()
  btrfs: remove root argument from btrfs_unlink_inode()
  btrfs: remove root argument from drop_one_dir_item()
  btrfs: clear MISSING device status bit in btrfs_close_one_device
  btrfs: call btrfs_check_rw_degradable only if there is a missing device
  btrfs: send: prepare for v2 protocol
  btrfs: fix comment about sector sizes supported in 64K systems
  btrfs: update device path inode time instead of bd_inode
  fs: export an inode_update_time helper
  btrfs: fix deadlock when defragging transparent huge pages
  btrfs: sysfs: convert scnprintf and snprintf to sysfs_emit
  btrfs: make btrfs_super_block size match BTRFS_SUPER_INFO_SIZE
  btrfs: update comments for chunk allocation -ENOSPC cases
  btrfs: fix deadlock between chunk allocation and chunk btree modifications
  btrfs: zoned: use greedy gc for auto reclaim
  btrfs: check-integrity: stop storing the block device name in btrfsic_dev_state
  btrfs: use btrfs_get_dev_args_from_path in dev removal ioctls
  btrfs: add a btrfs_get_dev_args_from_path helper
  btrfs: handle device lookup with btrfs_dev_lookup_args
  ...
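Illustrative note: the compression changes in the diff below switch compressed_bio
accounting from per-bio to per-sector granularity (pending_sectors starts at
compressed_len >> sectorsize_bits, and each completing bio subtracts its size in
sectors; the last decrement triggers cleanup). The following is a minimal user-space
sketch of that counting scheme only -- the names (toy_compressed_bio, toy_cb_bio_done,
SECTORSIZE_BITS) are stand-ins, not btrfs APIs; the real code uses refcount_t,
refcount_sub_and_test() and wake_up_var():

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for the fs-wide sector size: 4K sectors => shift of 12. */
#define SECTORSIZE_BITS 12

/* Toy model of a compressed_bio: only the pending-sector counter. */
struct toy_compressed_bio {
	uint64_t compressed_len;
	uint64_t pending_sectors;
};

/* Mirrors the idea of initializing pending_sectors to compressed_len >> sectorsize_bits. */
static void toy_cb_init(struct toy_compressed_bio *cb, uint64_t compressed_len)
{
	cb->compressed_len = compressed_len;
	cb->pending_sectors = compressed_len >> SECTORSIZE_BITS;
}

/*
 * Mirrors the role of dec_and_test_compressed_bio(): subtract the finished
 * bio's size in sectors and report whether this was the last outstanding I/O.
 */
static bool toy_cb_bio_done(struct toy_compressed_bio *cb, uint64_t bio_bytes)
{
	uint64_t sectors = bio_bytes >> SECTORSIZE_BITS;

	assert(bio_bytes && bio_bytes <= cb->compressed_len);
	assert(sectors <= cb->pending_sectors);
	cb->pending_sectors -= sectors;
	return cb->pending_sectors == 0;
}

int main(void)
{
	struct toy_compressed_bio cb;

	/* 128K of compressed data split across two bios of 64K each. */
	toy_cb_init(&cb, 128 * 1024);
	printf("first bio last? %d\n", toy_cb_bio_done(&cb, 64 * 1024));  /* 0 */
	printf("second bio last? %d\n", toy_cb_bio_done(&cb, 64 * 1024)); /* 1 */
	return 0;
}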
--- 037c50bfbeb33b4c74e120eef5b8b99d8f025418 diff --combined fs/btrfs/compression.c index 6c7eb80,1d071c8..32da97c --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@@ -9,7 -9,6 +9,7 @@@ #include #include #include +#include #include #include #include @@@ -29,6 -28,7 +29,7 @@@ #include "compression.h" #include "extent_io.h" #include "extent_map.h" + #include "subpage.h" #include "zoned.h" static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" }; @@@ -173,17 -173,16 +174,17 @@@ static int check_compressed_csum(struc /* Hash through the page sector by sector */ for (pg_offset = 0; pg_offset < bytes_left; pg_offset += sectorsize) { - kaddr = page_address(page); + kaddr = kmap_atomic(page); crypto_shash_digest(shash, kaddr + pg_offset, sectorsize, csum); + kunmap_atomic(kaddr); if (memcmp(&csum, cb_sum, csum_size) != 0) { btrfs_print_data_csum_error(inode, disk_start, csum, cb_sum, cb->mirror_num); - if (btrfs_io_bio(bio)->device) + if (btrfs_bio(bio)->device) btrfs_dev_stat_inc_and_print( - btrfs_io_bio(bio)->device, + btrfs_bio(bio)->device, BTRFS_DEV_STAT_CORRUPTION_ERRS); return -EIO; } @@@ -194,6 -193,87 +195,87 @@@ return 0; } + /* + * Reduce bio and io accounting for a compressed_bio with its corresponding bio. + * + * Return true if there is no pending bio nor io. + * Return false otherwise. + */ + static bool dec_and_test_compressed_bio(struct compressed_bio *cb, struct bio *bio) + { + struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); + unsigned int bi_size = 0; + bool last_io = false; + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + /* + * At endio time, bi_iter.bi_size doesn't represent the real bio size. + * Thus here we have to iterate through all segments to grab correct + * bio size. + */ + bio_for_each_segment_all(bvec, bio, iter_all) + bi_size += bvec->bv_len; + + if (bio->bi_status) + cb->errors = 1; + + ASSERT(bi_size && bi_size <= cb->compressed_len); + last_io = refcount_sub_and_test(bi_size >> fs_info->sectorsize_bits, + &cb->pending_sectors); + /* + * Here we must wake up the possible error handler after all other + * operations on @cb finished, or we can race with + * finish_compressed_bio_*() which may free @cb. + */ + wake_up_var(cb); + + return last_io; + } + + static void finish_compressed_bio_read(struct compressed_bio *cb, struct bio *bio) + { + unsigned int index; + struct page *page; + + /* Release the compressed pages */ + for (index = 0; index < cb->nr_pages; index++) { + page = cb->compressed_pages[index]; + page->mapping = NULL; + put_page(page); + } + + /* Do io completion on the original bio */ + if (cb->errors) { + bio_io_error(cb->orig_bio); + } else { + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + ASSERT(bio); + ASSERT(!bio->bi_status); + /* + * We have verified the checksum already, set page checked so + * the end_io handlers know about it + */ + ASSERT(!bio_flagged(bio, BIO_CLONED)); + bio_for_each_segment_all(bvec, cb->orig_bio, iter_all) { + u64 bvec_start = page_offset(bvec->bv_page) + + bvec->bv_offset; + + btrfs_page_set_checked(btrfs_sb(cb->inode->i_sb), + bvec->bv_page, bvec_start, + bvec->bv_len); + } + + bio_endio(cb->orig_bio); + } + + /* Finally free the cb struct */ + kfree(cb->compressed_pages); + kfree(cb); + } + /* when we finish reading compressed pages from the disk, we * decompress them and then run the bio end_io routines on the * decompressed pages (in the inode address space). 
@@@ -208,25 -288,17 +290,17 @@@ static void end_compressed_bio_read(str { struct compressed_bio *cb = bio->bi_private; struct inode *inode; - struct page *page; - unsigned int index; - unsigned int mirror = btrfs_io_bio(bio)->mirror_num; + unsigned int mirror = btrfs_bio(bio)->mirror_num; int ret = 0; - if (bio->bi_status) - cb->errors = 1; - - /* if there are more bios still pending for this compressed - * extent, just exit - */ - if (!refcount_dec_and_test(&cb->pending_bios)) + if (!dec_and_test_compressed_bio(cb, bio)) goto out; /* * Record the correct mirror_num in cb->orig_bio so that * read-repair can work properly. */ - btrfs_io_bio(cb->orig_bio)->mirror_num = mirror; + btrfs_bio(cb->orig_bio)->mirror_num = mirror; cb->mirror_num = mirror; /* @@@ -250,36 -322,7 +324,7 @@@ csum_failed: if (ret) cb->errors = 1; - - /* release the compressed pages */ - index = 0; - for (index = 0; index < cb->nr_pages; index++) { - page = cb->compressed_pages[index]; - page->mapping = NULL; - put_page(page); - } - - /* do io completion on the original bio */ - if (cb->errors) { - bio_io_error(cb->orig_bio); - } else { - struct bio_vec *bvec; - struct bvec_iter_all iter_all; - - /* - * we have verified the checksum already, set page - * checked so the end_io handlers know about it - */ - ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, cb->orig_bio, iter_all) - SetPageChecked(bvec->bv_page); - - bio_endio(cb->orig_bio); - } - - /* finally free the cb struct */ - kfree(cb->compressed_pages); - kfree(cb); + finish_compressed_bio_read(cb, bio); out: bio_put(bio); } @@@ -291,6 -334,7 +336,7 @@@ static noinline void end_compressed_writeback(struct inode *inode, const struct compressed_bio *cb) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); unsigned long index = cb->start >> PAGE_SHIFT; unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT; struct page *pages[16]; @@@ -313,7 -357,8 +359,8 @@@ for (i = 0; i < ret; i++) { if (cb->errors) SetPageError(pages[i]); - end_page_writeback(pages[i]); + btrfs_page_clamp_clear_writeback(fs_info, pages[i], + cb->start, cb->len); put_page(pages[i]); } nr_pages -= ret; @@@ -322,60 -367,127 +369,127 @@@ /* the inode may be gone now */ } - /* - * do the cleanup once all the compressed pages hit the disk. - * This will clear writeback on the file pages and free the compressed - * pages. - * - * This also calls the writeback end hooks for the file pages so that - * metadata and checksums can be updated in the file. - */ - static void end_compressed_bio_write(struct bio *bio) + static void finish_compressed_bio_write(struct compressed_bio *cb) { - struct compressed_bio *cb = bio->bi_private; - struct inode *inode; - struct page *page; + struct inode *inode = cb->inode; unsigned int index; - if (bio->bi_status) - cb->errors = 1; - - /* if there are more bios still pending for this compressed - * extent, just exit - */ - if (!refcount_dec_and_test(&cb->pending_bios)) - goto out; - - /* ok, we're the last bio for this extent, step one is to - * call back into the FS and do all the end_io operations + /* + * Ok, we're the last bio for this extent, step one is to call back + * into the FS and do all the end_io operations. 
*/ - inode = cb->inode; - btrfs_record_physical_zoned(inode, cb->start, bio); btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL, cb->start, cb->start + cb->len - 1, !cb->errors); end_compressed_writeback(inode, cb); - /* note, our inode could be gone now */ + /* Note, our inode could be gone now */ /* - * release the compressed pages, these came from alloc_page and + * Release the compressed pages, these came from alloc_page and * are not attached to the inode at all */ - index = 0; for (index = 0; index < cb->nr_pages; index++) { - page = cb->compressed_pages[index]; + struct page *page = cb->compressed_pages[index]; + page->mapping = NULL; put_page(page); } - /* finally free the cb struct */ + /* Finally free the cb struct */ kfree(cb->compressed_pages); kfree(cb); + } + + /* + * Do the cleanup once all the compressed pages hit the disk. This will clear + * writeback on the file pages and free the compressed pages. + * + * This also calls the writeback end hooks for the file pages so that metadata + * and checksums can be updated in the file. + */ + static void end_compressed_bio_write(struct bio *bio) + { + struct compressed_bio *cb = bio->bi_private; + + if (!dec_and_test_compressed_bio(cb, bio)) + goto out; + + btrfs_record_physical_zoned(cb->inode, cb->start, bio); + + finish_compressed_bio_write(cb); out: bio_put(bio); } + static blk_status_t submit_compressed_bio(struct btrfs_fs_info *fs_info, + struct compressed_bio *cb, + struct bio *bio, int mirror_num) + { + blk_status_t ret; + + ASSERT(bio->bi_iter.bi_size); + ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); + if (ret) + return ret; + ret = btrfs_map_bio(fs_info, bio, mirror_num); + return ret; + } + + /* + * Allocate a compressed_bio, which will be used to read/write on-disk + * (aka, compressed) * data. + * + * @cb: The compressed_bio structure, which records all the needed + * information to bind the compressed data to the uncompressed + * page cache. + * @disk_byten: The logical bytenr where the compressed data will be read + * from or written to. + * @endio_func: The endio function to call after the IO for compressed data + * is finished. + * @next_stripe_start: Return value of logical bytenr of where next stripe starts. + * Let the caller know to only fill the bio up to the stripe + * boundary. + */ + + + static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_bytenr, + unsigned int opf, bio_end_io_t endio_func, + u64 *next_stripe_start) + { + struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); + struct btrfs_io_geometry geom; + struct extent_map *em; + struct bio *bio; + int ret; + + bio = btrfs_bio_alloc(BIO_MAX_VECS); + + bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; + bio->bi_opf = opf; + bio->bi_private = cb; + bio->bi_end_io = endio_func; + + em = btrfs_get_chunk_map(fs_info, disk_bytenr, fs_info->sectorsize); + if (IS_ERR(em)) { + bio_put(bio); + return ERR_CAST(em); + } + + if (bio_op(bio) == REQ_OP_ZONE_APPEND) + bio_set_dev(bio, em->map_lookup->stripes[0].dev->bdev); + + ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), disk_bytenr, &geom); + free_extent_map(em); + if (ret < 0) { + bio_put(bio); + return ERR_PTR(ret); + } + *next_stripe_start = disk_bytenr + geom.len; + + return bio; + } + /* * worker function to build and submit bios for previously compressed pages. 
* The corresponding pages in the inode should be marked for writeback @@@ -396,20 -508,19 +510,19 @@@ blk_status_t btrfs_submit_compressed_wr struct btrfs_fs_info *fs_info = inode->root->fs_info; struct bio *bio = NULL; struct compressed_bio *cb; - unsigned long bytes_left; - int pg_index = 0; - struct page *page; - u64 first_byte = disk_start; + u64 cur_disk_bytenr = disk_start; + u64 next_stripe_start; blk_status_t ret; int skip_sum = inode->flags & BTRFS_INODE_NODATASUM; const bool use_append = btrfs_use_zone_append(inode, disk_start); const unsigned int bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE; - WARN_ON(!PAGE_ALIGNED(start)); + ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && + IS_ALIGNED(len, fs_info->sectorsize)); cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); if (!cb) return BLK_STS_RESOURCE; - refcount_set(&cb->pending_bios, 0); + refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits); cb->errors = 0; cb->inode = &inode->vfs_inode; cb->start = start; @@@ -420,118 -531,100 +533,100 @@@ cb->orig_bio = NULL; cb->nr_pages = nr_pages; - bio = btrfs_bio_alloc(first_byte); - bio->bi_opf = bio_op | write_flags; - bio->bi_private = cb; - bio->bi_end_io = end_compressed_bio_write; - - if (use_append) { - struct btrfs_device *device; - - device = btrfs_zoned_get_device(fs_info, disk_start, PAGE_SIZE); - if (IS_ERR(device)) { - kfree(cb); - bio_put(bio); - return BLK_STS_NOTSUPP; + while (cur_disk_bytenr < disk_start + compressed_len) { + u64 offset = cur_disk_bytenr - disk_start; + unsigned int index = offset >> PAGE_SHIFT; + unsigned int real_size; + unsigned int added; + struct page *page = compressed_pages[index]; + bool submit = false; + + /* Allocate new bio if submitted or not yet allocated */ + if (!bio) { + bio = alloc_compressed_bio(cb, cur_disk_bytenr, + bio_op | write_flags, end_compressed_bio_write, + &next_stripe_start); + if (IS_ERR(bio)) { + ret = errno_to_blk_status(PTR_ERR(bio)); + bio = NULL; + goto finish_cb; + } } - - bio_set_dev(bio, device->bdev); - } - - if (blkcg_css) { - bio->bi_opf |= REQ_CGROUP_PUNT; - kthread_associate_blkcg(blkcg_css); - } - refcount_set(&cb->pending_bios, 1); - - /* create and submit bios for the compressed pages */ - bytes_left = compressed_len; - for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) { - int submit = 0; - int len = 0; - - page = compressed_pages[pg_index]; - page->mapping = inode->vfs_inode.i_mapping; - if (bio->bi_iter.bi_size) - submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, bio, - 0); - /* - * Page can only be added to bio if the current bio fits in - * stripe. + * We should never reach next_stripe_start start as we will + * submit comp_bio when reach the boundary immediately. */ - if (!submit) { - if (pg_index == 0 && use_append) - len = bio_add_zone_append_page(bio, page, - PAGE_SIZE, 0); - else - len = bio_add_page(bio, page, PAGE_SIZE, 0); - } - - page->mapping = NULL; - if (submit || len < PAGE_SIZE) { - /* - * inc the count before we submit the bio so - * we know the end IO handler won't happen before - * we inc the count. 
Otherwise, the cb might get - * freed before we're done setting it up - */ - refcount_inc(&cb->pending_bios); - ret = btrfs_bio_wq_end_io(fs_info, bio, - BTRFS_WQ_ENDIO_DATA); - BUG_ON(ret); /* -ENOMEM */ + ASSERT(cur_disk_bytenr != next_stripe_start); + /* + * We have various limits on the real read size: + * - stripe boundary + * - page boundary + * - compressed length boundary + */ + real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_bytenr); + real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset)); + real_size = min_t(u64, real_size, compressed_len - offset); + ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize)); + + if (use_append) + added = bio_add_zone_append_page(bio, page, real_size, + offset_in_page(offset)); + else + added = bio_add_page(bio, page, real_size, + offset_in_page(offset)); + /* Reached zoned boundary */ + if (added == 0) + submit = true; + + cur_disk_bytenr += added; + /* Reached stripe boundary */ + if (cur_disk_bytenr == next_stripe_start) + submit = true; + + /* Finished the range */ + if (cur_disk_bytenr == disk_start + compressed_len) + submit = true; + + if (submit) { if (!skip_sum) { ret = btrfs_csum_one_bio(inode, bio, start, 1); - BUG_ON(ret); /* -ENOMEM */ - } - - ret = btrfs_map_bio(fs_info, bio, 0); - if (ret) { - bio->bi_status = ret; - bio_endio(bio); + if (ret) + goto finish_cb; } - bio = btrfs_bio_alloc(first_byte); - bio->bi_opf = bio_op | write_flags; - bio->bi_private = cb; - bio->bi_end_io = end_compressed_bio_write; - if (blkcg_css) - bio->bi_opf |= REQ_CGROUP_PUNT; - /* - * Use bio_add_page() to ensure the bio has at least one - * page. - */ - bio_add_page(bio, page, PAGE_SIZE, 0); + ret = submit_compressed_bio(fs_info, cb, bio, 0); + if (ret) + goto finish_cb; + bio = NULL; } - if (bytes_left < PAGE_SIZE) { - btrfs_info(fs_info, - "bytes left %lu compress len %u nr %u", - bytes_left, cb->compressed_len, cb->nr_pages); - } - bytes_left -= PAGE_SIZE; - first_byte += PAGE_SIZE; cond_resched(); } + if (blkcg_css) + kthread_associate_blkcg(NULL); - ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); - BUG_ON(ret); /* -ENOMEM */ - - if (!skip_sum) { - ret = btrfs_csum_one_bio(inode, bio, start, 1); - BUG_ON(ret); /* -ENOMEM */ - } + return 0; - ret = btrfs_map_bio(fs_info, bio, 0); - if (ret) { + finish_cb: + if (bio) { bio->bi_status = ret; bio_endio(bio); } + /* Last byte of @cb is submitted, endio will free @cb */ + if (cur_disk_bytenr == disk_start + compressed_len) + return ret; - if (blkcg_css) - kthread_associate_blkcg(NULL); - - return 0; + wait_var_event(cb, refcount_read(&cb->pending_sectors) == + (disk_start + compressed_len - cur_disk_bytenr) >> + fs_info->sectorsize_bits); + /* + * Even with previous bio ended, we should still have io not yet + * submitted, thus need to finish manually. + */ + ASSERT(refcount_read(&cb->pending_sectors)); + /* Now we are the only one referring @cb, can finish it safely. */ + finish_compressed_bio_write(cb); + return ret; } static u64 bio_end_offset(struct bio *bio) @@@ -541,25 -634,33 +636,33 @@@ return page_offset(last->bv_page) + last->bv_len + last->bv_offset; } + /* + * Add extra pages in the same compressed file extent so that we don't need to + * re-read the same extent again and again. + * + * NOTE: this won't work well for subpage, as for subpage read, we lock the + * full page then submit bio for each compressed/regular extents. 
+ * + * This means, if we have several sectors in the same page points to the same + * on-disk compressed data, we will re-read the same extent many times and + * this function can only help for the next page. + */ static noinline int add_ra_bio_pages(struct inode *inode, u64 compressed_end, struct compressed_bio *cb) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); unsigned long end_index; - unsigned long pg_index; - u64 last_offset; + u64 cur = bio_end_offset(cb->orig_bio); u64 isize = i_size_read(inode); int ret; struct page *page; - unsigned long nr_pages = 0; struct extent_map *em; struct address_space *mapping = inode->i_mapping; struct extent_map_tree *em_tree; struct extent_io_tree *tree; - u64 end; - int misses = 0; + int sectors_missed = 0; - last_offset = bio_end_offset(cb->orig_bio); em_tree = &BTRFS_I(inode)->extent_tree; tree = &BTRFS_I(inode)->io_tree; @@@ -578,18 -679,29 +681,29 @@@ end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; - while (last_offset < compressed_end) { - pg_index = last_offset >> PAGE_SHIFT; + while (cur < compressed_end) { + u64 page_end; + u64 pg_index = cur >> PAGE_SHIFT; + u32 add_size; if (pg_index > end_index) break; page = xa_load(&mapping->i_pages, pg_index); if (page && !xa_is_value(page)) { - misses++; - if (misses > 4) + sectors_missed += (PAGE_SIZE - offset_in_page(cur)) >> + fs_info->sectorsize_bits; + + /* Beyond threshold, no need to continue */ + if (sectors_missed > 4) break; - goto next; + + /* + * Jump to next page start as we already have page for + * current offset. + */ + cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE; + continue; } page = __page_cache_alloc(mapping_gfp_constraint(mapping, @@@ -599,14 -711,11 +713,11 @@@ if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) { put_page(page); - goto next; + /* There is already a page, skip to page end */ + cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE; + continue; } - /* - * at this point, we have a locked page in the page cache - * for these bytes in the file. But, we have to make - * sure they map to this compressed extent on disk. - */ ret = set_page_extent_mapped(page); if (ret < 0) { unlock_page(page); @@@ -614,18 -723,22 +725,22 @@@ break; } - end = last_offset + PAGE_SIZE - 1; - lock_extent(tree, last_offset, end); + page_end = (pg_index << PAGE_SHIFT) + PAGE_SIZE - 1; + lock_extent(tree, cur, page_end); read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, last_offset, - PAGE_SIZE); + em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur); read_unlock(&em_tree->lock); - if (!em || last_offset < em->start || - (last_offset + PAGE_SIZE > extent_map_end(em)) || + /* + * At this point, we have a locked page in the page cache for + * these bytes in the file. But, we have to make sure they map + * to this compressed extent on disk. 
+ */ + if (!em || cur < em->start || + (cur + fs_info->sectorsize > extent_map_end(em)) || (em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) { free_extent_map(em); - unlock_extent(tree, last_offset, end); + unlock_extent(tree, cur, page_end); unlock_page(page); put_page(page); break; @@@ -643,20 -756,23 +758,23 @@@ } } - ret = bio_add_page(cb->orig_bio, page, - PAGE_SIZE, 0); - - if (ret == PAGE_SIZE) { - nr_pages++; - put_page(page); - } else { - unlock_extent(tree, last_offset, end); + add_size = min(em->start + em->len, page_end + 1) - cur; + ret = bio_add_page(cb->orig_bio, page, add_size, offset_in_page(cur)); + if (ret != add_size) { + unlock_extent(tree, cur, page_end); unlock_page(page); put_page(page); break; } - next: - last_offset += PAGE_SIZE; + /* + * If it's subpage, we also need to increase its + * subpage::readers number, as at endio we will decrease + * subpage::readers and to unlock the page. + */ + if (fs_info->sectorsize < PAGE_SIZE) + btrfs_subpage_start_reader(fs_info, page, cur, add_size); + put_page(page); + cur += add_size; } return 0; } @@@ -681,9 -797,10 +799,10 @@@ blk_status_t btrfs_submit_compressed_re unsigned int compressed_len; unsigned int nr_pages; unsigned int pg_index; - struct page *page; - struct bio *comp_bio; - u64 cur_disk_byte = bio->bi_iter.bi_sector << 9; + struct bio *comp_bio = NULL; + const u64 disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT; + u64 cur_disk_byte = disk_bytenr; + u64 next_stripe_start; u64 file_offset; u64 em_len; u64 em_start; @@@ -710,7 -827,7 +829,7 @@@ if (!cb) goto out; - refcount_set(&cb->pending_bios, 0); + refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits); cb->errors = 0; cb->inode = inode; cb->mirror_num = mirror_num; @@@ -750,86 -867,74 +869,74 @@@ /* include any pages we added in add_ra-bio_pages */ cb->len = bio->bi_iter.bi_size; - comp_bio = btrfs_bio_alloc(cur_disk_byte); - comp_bio->bi_opf = REQ_OP_READ; - comp_bio->bi_private = cb; - comp_bio->bi_end_io = end_compressed_bio_read; - refcount_set(&cb->pending_bios, 1); - - for (pg_index = 0; pg_index < nr_pages; pg_index++) { - u32 pg_len = PAGE_SIZE; - int submit = 0; + while (cur_disk_byte < disk_bytenr + compressed_len) { + u64 offset = cur_disk_byte - disk_bytenr; + unsigned int index = offset >> PAGE_SHIFT; + unsigned int real_size; + unsigned int added; + struct page *page = cb->compressed_pages[index]; + bool submit = false; + + /* Allocate new bio if submitted or not yet allocated */ + if (!comp_bio) { + comp_bio = alloc_compressed_bio(cb, cur_disk_byte, + REQ_OP_READ, end_compressed_bio_read, + &next_stripe_start); + if (IS_ERR(comp_bio)) { + ret = errno_to_blk_status(PTR_ERR(comp_bio)); + comp_bio = NULL; + goto finish_cb; + } + } + /* + * We should never reach next_stripe_start start as we will + * submit comp_bio when reach the boundary immediately. + */ + ASSERT(cur_disk_byte != next_stripe_start); + /* + * We have various limit on the real read size: + * - stripe boundary + * - page boundary + * - compressed length boundary + */ + real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_byte); + real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset)); + real_size = min_t(u64, real_size, compressed_len - offset); + ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize)); + added = bio_add_page(comp_bio, page, real_size, offset_in_page(offset)); /* - * To handle subpage case, we need to make sure the bio only - * covers the range we need. 
- * - * If we're at the last page, truncate the length to only cover - * the remaining part. + * Maximum compressed extent is smaller than bio size limit, + * thus bio_add_page() should always success. */ - if (pg_index == nr_pages - 1) - pg_len = min_t(u32, PAGE_SIZE, - compressed_len - pg_index * PAGE_SIZE); + ASSERT(added == real_size); + cur_disk_byte += added; - page = cb->compressed_pages[pg_index]; - page->mapping = inode->i_mapping; - page->index = em_start >> PAGE_SHIFT; + /* Reached stripe boundary, need to submit */ + if (cur_disk_byte == next_stripe_start) + submit = true; - if (comp_bio->bi_iter.bi_size) - submit = btrfs_bio_fits_in_stripe(page, pg_len, - comp_bio, 0); + /* Has finished the range, need to submit */ + if (cur_disk_byte == disk_bytenr + compressed_len) + submit = true; - page->mapping = NULL; - if (submit || bio_add_page(comp_bio, page, pg_len, 0) < pg_len) { + if (submit) { unsigned int nr_sectors; - ret = btrfs_bio_wq_end_io(fs_info, comp_bio, - BTRFS_WQ_ENDIO_DATA); - BUG_ON(ret); /* -ENOMEM */ - - /* - * inc the count before we submit the bio so - * we know the end IO handler won't happen before - * we inc the count. Otherwise, the cb might get - * freed before we're done setting it up - */ - refcount_inc(&cb->pending_bios); - ret = btrfs_lookup_bio_sums(inode, comp_bio, sums); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + goto finish_cb; nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size, fs_info->sectorsize); sums += fs_info->csum_size * nr_sectors; - ret = btrfs_map_bio(fs_info, comp_bio, mirror_num); - if (ret) { - comp_bio->bi_status = ret; - bio_endio(comp_bio); - } - - comp_bio = btrfs_bio_alloc(cur_disk_byte); - comp_bio->bi_opf = REQ_OP_READ; - comp_bio->bi_private = cb; - comp_bio->bi_end_io = end_compressed_bio_read; - - bio_add_page(comp_bio, page, pg_len, 0); + ret = submit_compressed_bio(fs_info, cb, comp_bio, mirror_num); + if (ret) + goto finish_cb; + comp_bio = NULL; } - cur_disk_byte += pg_len; } - - ret = btrfs_bio_wq_end_io(fs_info, comp_bio, BTRFS_WQ_ENDIO_DATA); - BUG_ON(ret); /* -ENOMEM */ - - ret = btrfs_lookup_bio_sums(inode, comp_bio, sums); - BUG_ON(ret); /* -ENOMEM */ - - ret = btrfs_map_bio(fs_info, comp_bio, mirror_num); - if (ret) { - comp_bio->bi_status = ret; - bio_endio(comp_bio); - } - return 0; fail2: @@@ -844,6 -949,26 +951,26 @@@ fail1 out: free_extent_map(em); return ret; + finish_cb: + if (comp_bio) { + comp_bio->bi_status = ret; + bio_endio(comp_bio); + } + /* All bytes of @cb is submitted, endio will free @cb */ + if (cur_disk_byte == disk_bytenr + compressed_len) + return ret; + + wait_var_event(cb, refcount_read(&cb->pending_sectors) == + (disk_bytenr + compressed_len - cur_disk_byte) >> + fs_info->sectorsize_bits); + /* + * Even with previous bio ended, we should still have io not yet + * submitted, thus need to finish @cb manually. + */ + ASSERT(refcount_read(&cb->pending_sectors)); + /* Now we are the only one referring @cb, can finish it safely. 
*/ + finish_compressed_bio_read(cb, NULL); + return ret; } /* diff --combined fs/btrfs/ctree.c index 66290b2,74c8e18..c3983bd --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@@ -7,7 -7,6 +7,7 @@@ #include #include #include +#include #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@@ -396,7 -395,7 +396,7 @@@ static noinline int __btrfs_cow_block(s if (*cow_ret == buf) unlock_orig = 1; - btrfs_assert_tree_locked(buf); + btrfs_assert_tree_write_locked(buf); WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && trans->transid != fs_info->running_transaction->transid); @@@ -2488,7 -2487,7 +2488,7 @@@ static void insert_ptr(struct btrfs_tra int ret; BUG_ON(!path->nodes[level]); - btrfs_assert_tree_locked(path->nodes[level]); + btrfs_assert_tree_write_locked(path->nodes[level]); lower = path->nodes[level]; nritems = btrfs_header_nritems(lower); BUG_ON(slot > nritems); @@@ -2828,7 -2827,7 +2828,7 @@@ static int push_leaf_right(struct btrfs if (slot >= btrfs_header_nritems(upper) - 1) return 1; - btrfs_assert_tree_locked(path->nodes[1]); + btrfs_assert_tree_write_locked(path->nodes[1]); right = btrfs_read_node_slot(upper, slot + 1); /* @@@ -3066,7 -3065,7 +3066,7 @@@ static int push_leaf_left(struct btrfs_ if (right_nritems == 0) return 1; - btrfs_assert_tree_locked(path->nodes[1]); + btrfs_assert_tree_write_locked(path->nodes[1]); left = btrfs_read_node_slot(path->nodes[1], slot - 1); /* @@@ -3582,40 -3581,6 +3582,6 @@@ int btrfs_split_item(struct btrfs_trans } /* - * This function duplicate a item, giving 'new_key' to the new item. - * It guarantees both items live in the same tree leaf and the new item - * is contiguous with the original item. - * - * This allows us to split file extent in place, keeping a lock on the - * leaf the entire time. - */ - int btrfs_duplicate_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - const struct btrfs_key *new_key) - { - struct extent_buffer *leaf; - int ret; - u32 item_size; - - leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); - ret = setup_leaf_for_split(trans, root, path, - item_size + sizeof(struct btrfs_item)); - if (ret) - return ret; - - path->slots[0]++; - setup_items_for_insert(root, path, new_key, &item_size, 1); - leaf = path->nodes[0]; - memcpy_extent_buffer(leaf, - btrfs_item_ptr_offset(leaf, path->slots[0]), - btrfs_item_ptr_offset(leaf, path->slots[0] - 1), - item_size); - return 0; - } - - /* * make the item pointed to by the path smaller. 
new_size indicates * how small to make it, and from_end tells us if we just chop bytes * off the end of the item or if we shift the item to chop bytes off @@@ -3786,13 -3751,10 +3752,10 @@@ void btrfs_extend_item(struct btrfs_pat * * @root: root we are inserting items to * @path: points to the leaf/slot where we are going to insert new items - * @cpu_key: array of keys for items to be inserted - * @data_size: size of the body of each item we are going to insert - * @nr: size of @cpu_key/@data_size arrays + * @batch: information about the batch of items to insert */ - void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, - const struct btrfs_key *cpu_key, u32 *data_size, - int nr) + static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, + const struct btrfs_item_batch *batch) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_item *item; @@@ -3804,14 -3766,14 +3767,14 @@@ int slot; struct btrfs_map_token token; u32 total_size; - u32 total_data = 0; - - for (i = 0; i < nr; i++) - total_data += data_size[i]; - total_size = total_data + (nr * sizeof(struct btrfs_item)); + /* + * Before anything else, update keys in the parent and other ancestors + * if needed, then release the write locks on them, so that other tasks + * can use them while we modify the leaf. + */ if (path->slots[0] == 0) { - btrfs_cpu_key_to_disk(&disk_key, cpu_key); + btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]); fixup_low_keys(path, &disk_key, 1); } btrfs_unlock_up_safe(path, 1); @@@ -3821,6 -3783,7 +3784,7 @@@ nritems = btrfs_header_nritems(leaf); data_end = leaf_data_end(leaf); + total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item)); if (btrfs_leaf_free_space(leaf) < total_size) { btrfs_print_leaf(leaf); @@@ -3850,31 -3813,32 +3814,32 @@@ item = btrfs_item_nr(i); ioff = btrfs_token_item_offset(&token, item); btrfs_set_token_item_offset(&token, item, - ioff - total_data); + ioff - batch->total_data_size); } /* shift the items */ - memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), + memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + batch->nr), btrfs_item_nr_offset(slot), (nritems - slot) * sizeof(struct btrfs_item)); /* shift the data */ memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + - data_end - total_data, BTRFS_LEAF_DATA_OFFSET + - data_end, old_data - data_end); + data_end - batch->total_data_size, + BTRFS_LEAF_DATA_OFFSET + data_end, + old_data - data_end); data_end = old_data; } /* setup the item for the new data */ - for (i = 0; i < nr; i++) { - btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); + for (i = 0; i < batch->nr; i++) { + btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]); btrfs_set_item_key(leaf, &disk_key, slot + i); item = btrfs_item_nr(slot + i); - data_end -= data_size[i]; + data_end -= batch->data_sizes[i]; btrfs_set_token_item_offset(&token, item, data_end); - btrfs_set_token_item_size(&token, item, data_size[i]); + btrfs_set_token_item_size(&token, item, batch->data_sizes[i]); } - btrfs_set_header_nritems(leaf, nritems + nr); + btrfs_set_header_nritems(leaf, nritems + batch->nr); btrfs_mark_buffer_dirty(leaf); if (btrfs_leaf_free_space(leaf) < 0) { @@@ -3884,26 -3848,43 +3849,43 @@@ } /* + * Insert a new item into a leaf. + * + * @root: The root of the btree. + * @path: A path pointing to the target leaf and slot. + * @key: The key of the new item. + * @data_size: The size of the data associated with the new key. 
+ */ + void btrfs_setup_item_for_insert(struct btrfs_root *root, + struct btrfs_path *path, + const struct btrfs_key *key, + u32 data_size) + { + struct btrfs_item_batch batch; + + batch.keys = key; + batch.data_sizes = &data_size; + batch.total_data_size = data_size; + batch.nr = 1; + + setup_items_for_insert(root, path, &batch); + } + + /* * Given a key and some data, insert items into the tree. * This does all the path init required, making room in the tree if needed. */ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - const struct btrfs_key *cpu_key, u32 *data_size, - int nr) + const struct btrfs_item_batch *batch) { int ret = 0; int slot; - int i; - u32 total_size = 0; - u32 total_data = 0; - - for (i = 0; i < nr; i++) - total_data += data_size[i]; + u32 total_size; - total_size = total_data + (nr * sizeof(struct btrfs_item)); - ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); + total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item)); + ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1); if (ret == 0) return -EEXIST; if (ret < 0) @@@ -3912,7 -3893,7 +3894,7 @@@ slot = path->slots[0]; BUG_ON(slot < 0); - setup_items_for_insert(root, path, cpu_key, data_size, nr); + setup_items_for_insert(root, path, batch); return 0; } @@@ -3944,6 -3925,40 +3926,40 @@@ int btrfs_insert_item(struct btrfs_tran } /* + * This function duplicates an item, giving 'new_key' to the new item. + * It guarantees both items live in the same tree leaf and the new item is + * contiguous with the original item. + * + * This allows us to split a file extent in place, keeping a lock on the leaf + * the entire time. + */ + int btrfs_duplicate_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + const struct btrfs_key *new_key) + { + struct extent_buffer *leaf; + int ret; + u32 item_size; + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + ret = setup_leaf_for_split(trans, root, path, + item_size + sizeof(struct btrfs_item)); + if (ret) + return ret; + + path->slots[0]++; + btrfs_setup_item_for_insert(root, path, new_key, item_size); + leaf = path->nodes[0]; + memcpy_extent_buffer(leaf, + btrfs_item_ptr_offset(leaf, path->slots[0]), + btrfs_item_ptr_offset(leaf, path->slots[0] - 1), + item_size); + return 0; + } + + /* * delete the pointer from a given node. * * the tree should have been previously balanced so the deletion does not diff --combined fs/btrfs/dev-replace.c index fbb8b44,59ef388..c85a7d4 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@@ -70,6 -70,7 +70,7 @@@ static int btrfs_dev_replace_kthread(vo int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) { + struct btrfs_dev_lookup_args args = { .devid = BTRFS_DEV_REPLACE_DEVID }; struct btrfs_key key; struct btrfs_root *dev_root = fs_info->dev_root; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; @@@ -100,8 -101,7 +101,7 @@@ no_valid_dev_replace_entry_found * We don't have a replace item or it's corrupted. If there is * a replace target, fail the mount. */ - if (btrfs_find_device(fs_info->fs_devices, - BTRFS_DEV_REPLACE_DEVID, NULL, NULL)) { + if (btrfs_find_device(fs_info->fs_devices, &args)) { btrfs_err(fs_info, "found replace target device without a valid replace item"); ret = -EUCLEAN; @@@ -163,8 -163,7 +163,7 @@@ * We don't have an active replace item but if there is a * replace target, fail the mount. 
*/ - if (btrfs_find_device(fs_info->fs_devices, - BTRFS_DEV_REPLACE_DEVID, NULL, NULL)) { + if (btrfs_find_device(fs_info->fs_devices, &args)) { btrfs_err(fs_info, "replace devid present without an active replace item"); ret = -EUCLEAN; @@@ -175,11 -174,10 +174,10 @@@ break; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: - dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices, - src_devid, NULL, NULL); - dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices, - BTRFS_DEV_REPLACE_DEVID, - NULL, NULL); + dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices, &args); + args.devid = src_devid; + dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices, &args); + /* * allow 'btrfs dev replace_cancel' if src/tgt device is * missing @@@ -283,7 -281,8 +281,7 @@@ static int btrfs_init_dev_replace_tgtde } - if (i_size_read(bdev->bd_inode) < - btrfs_device_get_total_bytes(srcdev)) { + if (bdev_nr_bytes(bdev) < btrfs_device_get_total_bytes(srcdev)) { btrfs_err(fs_info, "target device is smaller than source device!"); ret = -EINVAL; diff --combined fs/btrfs/disk-io.c index 29e7598,c725433..59c3be8 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@@ -683,7 -683,7 +683,7 @@@ err return ret; } - int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, + int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, struct page *page, u64 start, u64 end, int mirror) { @@@ -1036,7 -1036,7 +1036,7 @@@ static int btree_set_page_dirty(struct BUG_ON(!eb); BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); BUG_ON(!atomic_read(&eb->refs)); - btrfs_assert_tree_locked(eb); + btrfs_assert_tree_write_locked(eb); return __set_page_dirty_nobuffers(page); } ASSERT(PagePrivate(page) && page->private); @@@ -1061,7 -1061,7 +1061,7 @@@ ASSERT(eb); ASSERT(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); ASSERT(atomic_read(&eb->refs)); - btrfs_assert_tree_locked(eb); + btrfs_assert_tree_write_locked(eb); free_extent_buffer(eb); cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits); @@@ -1125,7 -1125,7 +1125,7 @@@ void btrfs_clean_tree_block(struct exte struct btrfs_fs_info *fs_info = buf->fs_info; if (btrfs_header_generation(buf) == fs_info->running_transaction->transid) { - btrfs_assert_tree_locked(buf); + btrfs_assert_tree_write_locked(buf); if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) { percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, @@@ -1500,7 -1500,7 +1500,7 @@@ static int btrfs_init_fs_root(struct bt goto fail; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID && - root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { + !btrfs_is_data_reloc_root(root)) { set_bit(BTRFS_ROOT_SHAREABLE, &root->state); btrfs_check_and_init_root_item(&root->root_item); } @@@ -1644,6 -1644,7 +1644,7 @@@ void btrfs_free_fs_info(struct btrfs_fs btrfs_extent_buffer_leak_debug_check(fs_info); kfree(fs_info->super_copy); kfree(fs_info->super_for_commit); + kfree(fs_info->subpage_info); kvfree(fs_info); } @@@ -1953,8 -1954,7 +1954,7 @@@ sleep wake_up_process(fs_info->cleaner_kthread); mutex_unlock(&fs_info->transaction_kthread_mutex); - if (unlikely(test_bit(BTRFS_FS_STATE_ERROR, - &fs_info->fs_state))) + if (BTRFS_FS_ERROR(fs_info)) btrfs_cleanup_transaction(fs_info); if (!kthread_should_stop() && (!btrfs_transaction_blocked(fs_info) || @@@ -2592,8 -2592,7 +2592,7 @@@ static int validate_super(struct btrfs_ /* * For 4K page size, we only support 4K sector size. 
- * For 64K page size, we support read-write for 64K sector size, and - * read-only for 4K sector size. + * For 64K page size, we support 64K and 4K sector sizes. */ if ((PAGE_SIZE == SZ_4K && sectorsize != PAGE_SIZE) || (PAGE_SIZE == SZ_64K && (sectorsize != SZ_4K && @@@ -2883,6 -2882,8 +2882,8 @@@ void btrfs_init_fs_info(struct btrfs_fs spin_lock_init(&fs_info->buffer_lock); spin_lock_init(&fs_info->unused_bgs_lock); spin_lock_init(&fs_info->treelog_bg_lock); + spin_lock_init(&fs_info->zone_active_bgs_lock); + spin_lock_init(&fs_info->relocation_bg_lock); rwlock_init(&fs_info->tree_mod_log_lock); mutex_init(&fs_info->unused_bg_unpin_mutex); mutex_init(&fs_info->reclaim_bgs_lock); @@@ -2896,6 -2897,7 +2897,7 @@@ INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); INIT_LIST_HEAD(&fs_info->unused_bgs); INIT_LIST_HEAD(&fs_info->reclaim_bgs); + INIT_LIST_HEAD(&fs_info->zone_active_bgs); #ifdef CONFIG_BTRFS_DEBUG INIT_LIST_HEAD(&fs_info->allocated_roots); INIT_LIST_HEAD(&fs_info->allocated_ebs); @@@ -3228,12 -3230,12 +3230,12 @@@ int __cold open_ctree(struct super_bloc mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); btrfs_init_btree_inode(fs_info); - invalidate_bdev(fs_devices->latest_bdev); + invalidate_bdev(fs_devices->latest_dev->bdev); /* * Read super block and check the signature bytes only */ - disk_super = btrfs_read_dev_super(fs_devices->latest_bdev); + disk_super = btrfs_read_dev_super(fs_devices->latest_dev->bdev); if (IS_ERR(disk_super)) { err = PTR_ERR(disk_super); goto fail_alloc; @@@ -3392,12 -3394,12 +3394,12 @@@ goto fail_alloc; } - if (sectorsize != PAGE_SIZE) { + if (sectorsize < PAGE_SIZE) { + struct btrfs_subpage_info *subpage_info; + btrfs_warn(fs_info, "read-write for sector size %u with page size %lu is experimental", sectorsize, PAGE_SIZE); - } - if (sectorsize != PAGE_SIZE) { if (btrfs_super_incompat_flags(fs_info->super_copy) & BTRFS_FEATURE_INCOMPAT_RAID56) { btrfs_err(fs_info, @@@ -3406,6 -3408,11 +3408,11 @@@ err = -EINVAL; goto fail_alloc; } + subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL); + if (!subpage_info) + goto fail_alloc; + btrfs_init_subpage_info(subpage_info, sectorsize); + fs_info->subpage_info = subpage_info; } ret = btrfs_init_workqueues(fs_info, fs_devices); @@@ -3465,7 -3472,7 +3472,7 @@@ * below in btrfs_init_dev_replace(). */ btrfs_free_extra_devids(fs_devices); - if (!fs_devices->latest_bdev) { + if (!fs_devices->latest_dev->bdev) { btrfs_err(fs_info, "failed to read devices"); goto fail_tree_roots; } @@@ -3556,7 -3563,8 +3563,8 @@@ goto fail_sysfs; } - if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info, NULL)) { + if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices && + !btrfs_check_rw_degradable(fs_info, NULL)) { btrfs_warn(fs_info, "writable mount is not allowed due to too many missing devices"); goto fail_sysfs; @@@ -3740,7 -3748,7 +3748,7 @@@ struct btrfs_super_block *btrfs_read_de else if (ret) return ERR_PTR(ret); - if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode)) + if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev)) return ERR_PTR(-EINVAL); page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS); @@@ -3881,7 -3889,9 +3889,9 @@@ static int write_dev_supers(struct btrf bio->bi_opf |= REQ_FUA; btrfsic_submit_bio(bio); - btrfs_advance_sb_log(device, i); + + if (btrfs_advance_sb_log(device, i)) + errors++; } return errors < i ? 
0 : -1; } @@@ -4221,7 -4231,7 +4231,7 @@@ void btrfs_drop_and_free_fs_root(struc drop_ref = true; spin_unlock(&fs_info->fs_roots_radix_lock); - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + if (BTRFS_FS_ERROR(fs_info)) { ASSERT(root->log_root == NULL); if (root->reloc_root) { btrfs_put_root(root->reloc_root); @@@ -4372,8 -4382,7 +4382,7 @@@ void __cold close_ctree(struct btrfs_fs btrfs_err(fs_info, "commit super ret %d", ret); } - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state) || - test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state)) + if (BTRFS_FS_ERROR(fs_info)) btrfs_error_commit_super(fs_info); kthread_stop(fs_info->transaction_kthread); @@@ -4470,7 -4479,7 +4479,7 @@@ void btrfs_mark_buffer_dirty(struct ext if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags))) return; #endif - btrfs_assert_tree_locked(buf); + btrfs_assert_tree_write_locked(buf); if (transid != fs_info->generation) WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n", buf->start, transid, fs_info->generation); diff --combined fs/btrfs/inode.c index 954b53a,5fec009..b8c911a --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@@ -6,7 -6,6 +6,7 @@@ #include #include #include +#include #include #include #include @@@ -288,9 -287,8 +288,9 @@@ static int insert_inline_extent(struct cur_size = min_t(unsigned long, compressed_size, PAGE_SIZE); - kaddr = page_address(cpage); + kaddr = kmap_atomic(cpage); write_extent_buffer(leaf, kaddr, ptr, cur_size); + kunmap_atomic(kaddr); i++; ptr += cur_size; @@@ -457,11 -455,10 +457,10 @@@ struct async_chunk struct list_head extents; struct cgroup_subsys_state *blkcg_css; struct btrfs_work work; - atomic_t *pending; + struct async_cow *async_cow; }; struct async_cow { - /* Number of chunks in flight; must be first in the structure */ atomic_t num_chunks; struct async_chunk chunks[]; }; @@@ -492,9 -489,6 +491,6 @@@ static noinline int add_async_extent(st */ static inline bool inode_can_compress(struct btrfs_inode *inode) { - /* Subpage doesn't support compression yet */ - if (inode->root->fs_info->sectorsize < PAGE_SIZE) - return false; if (inode->flags & BTRFS_INODE_NODATACOW || inode->flags & BTRFS_INODE_NODATASUM) return false; @@@ -516,6 -510,38 +512,38 @@@ static inline int inode_need_compress(s btrfs_ino(inode)); return 0; } + /* + * Special check for subpage. + * + * We lock the full page then run each delalloc range in the page, thus + * for the following case, we will hit some subpage specific corner case: + * + * 0 32K 64K + * | |///////| |///////| + * \- A \- B + * + * In above case, both range A and range B will try to unlock the full + * page [0, 64K), causing the one finished later will have page + * unlocked already, triggering various page lock requirement BUG_ON()s. + * + * So here we add an artificial limit that subpage compression can only + * if the range is fully page aligned. + * + * In theory we only need to ensure the first page is fully covered, but + * the tailing partial page will be locked until the full compression + * finishes, delaying the write of other range. + * + * TODO: Make btrfs_run_delalloc_range() to lock all delalloc range + * first to prevent any submitted async extent to unlock the full page. + * By this, we can ensure for subpage case that only the last async_cow + * will unlock the full page. 
+ */ + if (fs_info->sectorsize < PAGE_SIZE) { + if (!IS_ALIGNED(start, PAGE_SIZE) || + !IS_ALIGNED(end + 1, PAGE_SIZE)) + return 0; + } + /* force compress */ if (btrfs_test_opt(fs_info, FORCE_COMPRESS)) return 1; @@@ -617,13 -643,24 +645,24 @@@ again total_compressed = actual_end - start; /* - * skip compression for a small file range(<=blocksize) that + * Skip compression for a small file range(<=blocksize) that * isn't an inline extent, since it doesn't save disk space at all. */ if (total_compressed <= blocksize && (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) goto cleanup_and_bail_uncompressed; + /* + * For subpage case, we require full page alignment for the sector + * aligned range. + * Thus we must also check against @actual_end, not just @end. + */ + if (blocksize < PAGE_SIZE) { + if (!IS_ALIGNED(start, PAGE_SIZE) || + !IS_ALIGNED(round_up(actual_end, blocksize), PAGE_SIZE)) + goto cleanup_and_bail_uncompressed; + } + total_compressed = min_t(unsigned long, total_compressed, BTRFS_MAX_UNCOMPRESSED); total_in = 0; @@@ -761,7 -798,7 +800,7 @@@ cont * win, compare the page count read with the blocks on disk, * compression must free at least one sector size */ - total_in = ALIGN(total_in, PAGE_SIZE); + total_in = round_up(total_in, fs_info->sectorsize); if (total_compressed + blocksize <= total_in) { compressed_extents++; @@@ -842,166 -879,148 +881,148 @@@ static void free_async_extent_pages(str async_extent->pages = NULL; } - /* - * phase two of compressed writeback. This is the ordered portion - * of the code, which only gets called in the order the work was - * queued. We walk all the async extents created by compress_file_range - * and send them down to the disk. - */ - static noinline void submit_compressed_extents(struct async_chunk *async_chunk) + static int submit_uncompressed_range(struct btrfs_inode *inode, + struct async_extent *async_extent, + struct page *locked_page) { - struct btrfs_inode *inode = BTRFS_I(async_chunk->inode); - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct async_extent *async_extent; - u64 alloc_hint = 0; - struct btrfs_key ins; - struct extent_map *em; - struct btrfs_root *root = inode->root; - struct extent_io_tree *io_tree = &inode->io_tree; - int ret = 0; - - again: - while (!list_empty(&async_chunk->extents)) { - async_extent = list_entry(async_chunk->extents.next, - struct async_extent, list); - list_del(&async_extent->list); - - retry: - lock_extent(io_tree, async_extent->start, - async_extent->start + async_extent->ram_size - 1); - /* did the compression code fall back to uncompressed IO? */ - if (!async_extent->pages) { - int page_started = 0; - unsigned long nr_written = 0; + u64 start = async_extent->start; + u64 end = async_extent->start + async_extent->ram_size - 1; + unsigned long nr_written = 0; + int page_started = 0; + int ret; - /* allocate blocks */ - ret = cow_file_range(inode, async_chunk->locked_page, - async_extent->start, - async_extent->start + - async_extent->ram_size - 1, - &page_started, &nr_written, 0); + /* + * Call cow_file_range() to run the delalloc range directly, since we + * won't go to NOCOW or async path again. + * + * Also we call cow_file_range() with @unlock_page == 0, so that we + * can directly submit them without interruption. 
+ */ + ret = cow_file_range(inode, locked_page, start, end, &page_started, + &nr_written, 0); + /* Inline extent inserted, page gets unlocked and everything is done */ + if (page_started) { + ret = 0; + goto out; + } + if (ret < 0) { + if (locked_page) + unlock_page(locked_page); + goto out; + } - /* JDM XXX */ + ret = extent_write_locked_range(&inode->vfs_inode, start, end); + /* All pages will be unlocked, including @locked_page */ + out: + kfree(async_extent); + return ret; + } - /* - * if page_started, cow_file_range inserted an - * inline extent and took care of all the unlocking - * and IO for us. Otherwise, we need to submit - * all those pages down to the drive. - */ - if (!page_started && !ret) - extent_write_locked_range(&inode->vfs_inode, - async_extent->start, - async_extent->start + - async_extent->ram_size - 1, - WB_SYNC_ALL); - else if (ret && async_chunk->locked_page) - unlock_page(async_chunk->locked_page); - kfree(async_extent); - cond_resched(); - continue; - } + static int submit_one_async_extent(struct btrfs_inode *inode, + struct async_chunk *async_chunk, + struct async_extent *async_extent, + u64 *alloc_hint) + { + struct extent_io_tree *io_tree = &inode->io_tree; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_key ins; + struct page *locked_page = NULL; + struct extent_map *em; + int ret = 0; + u64 start = async_extent->start; + u64 end = async_extent->start + async_extent->ram_size - 1; - ret = btrfs_reserve_extent(root, async_extent->ram_size, - async_extent->compressed_size, - async_extent->compressed_size, - 0, alloc_hint, &ins, 1, 1); - if (ret) { - free_async_extent_pages(async_extent); + /* + * If async_chunk->locked_page is in the async_extent range, we need to + * handle it. + */ + if (async_chunk->locked_page) { + u64 locked_page_start = page_offset(async_chunk->locked_page); + u64 locked_page_end = locked_page_start + PAGE_SIZE - 1; - if (ret == -ENOSPC) { - unlock_extent(io_tree, async_extent->start, - async_extent->start + - async_extent->ram_size - 1); + if (!(start >= locked_page_end || end <= locked_page_start)) + locked_page = async_chunk->locked_page; + } + lock_extent(io_tree, start, end); - /* - * we need to redirty the pages if we decide to - * fallback to uncompressed IO, otherwise we - * will not submit these pages down to lower - * layers. - */ - extent_range_redirty_for_io(&inode->vfs_inode, - async_extent->start, - async_extent->start + - async_extent->ram_size - 1); + /* We have fall back to uncompressed write */ + if (!async_extent->pages) + return submit_uncompressed_range(inode, async_extent, locked_page); - goto retry; - } - goto out_free; - } + ret = btrfs_reserve_extent(root, async_extent->ram_size, + async_extent->compressed_size, + async_extent->compressed_size, + 0, *alloc_hint, &ins, 1, 1); + if (ret) { + free_async_extent_pages(async_extent); /* - * here we're doing allocation and writeback of the - * compressed pages + * Here we used to try again by going back to non-compressed + * path for ENOSPC. But we can't reserve space even for + * compressed size, how could it work for uncompressed size + * which requires larger size? So here we directly go error + * path. 
*/ - em = create_io_em(inode, async_extent->start, - async_extent->ram_size, /* len */ - async_extent->start, /* orig_start */ - ins.objectid, /* block_start */ - ins.offset, /* block_len */ - ins.offset, /* orig_block_len */ - async_extent->ram_size, /* ram_bytes */ - async_extent->compress_type, - BTRFS_ORDERED_COMPRESSED); - if (IS_ERR(em)) - /* ret value is not necessary due to void function */ - goto out_free_reserve; - free_extent_map(em); - - ret = btrfs_add_ordered_extent_compress(inode, - async_extent->start, - ins.objectid, - async_extent->ram_size, - ins.offset, - async_extent->compress_type); - if (ret) { - btrfs_drop_extent_cache(inode, async_extent->start, - async_extent->start + - async_extent->ram_size - 1, 0); - goto out_free_reserve; - } - btrfs_dec_block_group_reservations(fs_info, ins.objectid); + goto out_free; + } + + /* Here we're doing allocation and writeback of the compressed pages */ + em = create_io_em(inode, start, + async_extent->ram_size, /* len */ + start, /* orig_start */ + ins.objectid, /* block_start */ + ins.offset, /* block_len */ + ins.offset, /* orig_block_len */ + async_extent->ram_size, /* ram_bytes */ + async_extent->compress_type, + BTRFS_ORDERED_COMPRESSED); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out_free_reserve; + } + free_extent_map(em); - /* - * clear dirty, set writeback and unlock the pages. - */ - extent_clear_unlock_delalloc(inode, async_extent->start, - async_extent->start + - async_extent->ram_size - 1, - NULL, EXTENT_LOCKED | EXTENT_DELALLOC, - PAGE_UNLOCK | PAGE_START_WRITEBACK); - if (btrfs_submit_compressed_write(inode, async_extent->start, - async_extent->ram_size, - ins.objectid, - ins.offset, async_extent->pages, - async_extent->nr_pages, - async_chunk->write_flags, - async_chunk->blkcg_css)) { - struct page *p = async_extent->pages[0]; - const u64 start = async_extent->start; - const u64 end = start + async_extent->ram_size - 1; - - p->mapping = inode->vfs_inode.i_mapping; - btrfs_writepage_endio_finish_ordered(inode, p, start, - end, false); - - p->mapping = NULL; - extent_clear_unlock_delalloc(inode, start, end, NULL, 0, - PAGE_END_WRITEBACK | - PAGE_SET_ERROR); - free_async_extent_pages(async_extent); - } - alloc_hint = ins.objectid + ins.offset; - kfree(async_extent); - cond_resched(); + ret = btrfs_add_ordered_extent_compress(inode, start, /* file_offset */ + ins.objectid, /* disk_bytenr */ + async_extent->ram_size, /* num_bytes */ + ins.offset, /* disk_num_bytes */ + async_extent->compress_type); + if (ret) { + btrfs_drop_extent_cache(inode, start, end, 0); + goto out_free_reserve; } - return; + btrfs_dec_block_group_reservations(fs_info, ins.objectid); + + /* Clear dirty, set writeback and unlock the pages. 
*/ + extent_clear_unlock_delalloc(inode, start, end, + NULL, EXTENT_LOCKED | EXTENT_DELALLOC, + PAGE_UNLOCK | PAGE_START_WRITEBACK); + if (btrfs_submit_compressed_write(inode, start, /* file_offset */ + async_extent->ram_size, /* num_bytes */ + ins.objectid, /* disk_bytenr */ + ins.offset, /* compressed_len */ + async_extent->pages, /* compressed_pages */ + async_extent->nr_pages, + async_chunk->write_flags, + async_chunk->blkcg_css)) { + const u64 start = async_extent->start; + const u64 end = start + async_extent->ram_size - 1; + + btrfs_writepage_endio_finish_ordered(inode, NULL, start, end, 0); + + extent_clear_unlock_delalloc(inode, start, end, NULL, 0, + PAGE_END_WRITEBACK | PAGE_SET_ERROR); + free_async_extent_pages(async_extent); + } + *alloc_hint = ins.objectid + ins.offset; + kfree(async_extent); + return ret; + out_free_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); out_free: - extent_clear_unlock_delalloc(inode, async_extent->start, - async_extent->start + - async_extent->ram_size - 1, + extent_clear_unlock_delalloc(inode, start, end, NULL, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, @@@ -1009,7 -1028,39 +1030,39 @@@ PAGE_END_WRITEBACK | PAGE_SET_ERROR); free_async_extent_pages(async_extent); kfree(async_extent); - goto again; + return ret; + } + + /* + * Phase two of compressed writeback. This is the ordered portion of the code, + * which only gets called in the order the work was queued. We walk all the + * async extents created by compress_file_range and send them down to the disk. + */ + static noinline void submit_compressed_extents(struct async_chunk *async_chunk) + { + struct btrfs_inode *inode = BTRFS_I(async_chunk->inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct async_extent *async_extent; + u64 alloc_hint = 0; + int ret = 0; + + while (!list_empty(&async_chunk->extents)) { + u64 extent_start; + u64 ram_size; + + async_extent = list_entry(async_chunk->extents.next, + struct async_extent, list); + list_del(&async_extent->list); + extent_start = async_extent->start; + ram_size = async_extent->ram_size; + + ret = submit_one_async_extent(inode, async_chunk, async_extent, + &alloc_hint); + btrfs_debug(fs_info, + "async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d", + inode->root->root_key.objectid, + btrfs_ino(inode), extent_start, ram_size, ret); + } } static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, @@@ -1152,7 -1203,7 +1205,7 @@@ static noinline int cow_file_range(stru * fails during the stage where it updates the bytenr of file extent * items. 
*/ - if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + if (btrfs_is_data_reloc_root(root)) min_alloc_size = num_bytes; else min_alloc_size = fs_info->sectorsize; @@@ -1188,8 -1239,7 +1241,7 @@@ if (ret) goto out_drop_extent_cache; - if (root->root_key.objectid == - BTRFS_DATA_RELOC_TREE_OBJECTID) { + if (btrfs_is_data_reloc_root(root)) { ret = btrfs_reloc_clone_csums(inode, start, cur_alloc_size); /* @@@ -1327,18 -1377,17 +1379,17 @@@ static noinline void async_cow_submit(s static noinline void async_cow_free(struct btrfs_work *work) { struct async_chunk *async_chunk; + struct async_cow *async_cow; async_chunk = container_of(work, struct async_chunk, work); if (async_chunk->inode) btrfs_add_delayed_iput(async_chunk->inode); if (async_chunk->blkcg_css) css_put(async_chunk->blkcg_css); - /* - * Since the pointer to 'pending' is at the beginning of the array of - * async_chunk's, freeing it ensures the whole array has been freed. - */ - if (atomic_dec_and_test(async_chunk->pending)) - kvfree(async_chunk->pending); + + async_cow = async_chunk->async_cow; + if (atomic_dec_and_test(&async_cow->num_chunks)) + kvfree(async_cow); } static int cow_file_range_async(struct btrfs_inode *inode, @@@ -1399,7 -1448,7 +1450,7 @@@ * lightweight reference for the callback lifetime */ ihold(&inode->vfs_inode); - async_chunk[i].pending = &ctx->num_chunks; + async_chunk[i].async_cow = ctx; async_chunk[i].inode = &inode->vfs_inode; async_chunk[i].start = start; async_chunk[i].end = cur_end; @@@ -1472,7 -1521,7 +1523,7 @@@ static noinline int run_delalloc_zoned( __set_page_dirty_nobuffers(locked_page); account_page_redirty(locked_page); - extent_write_locked_range(&inode->vfs_inode, start, end, WB_SYNC_ALL); + extent_write_locked_range(&inode->vfs_inode, start, end); *page_started = 1; return 0; @@@ -1505,8 -1554,7 +1556,7 @@@ static int fallback_to_cow(struct btrfs int *page_started, unsigned long *nr_written) { const bool is_space_ino = btrfs_is_free_space_inode(inode); - const bool is_reloc_ino = (inode->root->root_key.objectid == - BTRFS_DATA_RELOC_TREE_OBJECTID); + const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root); const u64 range_bytes = end + 1 - start; struct extent_io_tree *io_tree = &inode->io_tree; u64 range_start = start; @@@ -1868,8 -1916,7 +1918,7 @@@ out_check btrfs_dec_nocow_writers(fs_info, disk_bytenr); nocow = false; - if (root->root_key.objectid == - BTRFS_DATA_RELOC_TREE_OBJECTID) + if (btrfs_is_data_reloc_root(root)) /* * Error handled later, as we must prevent * extent_clear_unlock_delalloc() in error handler @@@ -1948,8 -1995,23 +1997,23 @@@ int btrfs_run_delalloc_range(struct btr int ret; const bool zoned = btrfs_is_zoned(inode->root->fs_info); + /* + * The range must cover part of the @locked_page, or the returned + * @page_started can confuse the caller. + */ + ASSERT(!(end <= page_offset(locked_page) || + start >= page_offset(locked_page) + PAGE_SIZE)); + if (should_nocow(inode, start, end)) { - ASSERT(!zoned); + /* + * Normally on a zoned device we're only doing COW writes, but + * in case of relocation on a zoned filesystem we have taken + * precaution, that we're only writing sequentially. It's safe + * to use run_delalloc_nocow() here, like for regular + * preallocated inodes. 
+ */ + ASSERT(!zoned || + (zoned && btrfs_is_data_reloc_root(inode->root))); ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, nr_written); } else if (!inode_can_compress(inode) || @@@ -2208,7 -2270,7 +2272,7 @@@ void btrfs_clear_delalloc_extent(struc if (btrfs_is_testing(fs_info)) return; - if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID && + if (!btrfs_is_data_reloc_root(root) && do_list && !(state->state & EXTENT_NORESERVE) && (*bits & EXTENT_CLEAR_DATA_RESV)) btrfs_free_reserved_data_space_noquota(fs_info, len); @@@ -2236,48 -2298,6 +2300,6 @@@ } /* - * btrfs_bio_fits_in_stripe - Checks whether the size of the given bio will fit - * in a chunk's stripe. This function ensures that bios do not span a - * stripe/chunk - * - * @page - The page we are about to add to the bio - * @size - size we want to add to the bio - * @bio - bio we want to ensure is smaller than a stripe - * @bio_flags - flags of the bio - * - * return 1 if page cannot be added to the bio - * return 0 if page can be added to the bio - * return error otherwise - */ - int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, - unsigned long bio_flags) - { - struct inode *inode = page->mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - u64 logical = bio->bi_iter.bi_sector << 9; - u32 bio_len = bio->bi_iter.bi_size; - struct extent_map *em; - int ret = 0; - struct btrfs_io_geometry geom; - - if (bio_flags & EXTENT_BIO_COMPRESSED) - return 0; - - em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize); - if (IS_ERR(em)) - return PTR_ERR(em); - ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), logical, &geom); - if (ret < 0) - goto out; - - if (geom.len < bio_len + size) - ret = 1; - out: - free_extent_map(em); - return ret; - } - - /* * in order to insert checksums into the metadata in large chunks, * we wait until bio submission time. All the pages in the bio are * checksummed and sums are attached onto the ordered extent record. @@@ -2533,7 -2553,7 +2555,7 @@@ blk_status_t btrfs_submit_data_bio(stru goto mapit; } else if (async && !skip_sum) { /* csum items have already been cloned */ - if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + if (btrfs_is_data_reloc_root(root)) goto mapit; /* we're doing a write, do the async checksumming */ ret = btrfs_wq_submit_bio(inode, bio, mirror_num, bio_flags, @@@ -2766,7 -2786,7 +2788,7 @@@ out_page clear_page_dirty_for_io(page); SetPageError(page); } - ClearPageChecked(page); + btrfs_page_clear_checked(inode->root->fs_info, page, page_start, PAGE_SIZE); unlock_page(page); put_page(page); kfree(fixup); @@@ -2821,7 -2841,7 +2843,7 @@@ int btrfs_writepage_cow_fixup(struct pa * page->mapping outside of the page lock. */ ihold(inode); - SetPageChecked(page); + btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE); get_page(page); btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL); fixup->page = page; @@@ -3012,8 -3032,12 +3034,12 @@@ static int btrfs_finish_ordered_io(stru goto out; } - if (ordered_extent->bdev) + /* A valid bdev implies a write on a sequential zone */ + if (ordered_extent->bdev) { btrfs_rewrite_logical_zoned(ordered_extent); + btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, + ordered_extent->disk_num_bytes); + } btrfs_free_io_failure_record(inode, start, end); @@@ -3210,7 -3234,7 +3236,7 @@@ void btrfs_writepage_endio_finish_order * * The length of such check is always one sector size. 
*/ - static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio, + static int check_data_csum(struct inode *inode, struct btrfs_bio *bbio, u32 bio_offset, struct page *page, u32 pgoff, u64 start) { @@@ -3226,7 -3250,7 +3252,7 @@@ ASSERT(pgoff + len <= PAGE_SIZE); offset_sectors = bio_offset >> fs_info->sectorsize_bits; - csum_expected = ((u8 *)io_bio->csum) + offset_sectors * csum_size; + csum_expected = ((u8 *)bbio->csum) + offset_sectors * csum_size; kaddr = kmap_atomic(page); shash->tfm = fs_info->csum_shash; @@@ -3240,9 -3264,9 +3266,9 @@@ return 0; zeroit: btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected, - io_bio->mirror_num); - if (io_bio->device) - btrfs_dev_stat_inc_and_print(io_bio->device, + bbio->mirror_num); + if (bbio->device) + btrfs_dev_stat_inc_and_print(bbio->device, BTRFS_DEV_STAT_CORRUPTION_ERRS); memset(kaddr + pgoff, 1, len); flush_dcache_page(page); @@@ -3262,33 -3286,29 +3288,29 @@@ * Return a bitmap where bit set means a csum mismatch, and bit not set means * csum match. */ - unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, - struct page *page, u64 start, u64 end) + unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, + u32 bio_offset, struct page *page, + u64 start, u64 end) { struct inode *inode = page->mapping->host; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_root *root = BTRFS_I(inode)->root; const u32 sectorsize = root->fs_info->sectorsize; u32 pg_off; unsigned int result = 0; - if (PageChecked(page)) { - ClearPageChecked(page); + if (btrfs_page_test_checked(fs_info, page, start, end + 1 - start)) { + btrfs_page_clear_checked(fs_info, page, start, end + 1 - start); return 0; } /* - * For subpage case, above PageChecked is not safe as it's not subpage - * compatible. - * But for now only cow fixup and compressed read utilize PageChecked - * flag, while in this context we can easily use io_bio->csum to - * determine if we really need to do csum verification. - * - * So for now, just exit if io_bio->csum is NULL, as it means it's - * compressed read, and its compressed data csum has already been - * verified. + * This only happens for NODATASUM or compressed read. + * Normally this should be covered by above check for compressed read + * or the next check for NODATASUM. Just do a quicker exit here. 
*/ - if (io_bio->csum == NULL) + if (bbio->csum == NULL) return 0; if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) @@@ -3305,7 -3325,7 +3327,7 @@@ u64 file_offset = pg_off + page_offset(page); int ret; - if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && + if (btrfs_is_data_reloc_root(root) && test_range_bit(io_tree, file_offset, file_offset + sectorsize - 1, EXTENT_NODATASUM, 1, NULL)) { @@@ -3315,7 -3335,7 +3337,7 @@@ EXTENT_NODATASUM); continue; } - ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off, + ret = check_data_csum(inode, bbio, bio_offset, page, pg_off, page_offset(page) + pg_off); if (ret < 0) { const int nr_bit = (pg_off - offset_in_page(start)) >> @@@ -4006,7 -4026,7 +4028,7 @@@ noinline int btrfs_update_inode(struct * without delay */ if (!btrfs_is_free_space_inode(inode) - && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID + && !btrfs_is_data_reloc_root(root) && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) { btrfs_update_root_times(trans, root); @@@ -4036,11 -4056,11 +4058,11 @@@ int btrfs_update_inode_fallback(struct * also drops the back refs in the inode to the directory */ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *dir, struct btrfs_inode *inode, const char *name, int name_len) { + struct btrfs_root *root = dir->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; int ret = 0; @@@ -4100,19 -4120,9 +4122,9 @@@ skip_backref goto err; } - ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, - dir_ino); - if (ret != 0 && ret != -ENOENT) { - btrfs_abort_transaction(trans, ret); - goto err; - } - - ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, - index); - if (ret == -ENOENT) - ret = 0; - else if (ret) - btrfs_abort_transaction(trans, ret); + btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, + dir_ino); + btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, index); /* * If we have a pending delayed iput we could end up with the final iput @@@ -4140,15 -4150,14 +4152,14 @@@ out } int btrfs_unlink_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *dir, struct btrfs_inode *inode, const char *name, int name_len) { int ret; - ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len); + ret = __btrfs_unlink_inode(trans, dir, inode, name, name_len); if (!ret) { drop_nlink(&inode->vfs_inode); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, inode->root, inode); } return ret; } @@@ -4177,7 -4186,6 +4188,6 @@@ static struct btrfs_trans_handle *__unl static int btrfs_unlink(struct inode *dir, struct dentry *dentry) { - struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_trans_handle *trans; struct inode *inode = d_inode(dentry); int ret; @@@ -4189,7 -4197,7 +4199,7 @@@ btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), 0); - ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), + ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), dentry->d_name.name, dentry->d_name.len); if (ret) @@@ -4203,7 -4211,7 +4213,7 @@@ out: btrfs_end_transaction(trans); - btrfs_btree_balance_dirty(root->fs_info); + btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info); return ret; } @@@ -4370,7 -4378,7 +4380,7 @@@ static void btrfs_prune_dentries(struc struct inode *inode; u64 objectid = 0; - if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) + if (!BTRFS_FS_ERROR(fs_info)) 
WARN_ON(btrfs_root_refs(&root->root_item) != 0); spin_lock(&root->inode_lock); @@@ -4554,7 -4562,6 +4564,6 @@@ static int btrfs_rmdir(struct inode *di { struct inode *inode = d_inode(dentry); int err = 0; - struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_trans_handle *trans; u64 last_unlink_trans; @@@ -4579,7 -4586,7 +4588,7 @@@ last_unlink_trans = BTRFS_I(inode)->last_unlink_trans; /* now the directory is empty */ - err = btrfs_unlink_inode(trans, root, BTRFS_I(dir), + err = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), dentry->d_name.name, dentry->d_name.len); if (!err) { @@@ -4600,7 -4607,7 +4609,7 @@@ } out: btrfs_end_transaction(trans); - btrfs_btree_balance_dirty(root->fs_info); + btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info); return err; } @@@ -4909,9 -4916,9 +4918,9 @@@ delete btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, extent_start, extent_num_bytes, 0); - ref.real_root = root->root_key.objectid; btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), - ino, extent_offset); + ino, extent_offset, + root->root_key.objectid, false); ret = btrfs_free_extent(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@@ -5107,7 -5114,8 +5116,8 @@@ again len); flush_dcache_page(page); } - ClearPageChecked(page); + btrfs_page_clear_checked(fs_info, page, block_start, + block_end + 1 - block_start); btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start); unlock_extent_cached(io_tree, block_start, block_end, &cached_state); @@@ -6437,7 -6445,7 +6447,7 @@@ static struct inode *btrfs_new_inode(st struct btrfs_inode_ref *ref; struct btrfs_key key[2]; u32 sizes[2]; - int nitems = name ? 2 : 1; + struct btrfs_item_batch batch; unsigned long ptr; unsigned int nofs_flag; int ret; @@@ -6529,7 -6537,11 +6539,11 @@@ goto fail; } - ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems); + batch.keys = &key[0]; + batch.data_sizes = &sizes[0]; + batch.total_data_size = sizes[0] + (name ? sizes[1] : 0); + batch.nr = name ? 
2 : 1; + ret = btrfs_insert_empty_items(trans, root, path, &batch); if (ret != 0) goto fail_unlock; @@@ -7963,7 -7975,7 +7977,7 @@@ static int btrfs_dio_iomap_begin(struc iomap->type = IOMAP_MAPPED; } iomap->offset = start; - iomap->bdev = fs_info->fs_devices->latest_bdev; + iomap->bdev = fs_info->fs_devices->latest_dev->bdev; iomap->length = len; if (write && btrfs_use_zone_append(BTRFS_I(inode), em->block_start)) @@@ -8040,13 -8052,13 +8054,13 @@@ static void btrfs_dio_private_put(struc if (btrfs_op(dip->dio_bio) == BTRFS_MAP_WRITE) { __endio_write_update_ordered(BTRFS_I(dip->inode), - dip->logical_offset, + dip->file_offset, dip->bytes, !dip->dio_bio->bi_status); } else { unlock_extent(&BTRFS_I(dip->inode)->io_tree, - dip->logical_offset, - dip->logical_offset + dip->bytes - 1); + dip->file_offset, + dip->file_offset + dip->bytes - 1); } bio_endio(dip->dio_bio); @@@ -8074,10 -8086,11 +8088,11 @@@ static blk_status_t submit_dio_repair_b return ret; } - static blk_status_t btrfs_check_read_dio_bio(struct inode *inode, - struct btrfs_io_bio *io_bio, + static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, + struct btrfs_bio *bbio, const bool uptodate) { + struct inode *inode = dip->inode; struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; const u32 sectorsize = fs_info->sectorsize; struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; @@@ -8085,11 -8098,12 +8100,12 @@@ const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); struct bio_vec bvec; struct bvec_iter iter; - u64 start = io_bio->logical; + const u64 orig_file_offset = dip->file_offset; + u64 start = orig_file_offset; u32 bio_offset = 0; blk_status_t err = BLK_STS_OK; - __bio_for_each_segment(bvec, &io_bio->bio, iter, io_bio->iter) { + __bio_for_each_segment(bvec, &bbio->bio, iter, bbio->iter) { unsigned int i, nr_sectors, pgoff; nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len); @@@ -8097,7 -8111,7 +8113,7 @@@ for (i = 0; i < nr_sectors; i++) { ASSERT(pgoff < PAGE_SIZE); if (uptodate && - (!csum || !check_data_csum(inode, io_bio, + (!csum || !check_data_csum(inode, bbio, bio_offset, bvec.bv_page, pgoff, start))) { clean_io_failure(fs_info, failure_tree, io_tree, @@@ -8107,12 -8121,12 +8123,12 @@@ } else { int ret; - ASSERT((start - io_bio->logical) < UINT_MAX); + ASSERT((start - orig_file_offset) < UINT_MAX); ret = btrfs_repair_one_sector(inode, - &io_bio->bio, - start - io_bio->logical, + &bbio->bio, + start - orig_file_offset, bvec.bv_page, pgoff, - start, io_bio->mirror_num, + start, bbio->mirror_num, submit_dio_repair_bio); if (ret) err = errno_to_blk_status(ret); @@@ -8153,15 -8167,13 +8169,13 @@@ static void btrfs_end_dio_bio(struct bi bio->bi_opf, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, err); - if (bio_op(bio) == REQ_OP_READ) { - err = btrfs_check_read_dio_bio(dip->inode, btrfs_io_bio(bio), - !err); - } + if (bio_op(bio) == REQ_OP_READ) + err = btrfs_check_read_dio_bio(dip, btrfs_bio(bio), !err); if (err) dip->dio_bio->bi_status = err; - btrfs_record_physical_zoned(dip->inode, dip->logical_offset, bio); + btrfs_record_physical_zoned(dip->inode, dip->file_offset, bio); bio_put(bio); btrfs_dio_private_put(dip); @@@ -8203,10 -8215,10 +8217,10 @@@ static inline blk_status_t btrfs_submit } else { u64 csum_offset; - csum_offset = file_offset - dip->logical_offset; + csum_offset = file_offset - dip->file_offset; csum_offset >>= fs_info->sectorsize_bits; csum_offset *= fs_info->csum_size; - btrfs_io_bio(bio)->csum = dip->csums + csum_offset; + 
btrfs_bio(bio)->csum = dip->csums + csum_offset; } map: ret = btrfs_map_bio(fs_info, bio, 0); @@@ -8241,7 -8253,7 +8255,7 @@@ static struct btrfs_dio_private *btrfs_ return NULL; dip->inode = inode; - dip->logical_offset = file_offset; + dip->file_offset = file_offset; dip->bytes = dio_bio->bi_iter.bi_size; dip->disk_bytenr = dio_bio->bi_iter.bi_sector << 9; dip->dio_bio = dio_bio; @@@ -8249,7 -8261,7 +8263,7 @@@ return dip; } -static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter, +static void btrfs_submit_direct(const struct iomap_iter *iter, struct bio *dio_bio, loff_t file_offset) { struct inode *inode = iter->inode; @@@ -8279,7 -8291,7 +8293,7 @@@ } dio_bio->bi_status = BLK_STS_RESOURCE; bio_endio(dio_bio); - return BLK_QC_T_NONE; + return; } if (!write) { @@@ -8322,7 -8334,6 +8336,6 @@@ bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len); bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; - btrfs_io_bio(bio)->logical = file_offset; if (bio_op(bio) == REQ_OP_ZONE_APPEND) { status = extract_ordered_extent(BTRFS_I(inode), bio, @@@ -8373,13 -8384,15 +8386,13 @@@ free_extent_map(em); } while (submit_len > 0); - return BLK_QC_T_NONE; + return; out_err_em: free_extent_map(em); out_err: dip->dio_bio->bi_status = status; btrfs_dio_private_put(dip); - - return BLK_QC_T_NONE; } const struct iomap_ops btrfs_dio_iomap_ops = { @@@ -8696,9 -8709,9 +8709,9 @@@ next * did something wrong. */ ASSERT(!PageOrdered(page)); + btrfs_page_clear_checked(fs_info, page, page_offset(page), PAGE_SIZE); if (!inode_evicting) __btrfs_releasepage(page, GFP_NOFS); - ClearPageChecked(page); clear_page_extent_mapped(page); } @@@ -8842,7 -8855,7 +8855,7 @@@ again memzero_page(page, zero_start, PAGE_SIZE - zero_start); flush_dcache_page(page); } - ClearPageChecked(page); + btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE); btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start); btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start); @@@ -9152,8 -9165,10 +9165,10 @@@ void btrfs_destroy_inode(struct inode * WARN_ON(inode->block_rsv.reserved); WARN_ON(inode->block_rsv.size); WARN_ON(inode->outstanding_extents); - WARN_ON(inode->delalloc_bytes); - WARN_ON(inode->new_delalloc_bytes); + if (!S_ISDIR(vfs_inode->i_mode)) { + WARN_ON(inode->delalloc_bytes); + WARN_ON(inode->new_delalloc_bytes); + } WARN_ON(inode->csum_bytes); WARN_ON(inode->defrag_bytes); @@@ -9450,7 -9465,7 +9465,7 @@@ static int btrfs_rename_exchange(struc if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); } else { /* src is an inode */ - ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir), + ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(old_dentry->d_inode), old_dentry->d_name.name, old_dentry->d_name.len); @@@ -9466,7 -9481,7 +9481,7 @@@ if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, new_dir, new_dentry); } else { /* dest is an inode */ - ret = __btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir), + ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(new_dentry->d_inode), new_dentry->d_name.name, new_dentry->d_name.len); @@@ -9741,7 -9756,7 +9756,7 @@@ static int btrfs_rename(struct user_nam */ btrfs_pin_log_trans(root); log_pinned = true; - ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir), + ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(d_inode(old_dentry)), old_dentry->d_name.name, old_dentry->d_name.len); @@@ -9761,7 -9776,7 +9776,7 @@@ ret = 
btrfs_unlink_subvol(trans, new_dir, new_dentry); BUG_ON(new_inode->i_nlink == 0); } else { - ret = btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir), + ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(d_inode(new_dentry)), new_dentry->d_name.name, new_dentry->d_name.len); @@@ -9979,7 -9994,7 +9994,7 @@@ int btrfs_start_delalloc_snapshot(struc }; struct btrfs_fs_info *fs_info = root->fs_info; - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) + if (BTRFS_FS_ERROR(fs_info)) return -EROFS; return start_delalloc_inodes(root, &wbc, true, in_reclaim_context); @@@ -9998,7 -10013,7 +10013,7 @@@ int btrfs_start_delalloc_roots(struct b struct list_head splice; int ret; - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) + if (BTRFS_FS_ERROR(fs_info)) return -EROFS; INIT_LIST_HEAD(&splice); diff --combined fs/btrfs/ioctl.c index 36ff713,92424a2..02ff085 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@@ -48,6 -48,7 +48,7 @@@ #include "space-info.h" #include "delalloc-space.h" #include "block-group.h" + #include "subpage.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI @@@ -81,7 -82,8 +82,8 @@@ struct btrfs_ioctl_send_args_32 compat_uptr_t clone_sources; /* in */ __u64 parent_root; /* in */ __u64 flags; /* in */ - __u64 reserved[4]; /* in */ + __u32 version; /* in */ + __u8 reserved[28]; /* in */ } __attribute__ ((__packed__)); #define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \ @@@ -985,129 -987,32 +987,32 @@@ out return ret; } - /* - * When we're defragging a range, we don't want to kick it off again - * if it is really just waiting for delalloc to send it down. - * If we find a nice big extent or delalloc range for the bytes in the - * file you want to defrag, we return 0 to let you know to skip this - * part of the file - */ - static int check_defrag_in_cache(struct inode *inode, u64 offset, u32 thresh) - { - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct extent_map *em = NULL; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; - u64 end; - - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, offset, PAGE_SIZE); - read_unlock(&em_tree->lock); - - if (em) { - end = extent_map_end(em); - free_extent_map(em); - if (end - offset > thresh) - return 0; - } - /* if we already have a nice delalloc here, just stop */ - thresh /= 2; - end = count_range_bits(io_tree, &offset, offset + thresh, - thresh, EXTENT_DELALLOC, 1); - if (end >= thresh) - return 0; - return 1; - } - - /* - * helper function to walk through a file and find extents - * newer than a specific transid, and smaller than thresh. 
- * - * This is used by the defragging code to find new and small - * extents - */ - static int find_new_extents(struct btrfs_root *root, - struct inode *inode, u64 newer_than, - u64 *off, u32 thresh) - { - struct btrfs_path *path; - struct btrfs_key min_key; - struct extent_buffer *leaf; - struct btrfs_file_extent_item *extent; - int type; - int ret; - u64 ino = btrfs_ino(BTRFS_I(inode)); - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - min_key.objectid = ino; - min_key.type = BTRFS_EXTENT_DATA_KEY; - min_key.offset = *off; - - while (1) { - ret = btrfs_search_forward(root, &min_key, path, newer_than); - if (ret != 0) - goto none; - process_slot: - if (min_key.objectid != ino) - goto none; - if (min_key.type != BTRFS_EXTENT_DATA_KEY) - goto none; - - leaf = path->nodes[0]; - extent = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - - type = btrfs_file_extent_type(leaf, extent); - if (type == BTRFS_FILE_EXTENT_REG && - btrfs_file_extent_num_bytes(leaf, extent) < thresh && - check_defrag_in_cache(inode, min_key.offset, thresh)) { - *off = min_key.offset; - btrfs_free_path(path); - return 0; - } - - path->slots[0]++; - if (path->slots[0] < btrfs_header_nritems(leaf)) { - btrfs_item_key_to_cpu(leaf, &min_key, path->slots[0]); - goto process_slot; - } - - if (min_key.offset == (u64)-1) - goto none; - - min_key.offset++; - btrfs_release_path(path); - } - none: - btrfs_free_path(path); - return -ENOENT; - } - - static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start) + static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, + bool locked) { struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_map *em; - u64 len = PAGE_SIZE; + const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize; /* * hopefully we have this extent in the tree already, try without * the full extent lock */ read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, len); + em = lookup_extent_mapping(em_tree, start, sectorsize); read_unlock(&em_tree->lock); if (!em) { struct extent_state *cached = NULL; - u64 end = start + len - 1; + u64 end = start + sectorsize - 1; /* get the big lock and read metadata off disk */ - lock_extent_bits(io_tree, start, end, &cached); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); - unlock_extent_cached(io_tree, start, end, &cached); + if (!locked) + lock_extent_bits(io_tree, start, end, &cached); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, sectorsize); + if (!locked) + unlock_extent_cached(io_tree, start, end, &cached); if (IS_ERR(em)) return NULL; @@@ -1116,7 -1021,8 +1021,8 @@@ return em; } - static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em) + static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, + bool locked) { struct extent_map *next; bool ret = true; @@@ -1125,7 -1031,7 +1031,7 @@@ if (em->start + em->len >= i_size_read(inode)) return false; - next = defrag_lookup_extent(inode, em->start + em->len); + next = defrag_lookup_extent(inode, em->start + em->len, locked); if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) ret = false; else if ((em->block_start + em->block_len == next->block_start) && @@@ -1136,297 -1042,435 +1042,435 @@@ return ret; } - static int should_defrag_range(struct inode *inode, u64 start, u32 thresh, - u64 *last_len, u64 *skip, u64 *defrag_end, - int compress) + /* + * Prepare one page to be 
defragged. + * + * This will ensure: + * + * - Returned page is locked and has been set up properly. + * - No ordered extent exists in the page. + * - The page is uptodate. + * + * NOTE: Caller should also wait for page writeback after the cluster is + * prepared, here we don't do writeback wait for each page. + */ + static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, + pgoff_t index) { - struct extent_map *em; - int ret = 1; - bool next_mergeable = true; - bool prev_mergeable = true; + struct address_space *mapping = inode->vfs_inode.i_mapping; + gfp_t mask = btrfs_alloc_write_mask(mapping); + u64 page_start = (u64)index << PAGE_SHIFT; + u64 page_end = page_start + PAGE_SIZE - 1; + struct extent_state *cached_state = NULL; + struct page *page; + int ret; + + again: + page = find_or_create_page(mapping, index, mask); + if (!page) + return ERR_PTR(-ENOMEM); /* - * make sure that once we start defragging an extent, we keep on - * defragging it + * Since we can defragment files opened read-only, we can encounter + * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We + * can't do I/O using huge pages yet, so return an error for now. + * Filesystem transparent huge pages are typically only used for + * executables that explicitly enable them, so this isn't very + * restrictive. */ - if (start < *defrag_end) - return 1; + if (PageCompound(page)) { + unlock_page(page); + put_page(page); + return ERR_PTR(-ETXTBSY); + } - *skip = 0; + ret = set_page_extent_mapped(page); + if (ret < 0) { + unlock_page(page); + put_page(page); + return ERR_PTR(ret); + } - em = defrag_lookup_extent(inode, start); - if (!em) - return 0; + /* Wait for any existing ordered extent in the range */ + while (1) { + struct btrfs_ordered_extent *ordered; - /* this will cover holes, and inline extents */ - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - ret = 0; - goto out; - } + lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state); + ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); + unlock_extent_cached(&inode->io_tree, page_start, page_end, + &cached_state); + if (!ordered) + break; - if (!*defrag_end) - prev_mergeable = false; + unlock_page(page); + btrfs_start_ordered_extent(ordered, 1); + btrfs_put_ordered_extent(ordered); + lock_page(page); + /* + * We unlocked the page above, so we need check if it was + * released or not. + */ + if (page->mapping != mapping || !PagePrivate(page)) { + unlock_page(page); + put_page(page); + goto again; + } + } - next_mergeable = defrag_check_next_extent(inode, em); - /* - * we hit a real extent, if it is big or the next extent is not a - * real extent, don't bother defragging it - */ - if (!compress && (*last_len == 0 || *last_len >= thresh) && - (em->len >= thresh || (!next_mergeable && !prev_mergeable))) - ret = 0; - out: /* - * last_len ends up being a counter of how many bytes we've defragged. - * every time we choose not to defrag an extent, we reset *last_len - * so that the next tiny extent will force a defrag. - * - * The end result of this is that tiny extents before a single big - * extent will force at least part of that big extent to be defragged. + * Now the page range has no ordered extent any more. Read the page to + * make it uptodate. 
*/ - if (ret) { - *defrag_end = extent_map_end(em); - } else { - *last_len = 0; - *skip = extent_map_end(em); - *defrag_end = 0; + if (!PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); + if (page->mapping != mapping || !PagePrivate(page)) { + unlock_page(page); + put_page(page); + goto again; + } + if (!PageUptodate(page)) { + unlock_page(page); + put_page(page); + return ERR_PTR(-EIO); + } } - - free_extent_map(em); - return ret; + return page; } + struct defrag_target_range { + struct list_head list; + u64 start; + u64 len; + }; + /* - * it doesn't do much good to defrag one or two pages - * at a time. This pulls in a nice chunk of pages - * to COW and defrag. - * - * It also makes sure the delalloc code has enough - * dirty data to avoid making new small extents as part - * of the defrag + * Collect all valid target extents. * - * It's a good idea to start RA on this range - * before calling this. + * @start: file offset to lookup + * @len: length to lookup + * @extent_thresh: file extent size threshold, any extent size >= this value + * will be ignored + * @newer_than: only defrag extents newer than this value + * @do_compress: whether the defrag is doing compression + * if true, @extent_thresh will be ignored and all regular + * file extents meeting @newer_than will be targets. + * @locked: if the range has already held extent lock + * @target_list: list of targets file extents */ - static int cluster_pages_for_defrag(struct inode *inode, - struct page **pages, - unsigned long start_index, - unsigned long num_pages) + static int defrag_collect_targets(struct btrfs_inode *inode, + u64 start, u64 len, u32 extent_thresh, + u64 newer_than, bool do_compress, + bool locked, struct list_head *target_list) { - unsigned long file_end; - u64 isize = i_size_read(inode); - u64 page_start; - u64 page_end; - u64 page_cnt; - u64 start = (u64)start_index << PAGE_SHIFT; - u64 search_start; - int ret; - int i; - int i_done; - struct btrfs_ordered_extent *ordered; - struct extent_state *cached_state = NULL; - struct extent_io_tree *tree; - struct extent_changeset *data_reserved = NULL; - gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); + u64 cur = start; + int ret = 0; - file_end = (isize - 1) >> PAGE_SHIFT; - if (!isize || start_index > file_end) - return 0; + while (cur < start + len) { + struct extent_map *em; + struct defrag_target_range *new; + bool next_mergeable = true; + u64 range_len; - page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1); + em = defrag_lookup_extent(&inode->vfs_inode, cur, locked); + if (!em) + break; - ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, - start, page_cnt << PAGE_SHIFT); - if (ret) - return ret; - i_done = 0; - tree = &BTRFS_I(inode)->io_tree; + /* Skip hole/inline/preallocated extents */ + if (em->block_start >= EXTENT_MAP_LAST_BYTE || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + goto next; - /* step one, lock all the pages */ - for (i = 0; i < page_cnt; i++) { - struct page *page; - again: - page = find_or_create_page(inode->i_mapping, - start_index + i, mask); - if (!page) - break; + /* Skip older extent */ + if (em->generation < newer_than) + goto next; - ret = set_page_extent_mapped(page); - if (ret < 0) { - unlock_page(page); - put_page(page); - break; + /* + * For do_compress case, we want to compress all valid file + * extents, thus no @extent_thresh or mergeable check. 
+ */ + if (do_compress) + goto add; + + /* Skip too large extent */ + if (em->len >= extent_thresh) + goto next; + + next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em, + locked); + if (!next_mergeable) { + struct defrag_target_range *last; + + /* Empty target list, no way to merge with last entry */ + if (list_empty(target_list)) + goto next; + last = list_entry(target_list->prev, + struct defrag_target_range, list); + /* Not mergeable with last entry */ + if (last->start + last->len != cur) + goto next; + + /* Mergeable, fall through to add it to @target_list. */ } - page_start = page_offset(page); - page_end = page_start + PAGE_SIZE - 1; - while (1) { - lock_extent_bits(tree, page_start, page_end, - &cached_state); - ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), - page_start); - unlock_extent_cached(tree, page_start, page_end, - &cached_state); - if (!ordered) - break; - - unlock_page(page); - btrfs_start_ordered_extent(ordered, 1); - btrfs_put_ordered_extent(ordered); - lock_page(page); - /* - * we unlocked the page above, so we need check if - * it was released or not. - */ - if (page->mapping != inode->i_mapping) { - unlock_page(page); - put_page(page); - goto again; + add: + range_len = min(extent_map_end(em), start + len) - cur; + /* + * This one is a good target, check if it can be merged into + * last range of the target list. + */ + if (!list_empty(target_list)) { + struct defrag_target_range *last; + + last = list_entry(target_list->prev, + struct defrag_target_range, list); + ASSERT(last->start + last->len <= cur); + if (last->start + last->len == cur) { + /* Mergeable, enlarge the last entry */ + last->len += range_len; + goto next; } + /* Fall through to allocate a new entry */ } - if (!PageUptodate(page)) { - btrfs_readpage(NULL, page); - lock_page(page); - if (!PageUptodate(page)) { - unlock_page(page); - put_page(page); - ret = -EIO; - break; - } + /* Allocate new defrag_target_range */ + new = kmalloc(sizeof(*new), GFP_NOFS); + if (!new) { + free_extent_map(em); + ret = -ENOMEM; + break; } + new->start = cur; + new->len = range_len; + list_add_tail(&new->list, target_list); - if (page->mapping != inode->i_mapping) { - unlock_page(page); - put_page(page); - goto again; + next: + cur = extent_map_end(em); + free_extent_map(em); + } + if (ret < 0) { + struct defrag_target_range *entry; + struct defrag_target_range *tmp; + + list_for_each_entry_safe(entry, tmp, target_list, list) { + list_del_init(&entry->list); + kfree(entry); } + } + return ret; + } + + #define CLUSTER_SIZE (SZ_256K) + + /* + * Defrag one contiguous target range. + * + * @inode: target inode + * @target: target range to defrag + * @pages: locked pages covering the defrag range + * @nr_pages: number of locked pages + * + * Caller should ensure: + * + * - Pages are prepared + * Pages should be locked, no ordered extent in the pages range, + * no writeback. 
+ * + * - Extent bits are locked + */ + static int defrag_one_locked_target(struct btrfs_inode *inode, + struct defrag_target_range *target, + struct page **pages, int nr_pages, + struct extent_state **cached_state) + { + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_changeset *data_reserved = NULL; + const u64 start = target->start; + const u64 len = target->len; + unsigned long last_index = (start + len - 1) >> PAGE_SHIFT; + unsigned long start_index = start >> PAGE_SHIFT; + unsigned long first_index = page_index(pages[0]); + int ret = 0; + int i; + + ASSERT(last_index - first_index + 1 <= nr_pages); + + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len); + if (ret < 0) + return ret; + clear_extent_bit(&inode->io_tree, start, start + len - 1, + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, 0, 0, cached_state); + set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state); - pages[i] = page; - i_done++; + /* Update the page status */ + for (i = start_index - first_index; i <= last_index - first_index; i++) { + ClearPageChecked(pages[i]); + btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len); } - if (!i_done || ret) - goto out; + btrfs_delalloc_release_extents(inode, len); + extent_changeset_free(data_reserved); - if (!(inode->i_sb->s_flags & SB_ACTIVE)) - goto out; + return ret; + } - /* - * so now we have a nice long stream of locked - * and up to date pages, lets wait on them - */ - for (i = 0; i < i_done; i++) - wait_on_page_writeback(pages[i]); + static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, + u32 extent_thresh, u64 newer_than, bool do_compress) + { + struct extent_state *cached_state = NULL; + struct defrag_target_range *entry; + struct defrag_target_range *tmp; + LIST_HEAD(target_list); + struct page **pages; + const u32 sectorsize = inode->root->fs_info->sectorsize; + u64 last_index = (start + len - 1) >> PAGE_SHIFT; + u64 start_index = start >> PAGE_SHIFT; + unsigned int nr_pages = last_index - start_index + 1; + int ret = 0; + int i; - page_start = page_offset(pages[0]); - page_end = page_offset(pages[i_done - 1]) + PAGE_SIZE; + ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE); + ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize)); - lock_extent_bits(&BTRFS_I(inode)->io_tree, - page_start, page_end - 1, &cached_state); + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); + if (!pages) + return -ENOMEM; + /* Prepare all pages */ + for (i = 0; i < nr_pages; i++) { + pages[i] = defrag_prepare_one_page(inode, start_index + i); + if (IS_ERR(pages[i])) { + ret = PTR_ERR(pages[i]); + pages[i] = NULL; + goto free_pages; + } + } + for (i = 0; i < nr_pages; i++) + wait_on_page_writeback(pages[i]); + + /* Lock the pages range */ + lock_extent_bits(&inode->io_tree, start_index << PAGE_SHIFT, + (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, + &cached_state); /* - * When defragmenting we skip ranges that have holes or inline extents, - * (check should_defrag_range()), to avoid unnecessary IO and wasting - * space. At btrfs_defrag_file(), we check if a range should be defragged - * before locking the inode and then, if it should, we trigger a sync - * page cache readahead - we lock the inode only after that to avoid - * blocking for too long other tasks that possibly want to operate on - * other file ranges. 
But before we were able to get the inode lock, - * some other task may have punched a hole in the range, or we may have - * now an inline extent, in which case we should not defrag. So check - * for that here, where we have the inode and the range locked, and bail - * out if that happened. + * Now we have a consistent view about the extent map, re-check + * which range really needs to be defragged. + * + * And this time we have extent locked already, pass @locked = true + * so that we won't relock the extent range and cause deadlock. */ - search_start = page_start; - while (search_start < page_end) { - struct extent_map *em; + ret = defrag_collect_targets(inode, start, len, extent_thresh, + newer_than, do_compress, true, + &target_list); + if (ret < 0) + goto unlock_extent; - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, search_start, - page_end - search_start); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto out_unlock_range; - } - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - free_extent_map(em); - /* Ok, 0 means we did not defrag anything */ - ret = 0; - goto out_unlock_range; + list_for_each_entry(entry, &target_list, list) { + ret = defrag_one_locked_target(inode, entry, pages, nr_pages, + &cached_state); + if (ret < 0) + break; + } + + list_for_each_entry_safe(entry, tmp, &target_list, list) { + list_del_init(&entry->list); + kfree(entry); + } + unlock_extent: + unlock_extent_cached(&inode->io_tree, start_index << PAGE_SHIFT, + (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, + &cached_state); + free_pages: + for (i = 0; i < nr_pages; i++) { + if (pages[i]) { + unlock_page(pages[i]); + put_page(pages[i]); } - search_start = extent_map_end(em); - free_extent_map(em); } + kfree(pages); + return ret; + } - clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, - page_end - 1, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, 0, 0, &cached_state); + static int defrag_one_cluster(struct btrfs_inode *inode, + struct file_ra_state *ra, + u64 start, u32 len, u32 extent_thresh, + u64 newer_than, bool do_compress, + unsigned long *sectors_defragged, + unsigned long max_sectors) + { + const u32 sectorsize = inode->root->fs_info->sectorsize; + struct defrag_target_range *entry; + struct defrag_target_range *tmp; + LIST_HEAD(target_list); + int ret; - if (i_done != page_cnt) { - spin_lock(&BTRFS_I(inode)->lock); - btrfs_mod_outstanding_extents(BTRFS_I(inode), 1); - spin_unlock(&BTRFS_I(inode)->lock); - btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, - start, (page_cnt - i_done) << PAGE_SHIFT, true); - } + BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); + ret = defrag_collect_targets(inode, start, len, extent_thresh, + newer_than, do_compress, false, + &target_list); + if (ret < 0) + goto out; + list_for_each_entry(entry, &target_list, list) { + u32 range_len = entry->len; - set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, - &cached_state); + /* Reached the limit */ + if (max_sectors && max_sectors == *sectors_defragged) + break; - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - page_start, page_end - 1, &cached_state); + if (max_sectors) + range_len = min_t(u32, range_len, + (max_sectors - *sectors_defragged) * sectorsize); - for (i = 0; i < i_done; i++) { - clear_page_dirty_for_io(pages[i]); - ClearPageChecked(pages[i]); - set_page_dirty(pages[i]); - unlock_page(pages[i]); - put_page(pages[i]); + if (ra) + page_cache_sync_readahead(inode->vfs_inode.i_mapping, + ra, NULL, entry->start >> PAGE_SHIFT, + ((entry->start + range_len - 1) >> PAGE_SHIFT) - + 
(entry->start >> PAGE_SHIFT) + 1); + /* + * Here we may not defrag any range if holes are punched before + * we locked the pages. + * But that's fine, it only affects the @sectors_defragged + * accounting. + */ + ret = defrag_one_range(inode, entry->start, range_len, + extent_thresh, newer_than, do_compress); + if (ret < 0) + break; + *sectors_defragged += range_len; } - btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT); - extent_changeset_free(data_reserved); - return i_done; - - out_unlock_range: - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - page_start, page_end - 1, &cached_state); out: - for (i = 0; i < i_done; i++) { - unlock_page(pages[i]); - put_page(pages[i]); + list_for_each_entry_safe(entry, tmp, &target_list, list) { + list_del_init(&entry->list); + kfree(entry); } - btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, - start, page_cnt << PAGE_SHIFT, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT); - extent_changeset_free(data_reserved); return ret; - } - int btrfs_defrag_file(struct inode *inode, struct file *file, + /* + * Entry point to file defragmentation. + * + * @inode: inode to be defragged + * @ra: readahead state (can be NUL) + * @range: defrag options including range and flags + * @newer_than: minimum transid to defrag + * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode + * will be defragged. + */ + int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, struct btrfs_ioctl_defrag_range_args *range, u64 newer_than, unsigned long max_to_defrag) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; - struct file_ra_state *ra = NULL; - unsigned long last_index; + unsigned long sectors_defragged = 0; u64 isize = i_size_read(inode); - u64 last_len = 0; - u64 skip = 0; - u64 defrag_end = 0; - u64 newer_off = range->start; - unsigned long i; - unsigned long ra_index = 0; - int ret; - int defrag_count = 0; + u64 cur; + u64 last_byte; + bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS; + bool ra_allocated = false; int compress_type = BTRFS_COMPRESS_ZLIB; + int ret = 0; u32 extent_thresh = range->extent_thresh; - unsigned long max_cluster = SZ_256K >> PAGE_SHIFT; - unsigned long cluster = max_cluster; - u64 new_align = ~((u64)SZ_128K - 1); - struct page **pages = NULL; - bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS; if (isize == 0) return 0; @@@ -1444,172 -1488,87 +1488,87 @@@ if (extent_thresh == 0) extent_thresh = SZ_256K; + if (range->start + range->len > range->start) { + /* Got a specific range */ + last_byte = min(isize, range->start + range->len) - 1; + } else { + /* Defrag until file end */ + last_byte = isize - 1; + } + /* - * If we were not given a file, allocate a readahead context. As + * If we were not given a ra, allocate a readahead context. As * readahead is just an optimization, defrag will work without it so * we don't error out. 
 */
-	if (!file) {
+	if (!ra) {
+		ra_allocated = true;
 		ra = kzalloc(sizeof(*ra), GFP_KERNEL);
 		if (ra)
 			file_ra_state_init(ra, inode->i_mapping);
-	} else {
-		ra = &file->f_ra;
-	}
-
-	pages = kmalloc_array(max_cluster, sizeof(struct page *), GFP_KERNEL);
-	if (!pages) {
-		ret = -ENOMEM;
-		goto out_ra;
-	}
-
-	/* find the last page to defrag */
-	if (range->start + range->len > range->start) {
-		last_index = min_t(u64, isize - 1,
-			 range->start + range->len - 1) >> PAGE_SHIFT;
-	} else {
-		last_index = (isize - 1) >> PAGE_SHIFT;
-	}
-
-	if (newer_than) {
-		ret = find_new_extents(root, inode, newer_than,
-				       &newer_off, SZ_64K);
-		if (!ret) {
-			range->start = newer_off;
-			/*
-			 * we always align our defrag to help keep
-			 * the extents in the file evenly spaced
-			 */
-			i = (newer_off & new_align) >> PAGE_SHIFT;
-		} else
-			goto out_ra;
-	} else {
-		i = range->start >> PAGE_SHIFT;
 	}
-	if (!max_to_defrag)
-		max_to_defrag = last_index - i + 1;
-	/*
-	 * make writeback starts from i, so the defrag range can be
-	 * written sequentially.
-	 */
-	if (i < inode->i_mapping->writeback_index)
-		inode->i_mapping->writeback_index = i;
-
-	while (i <= last_index && defrag_count < max_to_defrag &&
-	       (i < DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE))) {
-		/*
-		 * make sure we stop running if someone unmounts
-		 * the FS
-		 */
-		if (!(inode->i_sb->s_flags & SB_ACTIVE))
-			break;
-
-		if (btrfs_defrag_cancelled(fs_info)) {
-			btrfs_debug(fs_info, "defrag_file cancelled");
-			ret = -EAGAIN;
-			goto error;
-		}
+	/* Align the range */
+	cur = round_down(range->start, fs_info->sectorsize);
+	last_byte = round_up(last_byte, fs_info->sectorsize) - 1;

-		if (!should_defrag_range(inode, (u64)i << PAGE_SHIFT,
-					 extent_thresh, &last_len, &skip,
-					 &defrag_end, do_compress)){
-			unsigned long next;
-			/*
-			 * the should_defrag function tells us how much to skip
-			 * bump our counter by the suggested amount
-			 */
-			next = DIV_ROUND_UP(skip, PAGE_SIZE);
-			i = max(i + 1, next);
-			continue;
-		}
+	while (cur < last_byte) {
+		u64 cluster_end;

-		if (!newer_than) {
-			cluster = (PAGE_ALIGN(defrag_end) >>
-				   PAGE_SHIFT) - i;
-			cluster = min(cluster, max_cluster);
-		} else {
-			cluster = max_cluster;
-		}
+		/* The cluster size 256K should always be page aligned */
+		BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));

-		if (i + cluster > ra_index) {
-			ra_index = max(i, ra_index);
-			if (ra)
-				page_cache_sync_readahead(inode->i_mapping, ra,
-						file, ra_index, cluster);
-			ra_index += cluster;
-		}
+		/* We want the cluster end at page boundary when possible */
+		cluster_end = (((cur >> PAGE_SHIFT) +
+			       (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
+		cluster_end = min(cluster_end, last_byte);

 		btrfs_inode_lock(inode, 0);
 		if (IS_SWAPFILE(inode)) {
 			ret = -ETXTBSY;
-		} else {
-			if (do_compress)
-				BTRFS_I(inode)->defrag_compress = compress_type;
-			ret = cluster_pages_for_defrag(inode, pages, i, cluster);
+			btrfs_inode_unlock(inode, 0);
+			break;
 		}
-		if (ret < 0) {
+		if (!(inode->i_sb->s_flags & SB_ACTIVE)) {
 			btrfs_inode_unlock(inode, 0);
-			goto out_ra;
+			break;
 		}
-
-		defrag_count += ret;
-		balance_dirty_pages_ratelimited(inode->i_mapping);
+		if (do_compress)
+			BTRFS_I(inode)->defrag_compress = compress_type;
+		ret = defrag_one_cluster(BTRFS_I(inode), ra, cur,
+				cluster_end + 1 - cur, extent_thresh,
+				newer_than, do_compress,
+				&sectors_defragged, max_to_defrag);
 		btrfs_inode_unlock(inode, 0);
-
-		if (newer_than) {
-			if (newer_off == (u64)-1)
-				break;
-
-			if (ret > 0)
-				i += ret;
-
-			newer_off = max(newer_off + 1,
-					(u64)i << PAGE_SHIFT);
-
-			ret = find_new_extents(root, inode,
newer_than, - &newer_off, SZ_64K); - if (!ret) { - range->start = newer_off; - i = (newer_off & new_align) >> PAGE_SHIFT; - } else { - break; - } - } else { - if (ret > 0) { - i += ret; - last_len += ret << PAGE_SHIFT; - } else { - i++; - last_len = 0; - } - } + if (ret < 0) + break; + cur = cluster_end + 1; } - ret = defrag_count; - error: - if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) { - filemap_flush(inode->i_mapping); - if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) + if (ra_allocated) + kfree(ra); + if (sectors_defragged) { + /* + * We have defragged some sectors, for compression case they + * need to be written back immediately. + */ + if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) { filemap_flush(inode->i_mapping); + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + filemap_flush(inode->i_mapping); + } + if (range->compress_type == BTRFS_COMPRESS_LZO) + btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); + else if (range->compress_type == BTRFS_COMPRESS_ZSTD) + btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); + ret = sectors_defragged; } - - if (range->compress_type == BTRFS_COMPRESS_LZO) { - btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); - } else if (range->compress_type == BTRFS_COMPRESS_ZSTD) { - btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); - } - - out_ra: if (do_compress) { btrfs_inode_lock(inode, 0); BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE; btrfs_inode_unlock(inode, 0); } - if (!file) - kfree(ra); - kfree(pages); return ret; } @@@ -1658,6 -1617,7 +1617,7 @@@ static int exclop_start_or_cancel_reloc static noinline int btrfs_ioctl_resize(struct file *file, void __user *arg) { + BTRFS_DEV_LOOKUP_ARGS(args); struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 new_size; @@@ -1713,7 -1673,8 +1673,8 @@@ btrfs_info(fs_info, "resizing devid %llu", devid); } - device = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL); + args.devid = devid; + device = btrfs_find_device(fs_info->fs_devices, &args); if (!device) { btrfs_info(fs_info, "resizer unable to find device %llu", devid); @@@ -1730,7 -1691,7 +1691,7 @@@ } if (!strcmp(sizestr, "max")) - new_size = device->bdev->bd_inode->i_size; + new_size = bdev_nr_bytes(device->bdev); else { if (sizestr[0] == '-') { mod = -1; @@@ -1771,7 -1732,7 +1732,7 @@@ ret = -EINVAL; goto out_finish; } - if (new_size > device->bdev->bd_inode->i_size) { + if (new_size > bdev_nr_bytes(device->bdev)) { ret = -EFBIG; goto out_finish; } @@@ -3136,12 -3097,6 +3097,6 @@@ static int btrfs_ioctl_defrag(struct fi goto out; } - /* Subpage defrag will be supported in later commits */ - if (root->fs_info->sectorsize < PAGE_SIZE) { - ret = -ENOTTY; - goto out; - } - switch (inode->i_mode & S_IFMT) { case S_IFDIR: if (!capable(CAP_SYS_ADMIN)) { @@@ -3176,7 -3131,7 +3131,7 @@@ /* the rest are all set to zero by kzalloc */ range.len = (u64)-1; } - ret = btrfs_defrag_file(file_inode(file), file, + ret = btrfs_defrag_file(file_inode(file), &file->f_ra, &range, BTRFS_OLDEST_GENERATION, 0); if (ret > 0) ret = 0; @@@ -3220,6 -3175,7 +3175,7 @@@ out static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) { + BTRFS_DEV_LOOKUP_ARGS(args); struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ioctl_vol_args_v2 *vol_args; @@@ -3231,35 -3187,39 +3187,39 @@@ if (!capable(CAP_SYS_ADMIN)) return -EPERM; - ret = mnt_want_write_file(file); - if (ret) - return ret; - vol_args = memdup_user(arg, 
sizeof(*vol_args)); if (IS_ERR(vol_args)) { ret = PTR_ERR(vol_args); - goto err_drop; + goto out; } if (vol_args->flags & ~BTRFS_DEVICE_REMOVE_ARGS_MASK) { ret = -EOPNOTSUPP; goto out; } + vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; - if (!(vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) && - strcmp("cancel", vol_args->name) == 0) + if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) { + args.devid = vol_args->devid; + } else if (!strcmp("cancel", vol_args->name)) { cancel = true; + } else { + ret = btrfs_get_dev_args_from_path(fs_info, &args, vol_args->name); + if (ret) + goto out; + } + + ret = mnt_want_write_file(file); + if (ret) + goto out; ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE, cancel); if (ret) - goto out; - /* Exclusive operation is now claimed */ + goto err_drop; - if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) - ret = btrfs_rm_device(fs_info, NULL, vol_args->devid, &bdev, &mode); - else - ret = btrfs_rm_device(fs_info, vol_args->name, 0, &bdev, &mode); + /* Exclusive operation is now claimed */ + ret = btrfs_rm_device(fs_info, &args, &bdev, &mode); btrfs_exclop_finish(fs_info); @@@ -3271,17 -3231,19 +3231,19 @@@ btrfs_info(fs_info, "device deleted: %s", vol_args->name); } - out: - kfree(vol_args); err_drop: mnt_drop_write_file(file); if (bdev) blkdev_put(bdev, mode); + out: + btrfs_put_dev_args_from_path(&args); + kfree(vol_args); return ret; } static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) { + BTRFS_DEV_LOOKUP_ARGS(args); struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ioctl_vol_args *vol_args; @@@ -3293,32 -3255,38 +3255,38 @@@ if (!capable(CAP_SYS_ADMIN)) return -EPERM; - ret = mnt_want_write_file(file); - if (ret) - return ret; - vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) { - ret = PTR_ERR(vol_args); - goto out_drop_write; - } + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - cancel = (strcmp("cancel", vol_args->name) == 0); + if (!strcmp("cancel", vol_args->name)) { + cancel = true; + } else { + ret = btrfs_get_dev_args_from_path(fs_info, &args, vol_args->name); + if (ret) + goto out; + } + + ret = mnt_want_write_file(file); + if (ret) + goto out; ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE, cancel); if (ret == 0) { - ret = btrfs_rm_device(fs_info, vol_args->name, 0, &bdev, &mode); + ret = btrfs_rm_device(fs_info, &args, &bdev, &mode); if (!ret) btrfs_info(fs_info, "disk deleted %s", vol_args->name); btrfs_exclop_finish(fs_info); } - kfree(vol_args); - out_drop_write: mnt_drop_write_file(file); if (bdev) blkdev_put(bdev, mode); + out: + btrfs_put_dev_args_from_path(&args); + kfree(vol_args); return ret; } @@@ -3379,22 -3347,21 +3347,21 @@@ static long btrfs_ioctl_fs_info(struct static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info, void __user *arg) { + BTRFS_DEV_LOOKUP_ARGS(args); struct btrfs_ioctl_dev_info_args *di_args; struct btrfs_device *dev; int ret = 0; - char *s_uuid = NULL; di_args = memdup_user(arg, sizeof(*di_args)); if (IS_ERR(di_args)) return PTR_ERR(di_args); + args.devid = di_args->devid; if (!btrfs_is_empty_uuid(di_args->uuid)) - s_uuid = di_args->uuid; + args.uuid = di_args->uuid; rcu_read_lock(); - dev = btrfs_find_device(fs_info->fs_devices, di_args->devid, s_uuid, - NULL); - + dev = btrfs_find_device(fs_info->fs_devices, &args); if (!dev) { ret = -ENODEV; goto out; @@@ -4430,7 -4397,6 +4397,6 @@@ static long btrfs_ioctl_quota_rescan_st void 
__user *arg) { struct btrfs_ioctl_quota_rescan_args qsa = {0}; - int ret = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@@ -4441,9 -4407,9 +4407,9 @@@ } if (copy_to_user(arg, &qsa, sizeof(qsa))) - ret = -EFAULT; + return -EFAULT; - return ret; + return 0; } static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info, diff --combined fs/btrfs/lzo.c index 295bbc1,00cffc1..65cb076 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@@ -32,19 -32,19 +32,19 @@@ * payload. * One regular LZO compressed extent can have one or more segments. * For inlined LZO compressed extent, only one segment is allowed. - * One segment represents at most one page of uncompressed data. + * One segment represents at most one sector of uncompressed data. * * 2.1 Segment header * Fixed size. LZO_LEN (4) bytes long, LE32. * Records the total size of the segment (not including the header). - * Segment header never crosses page boundary, thus it's possible to - * have at most 3 padding zeros at the end of the page. + * Segment header never crosses sector boundary, thus it's possible to + * have at most 3 padding zeros at the end of the sector. * * 2.2 Data Payload - * Variable size. Size up limit should be lzo1x_worst_compress(PAGE_SIZE) - * which is 4419 for a 4KiB page. + * Variable size. Size up limit should be lzo1x_worst_compress(sectorsize) + * which is 4419 for a 4KiB sectorsize. * - * Example: + * Example with 4K sectorsize: * Page 1: * 0 0x2 0x4 0x6 0x8 0xa 0xc 0xe 0x10 * 0x0000 | Header | SegHdr 01 | Data payload 01 ... | @@@ -112,170 -112,161 +112,174 @@@ static inline size_t read_compress_leng return le32_to_cpu(dlen); } + /* + * Will do: + * + * - Write a segment header into the destination + * - Copy the compressed buffer into the destination + * - Make sure we have enough space in the last sector to fit a segment header + * If not, we will pad at most (LZO_LEN (4)) - 1 bytes of zeros. + * + * Will allocate new pages when needed. + */ + static int copy_compressed_data_to_page(char *compressed_data, + size_t compressed_size, + struct page **out_pages, + u32 *cur_out, + const u32 sectorsize) + { + u32 sector_bytes_left; + u32 orig_out; + struct page *cur_page; ++ char *kaddr; + + /* + * We never allow a segment header crossing sector boundary, previous + * run should ensure we have enough space left inside the sector. 
+ */ + ASSERT((*cur_out / sectorsize) == (*cur_out + LZO_LEN - 1) / sectorsize); + + cur_page = out_pages[*cur_out / PAGE_SIZE]; + /* Allocate a new page */ + if (!cur_page) { + cur_page = alloc_page(GFP_NOFS); + if (!cur_page) + return -ENOMEM; + out_pages[*cur_out / PAGE_SIZE] = cur_page; + } + - write_compress_length(page_address(cur_page) + offset_in_page(*cur_out), ++ kaddr = kmap(cur_page); ++ write_compress_length(kaddr + offset_in_page(*cur_out), + compressed_size); + *cur_out += LZO_LEN; + + orig_out = *cur_out; + + /* Copy compressed data */ + while (*cur_out - orig_out < compressed_size) { + u32 copy_len = min_t(u32, sectorsize - *cur_out % sectorsize, + orig_out + compressed_size - *cur_out); + ++ kunmap(cur_page); + cur_page = out_pages[*cur_out / PAGE_SIZE]; + /* Allocate a new page */ + if (!cur_page) { + cur_page = alloc_page(GFP_NOFS); + if (!cur_page) + return -ENOMEM; + out_pages[*cur_out / PAGE_SIZE] = cur_page; + } ++ kaddr = kmap(cur_page); + - memcpy(page_address(cur_page) + offset_in_page(*cur_out), ++ memcpy(kaddr + offset_in_page(*cur_out), + compressed_data + *cur_out - orig_out, copy_len); + + *cur_out += copy_len; + } + + /* + * Check if we can fit the next segment header into the remaining space + * of the sector. + */ + sector_bytes_left = round_up(*cur_out, sectorsize) - *cur_out; + if (sector_bytes_left >= LZO_LEN || sector_bytes_left == 0) - return 0; ++ goto out; + + /* The remaining size is not enough, pad it with zeros */ - memset(page_address(cur_page) + offset_in_page(*cur_out), 0, ++ memset(kaddr + offset_in_page(*cur_out), 0, + sector_bytes_left); + *cur_out += sector_bytes_left; ++ ++out: ++ kunmap(cur_page); + return 0; + } + int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, u64 start, struct page **pages, unsigned long *out_pages, unsigned long *total_in, unsigned long *total_out) { struct workspace *workspace = list_entry(ws, struct workspace, list); + const u32 sectorsize = btrfs_sb(mapping->host->i_sb)->sectorsize; + struct page *page_in = NULL; ++ char *sizes_ptr; int ret = 0; - char *data_in; - char *cpage_out, *sizes_ptr; - int nr_pages = 0; - struct page *in_page = NULL; - struct page *out_page = NULL; - unsigned long bytes_left; - unsigned long len = *total_out; - unsigned long nr_dest_pages = *out_pages; - const unsigned long max_out = nr_dest_pages * PAGE_SIZE; - size_t in_len; - size_t out_len; - char *buf; - unsigned long tot_in = 0; - unsigned long tot_out = 0; - unsigned long pg_bytes_left; - unsigned long out_offset; - unsigned long bytes; + /* Points to the file offset of input data */ + u64 cur_in = start; + /* Points to the current output byte */ + u32 cur_out = 0; + u32 len = *total_out; *out_pages = 0; *total_out = 0; *total_in = 0; - in_page = find_get_page(mapping, start >> PAGE_SHIFT); - data_in = kmap(in_page); - /* - * store the size of all chunks of compressed data in - * the first 4 bytes + * Skip the header for now, we will later come back and write the total + * compressed size */ - out_page = alloc_page(GFP_NOFS); - if (out_page == NULL) { - ret = -ENOMEM; - goto out; - } - cpage_out = kmap(out_page); - out_offset = LZO_LEN; - tot_out = LZO_LEN; - pages[0] = out_page; - nr_pages = 1; - pg_bytes_left = PAGE_SIZE - LZO_LEN; - - /* compress at most one page of data each time */ - in_len = min(len, PAGE_SIZE); - while (tot_in < len) { - ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf, - &out_len, workspace->mem); - if (ret != LZO_E_OK) { - pr_debug("BTRFS: lzo in loop returned %d\n", - 
ret); + cur_out += LZO_LEN; + while (cur_in < start + len) { ++ char *data_in; + const u32 sectorsize_mask = sectorsize - 1; + u32 sector_off = (cur_in - start) & sectorsize_mask; + u32 in_len; + size_t out_len; + + /* Get the input page first */ + if (!page_in) { + page_in = find_get_page(mapping, cur_in >> PAGE_SHIFT); + ASSERT(page_in); + } + + /* Compress at most one sector of data each time */ + in_len = min_t(u32, start + len - cur_in, sectorsize - sector_off); + ASSERT(in_len); - ret = lzo1x_1_compress(page_address(page_in) + ++ data_in = kmap(page_in); ++ ret = lzo1x_1_compress(data_in + + offset_in_page(cur_in), in_len, + workspace->cbuf, &out_len, + workspace->mem); ++ kunmap(page_in); + if (ret < 0) { + pr_debug("BTRFS: lzo in loop returned %d\n", ret); ret = -EIO; goto out; } - /* store the size of this chunk of compressed data */ - write_compress_length(cpage_out + out_offset, out_len); - tot_out += LZO_LEN; - out_offset += LZO_LEN; - pg_bytes_left -= LZO_LEN; - - tot_in += in_len; - tot_out += out_len; - - /* copy bytes from the working buffer into the pages */ - buf = workspace->cbuf; - while (out_len) { - bytes = min_t(unsigned long, pg_bytes_left, out_len); - - memcpy(cpage_out + out_offset, buf, bytes); - - out_len -= bytes; - pg_bytes_left -= bytes; - buf += bytes; - out_offset += bytes; - - /* - * we need another page for writing out. - * - * Note if there's less than 4 bytes left, we just - * skip to a new page. - */ - if ((out_len == 0 && pg_bytes_left < LZO_LEN) || - pg_bytes_left == 0) { - if (pg_bytes_left) { - memset(cpage_out + out_offset, 0, - pg_bytes_left); - tot_out += pg_bytes_left; - } - - /* we're done, don't allocate new page */ - if (out_len == 0 && tot_in >= len) - break; - - kunmap(out_page); - if (nr_pages == nr_dest_pages) { - out_page = NULL; - ret = -E2BIG; - goto out; - } - - out_page = alloc_page(GFP_NOFS); - if (out_page == NULL) { - ret = -ENOMEM; - goto out; - } - cpage_out = kmap(out_page); - pages[nr_pages++] = out_page; - - pg_bytes_left = PAGE_SIZE; - out_offset = 0; - } - } + ret = copy_compressed_data_to_page(workspace->cbuf, out_len, + pages, &cur_out, sectorsize); + if (ret < 0) + goto out; - /* we're making it bigger, give up */ - if (tot_in > 8192 && tot_in < tot_out) { + cur_in += in_len; + + /* + * Check if we're making it bigger after two sectors. And if + * it is so, give up. 
+ */ + if (cur_in - start > sectorsize * 2 && cur_in - start < cur_out) { ret = -E2BIG; goto out; } - /* we're all done */ - if (tot_in >= len) - break; - - if (tot_out > max_out) - break; - - bytes_left = len - tot_in; - kunmap(in_page); - put_page(in_page); - - start += PAGE_SIZE; - in_page = find_get_page(mapping, start >> PAGE_SHIFT); - data_in = kmap(in_page); - in_len = min(bytes_left, PAGE_SIZE); - } - - if (tot_out >= tot_in) { - ret = -E2BIG; - goto out; + /* Check if we have reached page boundary */ + if (IS_ALIGNED(cur_in, PAGE_SIZE)) { + put_page(page_in); + page_in = NULL; + } } - /* store the size of all chunks of compressed data */ + /* Store the size of all chunks of compressed data */ - write_compress_length(page_address(pages[0]), cur_out); + sizes_ptr = kmap_local_page(pages[0]); - write_compress_length(sizes_ptr, tot_out); ++ write_compress_length(sizes_ptr, cur_out); + kunmap_local(sizes_ptr); ret = 0; - *total_out = tot_out; - *total_in = tot_in; + *total_out = cur_out; + *total_in = cur_in - start; out: - *out_pages = nr_pages; - if (out_page) - kunmap(out_page); - - if (in_page) { - kunmap(in_page); - put_page(in_page); - } - + *out_pages = DIV_ROUND_UP(cur_out, PAGE_SIZE); return ret; } @@@ -290,7 -281,6 +294,7 @@@ static void copy_compressed_segment(str u32 orig_in = *cur_in; while (*cur_in < orig_in + len) { + char *kaddr; struct page *cur_page; u32 copy_len = min_t(u32, PAGE_SIZE - offset_in_page(*cur_in), orig_in + len - *cur_in); @@@ -298,11 -288,9 +302,11 @@@ ASSERT(copy_len); cur_page = cb->compressed_pages[*cur_in / PAGE_SIZE]; + kaddr = kmap(cur_page); memcpy(dest + *cur_in - orig_in, - page_address(cur_page) + offset_in_page(*cur_in), + kaddr + offset_in_page(*cur_in), copy_len); + kunmap(cur_page); *cur_in += copy_len; } @@@ -313,7 -301,6 +317,7 @@@ int lzo_decompress_bio(struct list_hea struct workspace *workspace = list_entry(ws, struct workspace, list); const struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); const u32 sectorsize = fs_info->sectorsize; + char *kaddr; int ret; /* Compressed data length, can be unaligned */ u32 len_in; @@@ -322,9 -309,7 +326,9 @@@ /* Bytes decompressed so far */ u32 cur_out = 0; - len_in = read_compress_length(page_address(cb->compressed_pages[0])); + kaddr = kmap(cb->compressed_pages[0]); + len_in = read_compress_length(kaddr); + kunmap(cb->compressed_pages[0]); cur_in += LZO_LEN; /* @@@ -358,9 -343,8 +362,9 @@@ (cur_in + LZO_LEN - 1) / sectorsize); cur_page = cb->compressed_pages[cur_in / PAGE_SIZE]; ASSERT(cur_page); - seg_len = read_compress_length(page_address(cur_page) + - offset_in_page(cur_in)); + kaddr = kmap(cur_page); + seg_len = read_compress_length(kaddr + offset_in_page(cur_in)); + kunmap(cur_page); cur_in += LZO_LEN; /* Copy the compressed segment payload into workspace */ @@@ -445,7 -429,7 +449,7 @@@ int lzo_decompress(struct list_head *ws destlen = min_t(unsigned long, destlen, PAGE_SIZE); bytes = min_t(unsigned long, destlen, out_len - start_byte); - kaddr = page_address(dest_page); + kaddr = kmap_local_page(dest_page); memcpy(kaddr, workspace->buf + start_byte, bytes); /* @@@ -455,7 -439,6 +459,7 @@@ */ if (bytes < destlen) memset(kaddr+bytes, 0, destlen-bytes); + kunmap_local(kaddr); out: return ret; } diff --combined fs/btrfs/volumes.c index 9533f35,546bf11..61ac57b --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@@ -14,6 -14,7 +14,7 @@@ #include #include #include + #include #include "misc.h" #include "ctree.h" #include "extent_map.h" @@@ -250,7 -251,7 +251,7 @@@ static void 
btrfs_dev_stat_print_on_loa static int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, - struct btrfs_bio **bbio_ret, + struct btrfs_io_context **bioc_ret, int mirror_num, int need_raid_map); /* @@@ -508,7 -509,7 +509,7 @@@ btrfs_get_bdev_and_sb(const char *devic } if (flush) - filemap_write_and_wait((*bdev)->bd_inode->i_mapping); + sync_blockdev(*bdev); ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE); if (ret) { blkdev_put(*bdev, flags); @@@ -812,9 -813,13 +813,13 @@@ static noinline struct btrfs_device *de device = NULL; } else { + struct btrfs_dev_lookup_args args = { + .devid = devid, + .uuid = disk_super->dev_item.uuid, + }; + mutex_lock(&fs_devices->device_list_mutex); - device = btrfs_find_device(fs_devices, devid, - disk_super->dev_item.uuid, NULL); + device = btrfs_find_device(fs_devices, &args); /* * If this disk has been pulled into an fs devices created by @@@ -1091,7 -1096,7 +1096,7 @@@ void btrfs_free_extra_devids(struct btr list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list) __btrfs_free_extra_devids(seed_dev, &latest_dev); - fs_devices->latest_bdev = latest_dev->bdev; + fs_devices->latest_dev = latest_dev; mutex_unlock(&uuid_mutex); } @@@ -1122,8 -1127,10 +1127,10 @@@ static void btrfs_close_one_device(stru if (device->devid == BTRFS_DEV_REPLACE_DEVID) clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); - if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) + if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { + clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); fs_devices->missing_devices--; + } btrfs_close_bdev(device); if (device->bdev) { @@@ -1222,7 -1229,7 +1229,7 @@@ static int open_fs_devices(struct btrfs return -EINVAL; fs_devices->opened = 1; - fs_devices->latest_bdev = latest_dev->bdev; + fs_devices->latest_dev = latest_dev; fs_devices->total_rw_bytes = 0; fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR; fs_devices->read_policy = BTRFS_READ_POLICY_PID; @@@ -1286,7 -1293,7 +1293,7 @@@ static struct btrfs_super_block *btrfs_ pgoff_t index; /* make sure our super fits in the device */ - if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode)) + if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev)) return ERR_PTR(-EINVAL); /* make sure our super fits in the page */ @@@ -1843,8 -1850,10 +1850,10 @@@ static int btrfs_add_dev_item(struct bt key.type = BTRFS_DEV_ITEM_KEY; key.offset = device->devid; + btrfs_reserve_chunk_metadata(trans, true); ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path, &key, sizeof(*dev_item)); + btrfs_trans_release_chunk_metadata(trans); if (ret) goto out; @@@ -1882,18 -1891,22 +1891,22 @@@ out /* * Function to update ctime/mtime for a given device path. * Mainly used for ctime/mtime based probe like libblkid. + * + * We don't care about errors here, this is just to be kind to userspace. */ - static void update_dev_time(struct block_device *bdev) + static void update_dev_time(const char *device_path) { - struct inode *inode = bdev->bd_inode; + struct path path; struct timespec64 now; + int ret; - /* Shouldn't happen but just in case. 
*/ - if (!inode) + ret = kern_path(device_path, LOOKUP_FOLLOW, &path); + if (ret) return; - now = current_time(inode); - generic_update_time(inode, &now, S_MTIME | S_CTIME); + now = current_time(d_inode(path.dentry)); + inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME); + path_put(&path); } static int btrfs_rm_dev_item(struct btrfs_device *device) @@@ -1917,7 -1930,9 +1930,9 @@@ key.type = BTRFS_DEV_ITEM_KEY; key.offset = device->devid; + btrfs_reserve_chunk_metadata(trans, false); ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + btrfs_trans_release_chunk_metadata(trans); if (ret) { if (ret > 0) ret = -ENOENT; @@@ -1986,7 -2001,7 +2001,7 @@@ static struct btrfs_device * btrfs_find } /* - * Helper function to check if the given device is part of s_bdev / latest_bdev + * Helper function to check if the given device is part of s_bdev / latest_dev * and replace it with the provided or the next active device, in the context * where this function called, there should be always be another device (or * this_dev) which is active. @@@ -2005,8 -2020,8 +2020,8 @@@ void __cold btrfs_assign_next_active_de (fs_info->sb->s_bdev == device->bdev)) fs_info->sb->s_bdev = next_device->bdev; - if (fs_info->fs_devices->latest_bdev == device->bdev) - fs_info->fs_devices->latest_bdev = next_device->bdev; + if (fs_info->fs_devices->latest_dev->bdev == device->bdev) + fs_info->fs_devices->latest_dev = next_device; } /* @@@ -2069,11 -2084,12 +2084,12 @@@ void btrfs_scratch_superblocks(struct b btrfs_kobject_uevent(bdev, KOBJ_CHANGE); /* Update ctime/mtime for device path for libblkid */ - update_dev_time(bdev); + update_dev_time(device_path); } - int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, - u64 devid, struct block_device **bdev, fmode_t *mode) + int btrfs_rm_device(struct btrfs_fs_info *fs_info, + struct btrfs_dev_lookup_args *args, + struct block_device **bdev, fmode_t *mode) { struct btrfs_device *device; struct btrfs_fs_devices *cur_devices; @@@ -2081,22 -2097,23 +2097,23 @@@ u64 num_devices; int ret = 0; - mutex_lock(&uuid_mutex); - + /* + * The device list in fs_devices is accessed without locks (neither + * uuid_mutex nor device_list_mutex) as it won't change on a mounted + * filesystem and another device rm cannot run. + */ num_devices = btrfs_num_devices(fs_info); ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); if (ret) goto out; - device = btrfs_find_device_by_devspec(fs_info, devid, device_path); - - if (IS_ERR(device)) { - if (PTR_ERR(device) == -ENOENT && - device_path && strcmp(device_path, "missing") == 0) + device = btrfs_find_device(fs_info->fs_devices, args); + if (!device) { + if (args->missing) ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; else - ret = PTR_ERR(device); + ret = -ENOENT; goto out; } @@@ -2126,11 -2143,9 +2143,9 @@@ mutex_unlock(&fs_info->chunk_mutex); } - mutex_unlock(&uuid_mutex); ret = btrfs_shrink_device(device, 0); if (!ret) btrfs_reada_remove_dev(device); - mutex_lock(&uuid_mutex); if (ret) goto error_undo; @@@ -2159,7 -2174,7 +2174,7 @@@ /* * In normal cases the cur_devices == fs_devices. But in case * of deleting a seed device, the cur_devices should point to - * its own fs_devices listed under the fs_devices->seed. + * its own fs_devices listed under the fs_devices->seed_list. 
*/ cur_devices = device->fs_devices; mutex_lock(&fs_devices->device_list_mutex); @@@ -2210,14 -2225,21 +2225,21 @@@ synchronize_rcu(); btrfs_free_device(device); - if (cur_devices->open_devices == 0) { + /* + * This can happen if cur_devices is the private seed devices list. We + * cannot call close_fs_devices() here because it expects the uuid_mutex + * to be held, but in fact we don't need that for the private + * seed_devices, we can simply decrement cur_devices->opened and then + * remove it from our list and free the fs_devices. + */ + if (cur_devices->num_devices == 0) { list_del_init(&cur_devices->seed_list); - close_fs_devices(cur_devices); + ASSERT(cur_devices->opened == 1); + cur_devices->opened--; free_fs_devices(cur_devices); } out: - mutex_unlock(&uuid_mutex); return ret; error_undo: @@@ -2305,13 -2327,6 +2327,6 @@@ void btrfs_destroy_dev_replace_tgtdev(s mutex_unlock(&fs_devices->device_list_mutex); - /* - * The update_dev_time() with in btrfs_scratch_superblocks() - * may lead to a call to btrfs_show_devname() which will try - * to hold device_list_mutex. And here this device - * is already out of device list, so we don't have to hold - * the device_list_mutex lock. - */ btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev, tgtdev->name->str); @@@ -2320,69 -2335,98 +2335,98 @@@ btrfs_free_device(tgtdev); } - static struct btrfs_device *btrfs_find_device_by_path( - struct btrfs_fs_info *fs_info, const char *device_path) + /** + * Populate args from device at path + * + * @fs_info: the filesystem + * @args: the args to populate + * @path: the path to the device + * + * This will read the super block of the device at @path and populate @args with + * the devid, fsid, and uuid. This is meant to be used for ioctls that need to + * lookup a device to operate on, but need to do it before we take any locks. + * This properly handles the special case of "missing" that a user may pass in, + * and does some basic sanity checks. The caller must make sure that @path is + * properly NUL terminated before calling in, and must call + * btrfs_put_dev_args_from_path() in order to free up the temporary fsid and + * uuid buffers. 
+ * + * Return: 0 for success, -errno for failure + */ + int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, + struct btrfs_dev_lookup_args *args, + const char *path) { - int ret = 0; struct btrfs_super_block *disk_super; - u64 devid; - u8 *dev_uuid; struct block_device *bdev; - struct btrfs_device *device; + int ret; - ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ, - fs_info->bdev_holder, 0, &bdev, &disk_super); - if (ret) - return ERR_PTR(ret); + if (!path || !path[0]) + return -EINVAL; + if (!strcmp(path, "missing")) { + args->missing = true; + return 0; + } - devid = btrfs_stack_device_id(&disk_super->dev_item); - dev_uuid = disk_super->dev_item.uuid; + args->uuid = kzalloc(BTRFS_UUID_SIZE, GFP_KERNEL); + args->fsid = kzalloc(BTRFS_FSID_SIZE, GFP_KERNEL); + if (!args->uuid || !args->fsid) { + btrfs_put_dev_args_from_path(args); + return -ENOMEM; + } + + ret = btrfs_get_bdev_and_sb(path, FMODE_READ, fs_info->bdev_holder, 0, + &bdev, &disk_super); + if (ret) + return ret; + args->devid = btrfs_stack_device_id(&disk_super->dev_item); + memcpy(args->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE); if (btrfs_fs_incompat(fs_info, METADATA_UUID)) - device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, - disk_super->metadata_uuid); + memcpy(args->fsid, disk_super->metadata_uuid, BTRFS_FSID_SIZE); else - device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, - disk_super->fsid); - + memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE); btrfs_release_disk_super(disk_super); - if (!device) - device = ERR_PTR(-ENOENT); blkdev_put(bdev, FMODE_READ); - return device; + return 0; } /* - * Lookup a device given by device id, or the path if the id is 0. + * Only use this jointly with btrfs_get_dev_args_from_path() because we will + * allocate our ->uuid and ->fsid pointers, everybody else uses local variables + * that don't need to be freed. 
*/ + void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args) + { + kfree(args->uuid); + kfree(args->fsid); + args->uuid = NULL; + args->fsid = NULL; + } + struct btrfs_device *btrfs_find_device_by_devspec( struct btrfs_fs_info *fs_info, u64 devid, const char *device_path) { + BTRFS_DEV_LOOKUP_ARGS(args); struct btrfs_device *device; + int ret; if (devid) { - device = btrfs_find_device(fs_info->fs_devices, devid, NULL, - NULL); + args.devid = devid; + device = btrfs_find_device(fs_info->fs_devices, &args); if (!device) return ERR_PTR(-ENOENT); return device; } - if (!device_path || !device_path[0]) - return ERR_PTR(-EINVAL); - - if (strcmp(device_path, "missing") == 0) { - /* Find first missing device */ - list_for_each_entry(device, &fs_info->fs_devices->devices, - dev_list) { - if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, - &device->dev_state) && !device->bdev) - return device; - } + ret = btrfs_get_dev_args_from_path(fs_info, &args, device_path); + if (ret) + return ERR_PTR(ret); + device = btrfs_find_device(fs_info->fs_devices, &args); + btrfs_put_dev_args_from_path(&args); + if (!device) return ERR_PTR(-ENOENT); - } - - return btrfs_find_device_by_path(fs_info, device_path); + return device; } /* @@@ -2459,6 -2503,7 +2503,7 @@@ static int btrfs_prepare_sprout(struct */ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans) { + BTRFS_DEV_LOOKUP_ARGS(args); struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root = fs_info->chunk_root; struct btrfs_path *path; @@@ -2468,7 -2513,6 +2513,6 @@@ struct btrfs_key key; u8 fs_uuid[BTRFS_FSID_SIZE]; u8 dev_uuid[BTRFS_UUID_SIZE]; - u64 devid; int ret; path = btrfs_alloc_path(); @@@ -2480,7 -2524,9 +2524,9 @@@ key.type = BTRFS_DEV_ITEM_KEY; while (1) { + btrfs_reserve_chunk_metadata(trans, false); ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + btrfs_trans_release_chunk_metadata(trans); if (ret < 0) goto error; @@@ -2505,13 -2551,14 +2551,14 @@@ next_slot dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); - devid = btrfs_device_id(leaf, dev_item); + args.devid = btrfs_device_id(leaf, dev_item); read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), BTRFS_UUID_SIZE); read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), BTRFS_FSID_SIZE); - device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, - fs_uuid); + args.uuid = dev_uuid; + args.fsid = fs_uuid; + device = btrfs_find_device(fs_info->fs_devices, &args); BUG_ON(!device); /* Logic error */ if (device->fs_devices->seeding) { @@@ -2610,8 -2657,8 +2657,8 @@@ int btrfs_init_new_device(struct btrfs_ device->io_width = fs_info->sectorsize; device->io_align = fs_info->sectorsize; device->sector_size = fs_info->sectorsize; - device->total_bytes = round_down(i_size_read(bdev->bd_inode), - fs_info->sectorsize); + device->total_bytes = + round_down(bdev_nr_bytes(bdev), fs_info->sectorsize); device->disk_total_bytes = device->total_bytes; device->commit_total_bytes = device->total_bytes; set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); @@@ -2627,6 -2674,8 +2674,8 @@@ btrfs_abort_transaction(trans, ret); goto error_trans; } + btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev, + device); } device->fs_devices = fs_devices; @@@ -2733,7 -2782,7 +2782,7 @@@ btrfs_forget_devices(device_path); /* Update ctime/mtime for blkid or udev */ - update_dev_time(bdev); + update_dev_time(device_path); return ret; @@@ -2826,6 -2875,7 +2875,7 @@@ int btrfs_grow_device(struct btrfs_tran struct 
btrfs_super_block *super_copy = fs_info->super_copy; u64 old_total; u64 diff; + int ret; if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) return -EACCES; @@@ -2854,7 -2904,11 +2904,11 @@@ &trans->transaction->dev_update_list); mutex_unlock(&fs_info->chunk_mutex); - return btrfs_update_device(trans, device); + btrfs_reserve_chunk_metadata(trans, false); + ret = btrfs_update_device(trans, device); + btrfs_trans_release_chunk_metadata(trans); + + return ret; } static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) @@@ -3096,7 -3150,7 +3150,7 @@@ int btrfs_remove_chunk(struct btrfs_tra const u64 sys_flags = btrfs_system_alloc_profile(fs_info); struct btrfs_block_group *sys_bg; - sys_bg = btrfs_alloc_chunk(trans, sys_flags); + sys_bg = btrfs_create_chunk(trans, sys_flags); if (IS_ERR(sys_bg)) { ret = PTR_ERR(sys_bg); btrfs_abort_transaction(trans, ret); @@@ -4889,8 -4943,10 +4943,10 @@@ again round_down(old_total - diff, fs_info->sectorsize)); mutex_unlock(&fs_info->chunk_mutex); + btrfs_reserve_chunk_metadata(trans, false); /* Now btrfs_update_device() will change the on-disk size. */ ret = btrfs_update_device(trans, device); + btrfs_trans_release_chunk_metadata(trans); if (ret < 0) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); @@@ -4973,7 -5029,7 +5029,7 @@@ static void check_raid1c34_incompat_fla } /* - * Structure used internally for __btrfs_alloc_chunk() function. + * Structure used internally for btrfs_create_chunk() function. * Wraps needed parameters. */ struct alloc_chunk_ctl { @@@ -5377,7 -5433,7 +5433,7 @@@ error_del_extent return block_group; } - struct btrfs_block_group *btrfs_alloc_chunk(struct btrfs_trans_handle *trans, + struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, u64 type) { struct btrfs_fs_info *info = trans->fs_info; @@@ -5578,12 -5634,12 +5634,12 @@@ static noinline int init_first_rw_devic */ alloc_profile = btrfs_metadata_alloc_profile(fs_info); - meta_bg = btrfs_alloc_chunk(trans, alloc_profile); + meta_bg = btrfs_create_chunk(trans, alloc_profile); if (IS_ERR(meta_bg)) return PTR_ERR(meta_bg); alloc_profile = btrfs_system_alloc_profile(fs_info); - sys_bg = btrfs_alloc_chunk(trans, alloc_profile); + sys_bg = btrfs_create_chunk(trans, alloc_profile); if (IS_ERR(sys_bg)) return PTR_ERR(sys_bg); @@@ -5597,17 -5653,17 +5653,17 @@@ static inline int btrfs_chunk_max_error return btrfs_raid_array[index].tolerated_failures; } - int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset) + bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset) { struct extent_map *em; struct map_lookup *map; - int readonly = 0; int miss_ndevs = 0; int i; + bool ret = true; em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); if (IS_ERR(em)) - return 1; + return false; map = em->map_lookup; for (i = 0; i < map->num_stripes; i++) { @@@ -5618,21 -5674,20 +5674,20 @@@ } if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &map->stripes[i].dev->dev_state)) { - readonly = 1; + ret = false; goto end; } } /* - * If the number of missing devices is larger than max errors, - * we can not write the data into that chunk successfully, so - * set it readonly. + * If the number of missing devices is larger than max errors, we can + * not write the data into that chunk successfully. 
*/ if (miss_ndevs > btrfs_chunk_max_errors(map)) - readonly = 1; + ret = false; end: free_extent_map(em); - return readonly; + return ret; } void btrfs_mapping_tree_free(struct extent_map_tree *tree) @@@ -5795,7 -5850,7 +5850,7 @@@ static int find_live_mirror(struct btrf } /* Bubble-sort the stripe set to put the parity/syndrome stripes last */ - static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes) + static void sort_parity_stripes(struct btrfs_io_context *bioc, int num_stripes) { int i; int again = 1; @@@ -5804,52 -5859,55 +5859,55 @@@ again = 0; for (i = 0; i < num_stripes - 1; i++) { /* Swap if parity is on a smaller index */ - if (bbio->raid_map[i] > bbio->raid_map[i + 1]) { - swap(bbio->stripes[i], bbio->stripes[i + 1]); - swap(bbio->raid_map[i], bbio->raid_map[i + 1]); + if (bioc->raid_map[i] > bioc->raid_map[i + 1]) { + swap(bioc->stripes[i], bioc->stripes[i + 1]); + swap(bioc->raid_map[i], bioc->raid_map[i + 1]); again = 1; } } } } - static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes) + static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, + int total_stripes, + int real_stripes) { - struct btrfs_bio *bbio = kzalloc( - /* the size of the btrfs_bio */ - sizeof(struct btrfs_bio) + - /* plus the variable array for the stripes */ - sizeof(struct btrfs_bio_stripe) * (total_stripes) + - /* plus the variable array for the tgt dev */ + struct btrfs_io_context *bioc = kzalloc( + /* The size of btrfs_io_context */ + sizeof(struct btrfs_io_context) + + /* Plus the variable array for the stripes */ + sizeof(struct btrfs_io_stripe) * (total_stripes) + + /* Plus the variable array for the tgt dev */ sizeof(int) * (real_stripes) + /* - * plus the raid_map, which includes both the tgt dev - * and the stripes + * Plus the raid_map, which includes both the tgt dev + * and the stripes. */ sizeof(u64) * (total_stripes), GFP_NOFS|__GFP_NOFAIL); - atomic_set(&bbio->error, 0); - refcount_set(&bbio->refs, 1); + atomic_set(&bioc->error, 0); + refcount_set(&bioc->refs, 1); - bbio->tgtdev_map = (int *)(bbio->stripes + total_stripes); - bbio->raid_map = (u64 *)(bbio->tgtdev_map + real_stripes); + bioc->fs_info = fs_info; + bioc->tgtdev_map = (int *)(bioc->stripes + total_stripes); + bioc->raid_map = (u64 *)(bioc->tgtdev_map + real_stripes); - return bbio; + return bioc; } - void btrfs_get_bbio(struct btrfs_bio *bbio) + void btrfs_get_bioc(struct btrfs_io_context *bioc) { - WARN_ON(!refcount_read(&bbio->refs)); - refcount_inc(&bbio->refs); + WARN_ON(!refcount_read(&bioc->refs)); + refcount_inc(&bioc->refs); } - void btrfs_put_bbio(struct btrfs_bio *bbio) + void btrfs_put_bioc(struct btrfs_io_context *bioc) { - if (!bbio) + if (!bioc) return; - if (refcount_dec_and_test(&bbio->refs)) - kfree(bbio); + if (refcount_dec_and_test(&bioc->refs)) + kfree(bioc); } /* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */ @@@ -5859,11 -5917,11 +5917,11 @@@ */ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, u64 logical, u64 *length_ret, - struct btrfs_bio **bbio_ret) + struct btrfs_io_context **bioc_ret) { struct extent_map *em; struct map_lookup *map; - struct btrfs_bio *bbio; + struct btrfs_io_context *bioc; u64 length = *length_ret; u64 offset; u64 stripe_nr; @@@ -5882,8 -5940,8 +5940,8 @@@ int ret = 0; int i; - /* discard always return a bbio */ - ASSERT(bbio_ret); + /* Discard always returns a bioc. 
*/ + ASSERT(bioc_ret); em = btrfs_get_chunk_map(fs_info, logical, length); if (IS_ERR(em)) @@@ -5946,26 -6004,25 +6004,25 @@@ &stripe_index); } - bbio = alloc_btrfs_bio(num_stripes, 0); - if (!bbio) { + bioc = alloc_btrfs_io_context(fs_info, num_stripes, 0); + if (!bioc) { ret = -ENOMEM; goto out; } for (i = 0; i < num_stripes; i++) { - bbio->stripes[i].physical = + bioc->stripes[i].physical = map->stripes[stripe_index].physical + stripe_offset + stripe_nr * map->stripe_len; - bbio->stripes[i].dev = map->stripes[stripe_index].dev; + bioc->stripes[i].dev = map->stripes[stripe_index].dev; if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { - bbio->stripes[i].length = stripes_per_dev * + bioc->stripes[i].length = stripes_per_dev * map->stripe_len; if (i / sub_stripes < remaining_stripes) - bbio->stripes[i].length += - map->stripe_len; + bioc->stripes[i].length += map->stripe_len; /* * Special for the first stripe and @@@ -5976,19 -6033,17 +6033,17 @@@ * off end_off */ if (i < sub_stripes) - bbio->stripes[i].length -= - stripe_offset; + bioc->stripes[i].length -= stripe_offset; if (stripe_index >= last_stripe && stripe_index <= (last_stripe + sub_stripes - 1)) - bbio->stripes[i].length -= - stripe_end_offset; + bioc->stripes[i].length -= stripe_end_offset; if (i == sub_stripes - 1) stripe_offset = 0; } else { - bbio->stripes[i].length = length; + bioc->stripes[i].length = length; } stripe_index++; @@@ -5998,9 -6053,9 +6053,9 @@@ } } - *bbio_ret = bbio; - bbio->map_type = map->type; - bbio->num_stripes = num_stripes; + *bioc_ret = bioc; + bioc->map_type = map->type; + bioc->num_stripes = num_stripes; out: free_extent_map(em); return ret; @@@ -6024,7 -6079,7 +6079,7 @@@ static int get_extra_mirror_from_replac u64 srcdev_devid, int *mirror_num, u64 *physical) { - struct btrfs_bio *bbio = NULL; + struct btrfs_io_context *bioc = NULL; int num_stripes; int index_srcdev = 0; int found = 0; @@@ -6033,20 -6088,20 +6088,20 @@@ int ret = 0; ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, - logical, &length, &bbio, 0, 0); + logical, &length, &bioc, 0, 0); if (ret) { - ASSERT(bbio == NULL); + ASSERT(bioc == NULL); return ret; } - num_stripes = bbio->num_stripes; + num_stripes = bioc->num_stripes; if (*mirror_num > num_stripes) { /* * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror, * that means that the requested area is not left of the left * cursor */ - btrfs_put_bbio(bbio); + btrfs_put_bioc(bioc); return -EIO; } @@@ -6056,7 -6111,7 +6111,7 @@@ * pointer to the one of the target drive. 
*/ for (i = 0; i < num_stripes; i++) { - if (bbio->stripes[i].dev->devid != srcdev_devid) + if (bioc->stripes[i].dev->devid != srcdev_devid) continue; /* @@@ -6064,15 -6119,15 +6119,15 @@@ * mirror with the lowest physical address */ if (found && - physical_of_found <= bbio->stripes[i].physical) + physical_of_found <= bioc->stripes[i].physical) continue; index_srcdev = i; found = 1; - physical_of_found = bbio->stripes[i].physical; + physical_of_found = bioc->stripes[i].physical; } - btrfs_put_bbio(bbio); + btrfs_put_bioc(bioc); ASSERT(found); if (!found) @@@ -6103,12 -6158,12 +6158,12 @@@ static bool is_block_group_to_copy(stru } static void handle_ops_on_dev_replace(enum btrfs_map_op op, - struct btrfs_bio **bbio_ret, + struct btrfs_io_context **bioc_ret, struct btrfs_dev_replace *dev_replace, u64 logical, int *num_stripes_ret, int *max_errors_ret) { - struct btrfs_bio *bbio = *bbio_ret; + struct btrfs_io_context *bioc = *bioc_ret; u64 srcdev_devid = dev_replace->srcdev->devid; int tgtdev_indexes = 0; int num_stripes = *num_stripes_ret; @@@ -6138,17 -6193,17 +6193,17 @@@ */ index_where_to_add = num_stripes; for (i = 0; i < num_stripes; i++) { - if (bbio->stripes[i].dev->devid == srcdev_devid) { + if (bioc->stripes[i].dev->devid == srcdev_devid) { /* write to new disk, too */ - struct btrfs_bio_stripe *new = - bbio->stripes + index_where_to_add; - struct btrfs_bio_stripe *old = - bbio->stripes + i; + struct btrfs_io_stripe *new = + bioc->stripes + index_where_to_add; + struct btrfs_io_stripe *old = + bioc->stripes + i; new->physical = old->physical; new->length = old->length; new->dev = dev_replace->tgtdev; - bbio->tgtdev_map[i] = index_where_to_add; + bioc->tgtdev_map[i] = index_where_to_add; index_where_to_add++; max_errors++; tgtdev_indexes++; @@@ -6168,30 -6223,29 +6223,29 @@@ * full copy of the source drive. 
*/ for (i = 0; i < num_stripes; i++) { - if (bbio->stripes[i].dev->devid == srcdev_devid) { + if (bioc->stripes[i].dev->devid == srcdev_devid) { /* * In case of DUP, in order to keep it simple, * only add the mirror with the lowest physical * address */ if (found && - physical_of_found <= - bbio->stripes[i].physical) + physical_of_found <= bioc->stripes[i].physical) continue; index_srcdev = i; found = 1; - physical_of_found = bbio->stripes[i].physical; + physical_of_found = bioc->stripes[i].physical; } } if (found) { - struct btrfs_bio_stripe *tgtdev_stripe = - bbio->stripes + num_stripes; + struct btrfs_io_stripe *tgtdev_stripe = + bioc->stripes + num_stripes; tgtdev_stripe->physical = physical_of_found; tgtdev_stripe->length = - bbio->stripes[index_srcdev].length; + bioc->stripes[index_srcdev].length; tgtdev_stripe->dev = dev_replace->tgtdev; - bbio->tgtdev_map[index_srcdev] = num_stripes; + bioc->tgtdev_map[index_srcdev] = num_stripes; tgtdev_indexes++; num_stripes++; @@@ -6200,8 -6254,8 +6254,8 @@@ *num_stripes_ret = num_stripes; *max_errors_ret = max_errors; - bbio->num_tgtdevs = tgtdev_indexes; - *bbio_ret = bbio; + bioc->num_tgtdevs = tgtdev_indexes; + *bioc_ret = bioc; } static bool need_full_stripe(enum btrfs_map_op op) @@@ -6304,7 -6358,7 +6358,7 @@@ int btrfs_get_io_geometry(struct btrfs_ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, - struct btrfs_bio **bbio_ret, + struct btrfs_io_context **bioc_ret, int mirror_num, int need_raid_map) { struct extent_map *em; @@@ -6319,7 -6373,7 +6373,7 @@@ int num_stripes; int max_errors = 0; int tgtdev_indexes = 0; - struct btrfs_bio *bbio = NULL; + struct btrfs_io_context *bioc = NULL; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; int dev_replace_is_ongoing = 0; int num_alloc_stripes; @@@ -6328,7 -6382,7 +6382,7 @@@ u64 raid56_full_stripe_start = (u64)-1; struct btrfs_io_geometry geom; - ASSERT(bbio_ret); + ASSERT(bioc_ret); ASSERT(op != BTRFS_MAP_DISCARD); em = btrfs_get_chunk_map(fs_info, logical, *length); @@@ -6472,20 -6526,20 +6526,20 @@@ tgtdev_indexes = num_stripes; } - bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes); - if (!bbio) { + bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes, tgtdev_indexes); + if (!bioc) { ret = -ENOMEM; goto out; } for (i = 0; i < num_stripes; i++) { - bbio->stripes[i].physical = map->stripes[stripe_index].physical + + bioc->stripes[i].physical = map->stripes[stripe_index].physical + stripe_offset + stripe_nr * map->stripe_len; - bbio->stripes[i].dev = map->stripes[stripe_index].dev; + bioc->stripes[i].dev = map->stripes[stripe_index].dev; stripe_index++; } - /* build raid_map */ + /* Build raid_map */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { u64 tmp; @@@ -6497,15 -6551,15 +6551,15 @@@ /* Fill in the logical address of each stripe */ tmp = stripe_nr * data_stripes; for (i = 0; i < data_stripes; i++) - bbio->raid_map[(i+rot) % num_stripes] = + bioc->raid_map[(i + rot) % num_stripes] = em->start + (tmp + i) * map->stripe_len; - bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE; + bioc->raid_map[(i + rot) % map->num_stripes] = RAID5_P_STRIPE; if (map->type & BTRFS_BLOCK_GROUP_RAID6) - bbio->raid_map[(i+rot+1) % num_stripes] = + bioc->raid_map[(i + rot + 1) % num_stripes] = RAID6_Q_STRIPE; - sort_parity_stripes(bbio, num_stripes); + sort_parity_stripes(bioc, num_stripes); } if (need_full_stripe(op)) @@@ -6513,15 -6567,15 +6567,15 @@@ if 
(dev_replace_is_ongoing && dev_replace->tgtdev != NULL && need_full_stripe(op)) { - handle_ops_on_dev_replace(op, &bbio, dev_replace, logical, + handle_ops_on_dev_replace(op, &bioc, dev_replace, logical, &num_stripes, &max_errors); } - *bbio_ret = bbio; - bbio->map_type = map->type; - bbio->num_stripes = num_stripes; - bbio->max_errors = max_errors; - bbio->mirror_num = mirror_num; + *bioc_ret = bioc; + bioc->map_type = map->type; + bioc->num_stripes = num_stripes; + bioc->max_errors = max_errors; + bioc->mirror_num = mirror_num; /* * this is the case that REQ_READ && dev_replace_is_ongoing && @@@ -6530,9 -6584,9 +6584,9 @@@ */ if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) { WARN_ON(num_stripes > 1); - bbio->stripes[0].dev = dev_replace->tgtdev; - bbio->stripes[0].physical = physical_to_patch_in_first_stripe; - bbio->mirror_num = map->num_stripes + 1; + bioc->stripes[0].dev = dev_replace->tgtdev; + bioc->stripes[0].physical = physical_to_patch_in_first_stripe; + bioc->mirror_num = map->num_stripes + 1; } out: if (dev_replace_is_ongoing) { @@@ -6546,43 -6600,43 +6600,43 @@@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, - struct btrfs_bio **bbio_ret, int mirror_num) + struct btrfs_io_context **bioc_ret, int mirror_num) { if (op == BTRFS_MAP_DISCARD) return __btrfs_map_block_for_discard(fs_info, logical, - length, bbio_ret); + length, bioc_ret); - return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, + return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, mirror_num, 0); } /* For Scrub/replace */ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, - struct btrfs_bio **bbio_ret) + struct btrfs_io_context **bioc_ret) { - return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1); + return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 0, 1); } - static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio) + static inline void btrfs_end_bioc(struct btrfs_io_context *bioc, struct bio *bio) { - bio->bi_private = bbio->private; - bio->bi_end_io = bbio->end_io; + bio->bi_private = bioc->private; + bio->bi_end_io = bioc->end_io; bio_endio(bio); - btrfs_put_bbio(bbio); + btrfs_put_bioc(bioc); } static void btrfs_end_bio(struct bio *bio) { - struct btrfs_bio *bbio = bio->bi_private; + struct btrfs_io_context *bioc = bio->bi_private; int is_orig_bio = 0; if (bio->bi_status) { - atomic_inc(&bbio->error); + atomic_inc(&bioc->error); if (bio->bi_status == BLK_STS_IOERR || bio->bi_status == BLK_STS_TARGET) { - struct btrfs_device *dev = btrfs_io_bio(bio)->device; + struct btrfs_device *dev = btrfs_bio(bio)->device; ASSERT(dev->bdev); if (btrfs_op(bio) == BTRFS_MAP_WRITE) @@@ -6597,22 -6651,22 +6651,22 @@@ } } - if (bio == bbio->orig_bio) + if (bio == bioc->orig_bio) is_orig_bio = 1; - btrfs_bio_counter_dec(bbio->fs_info); + btrfs_bio_counter_dec(bioc->fs_info); - if (atomic_dec_and_test(&bbio->stripes_pending)) { + if (atomic_dec_and_test(&bioc->stripes_pending)) { if (!is_orig_bio) { bio_put(bio); - bio = bbio->orig_bio; + bio = bioc->orig_bio; } - btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; + btrfs_bio(bio)->mirror_num = bioc->mirror_num; /* only send an error to the higher layers if it is * beyond the tolerance of the btrfs bio */ - if (atomic_read(&bbio->error) > bbio->max_errors) { + if (atomic_read(&bioc->error) > bioc->max_errors) { bio->bi_status = BLK_STS_IOERR; } else { /* @@@ -6622,19 -6676,19 +6676,19 @@@ 
bio->bi_status = BLK_STS_OK; } - btrfs_end_bbio(bbio, bio); + btrfs_end_bioc(bioc, bio); } else if (!is_orig_bio) { bio_put(bio); } } - static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio, + static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio, u64 physical, struct btrfs_device *dev) { - struct btrfs_fs_info *fs_info = bbio->fs_info; + struct btrfs_fs_info *fs_info = bioc->fs_info; - bio->bi_private = bbio; - btrfs_io_bio(bio)->device = dev; + bio->bi_private = bioc; + btrfs_bio(bio)->device = dev; bio->bi_end_io = btrfs_end_bio; bio->bi_iter.bi_sector = physical >> 9; /* @@@ -6663,20 -6717,20 +6717,20 @@@ btrfsic_submit_bio(bio); } - static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) + static void bioc_error(struct btrfs_io_context *bioc, struct bio *bio, u64 logical) { - atomic_inc(&bbio->error); - if (atomic_dec_and_test(&bbio->stripes_pending)) { + atomic_inc(&bioc->error); + if (atomic_dec_and_test(&bioc->stripes_pending)) { /* Should be the original bio. */ - WARN_ON(bio != bbio->orig_bio); + WARN_ON(bio != bioc->orig_bio); - btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; + btrfs_bio(bio)->mirror_num = bioc->mirror_num; bio->bi_iter.bi_sector = logical >> 9; - if (atomic_read(&bbio->error) > bbio->max_errors) + if (atomic_read(&bioc->error) > bioc->max_errors) bio->bi_status = BLK_STS_IOERR; else bio->bi_status = BLK_STS_OK; - btrfs_end_bbio(bbio, bio); + btrfs_end_bioc(bioc, bio); } } @@@ -6691,36 -6745,34 +6745,34 @@@ blk_status_t btrfs_map_bio(struct btrfs int ret; int dev_nr; int total_devs; - struct btrfs_bio *bbio = NULL; + struct btrfs_io_context *bioc = NULL; length = bio->bi_iter.bi_size; map_length = length; btrfs_bio_counter_inc_blocked(fs_info); ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, - &map_length, &bbio, mirror_num, 1); + &map_length, &bioc, mirror_num, 1); if (ret) { btrfs_bio_counter_dec(fs_info); return errno_to_blk_status(ret); } - total_devs = bbio->num_stripes; - bbio->orig_bio = first_bio; - bbio->private = first_bio->bi_private; - bbio->end_io = first_bio->bi_end_io; - bbio->fs_info = fs_info; - atomic_set(&bbio->stripes_pending, bbio->num_stripes); + total_devs = bioc->num_stripes; + bioc->orig_bio = first_bio; + bioc->private = first_bio->bi_private; + bioc->end_io = first_bio->bi_end_io; + atomic_set(&bioc->stripes_pending, bioc->num_stripes); - if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && + if ((bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) { /* In this case, map_length has been set to the length of a single stripe; not the whole write */ if (btrfs_op(bio) == BTRFS_MAP_WRITE) { - ret = raid56_parity_write(fs_info, bio, bbio, - map_length); + ret = raid56_parity_write(bio, bioc, map_length); } else { - ret = raid56_parity_recover(fs_info, bio, bbio, - map_length, mirror_num, 1); + ret = raid56_parity_recover(bio, bioc, map_length, + mirror_num, 1); } btrfs_bio_counter_dec(fs_info); @@@ -6735,12 -6787,12 +6787,12 @@@ } for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { - dev = bbio->stripes[dev_nr].dev; + dev = bioc->stripes[dev_nr].dev; if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || (btrfs_op(first_bio) == BTRFS_MAP_WRITE && !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { - bbio_error(bbio, first_bio, logical); + bioc_error(bioc, first_bio, logical); continue; } @@@ -6749,12 -6801,39 +6801,39 @@@ else bio = first_bio; - submit_stripe_bio(bbio, bio, 
bbio->stripes[dev_nr].physical, dev); + submit_stripe_bio(bioc, bio, bioc->stripes[dev_nr].physical, dev); } btrfs_bio_counter_dec(fs_info); return BLK_STS_OK; } + static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args, + const struct btrfs_fs_devices *fs_devices) + { + if (args->fsid == NULL) + return true; + if (memcmp(fs_devices->metadata_uuid, args->fsid, BTRFS_FSID_SIZE) == 0) + return true; + return false; + } + + static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args, + const struct btrfs_device *device) + { + ASSERT((args->devid != (u64)-1) || args->missing); + + if ((args->devid != (u64)-1) && device->devid != args->devid) + return false; + if (args->uuid && memcmp(device->uuid, args->uuid, BTRFS_UUID_SIZE) != 0) + return false; + if (!args->missing) + return true; + if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) && + !device->bdev) + return true; + return false; + } + /* * Find a device specified by @devid or @uuid in the list of @fs_devices, or * return NULL. @@@ -6762,31 -6841,25 +6841,25 @@@ * If devid and uuid are both specified, the match must be exact, otherwise * only devid is used. */ - struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices, - u64 devid, u8 *uuid, u8 *fsid) + struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices, + const struct btrfs_dev_lookup_args *args) { struct btrfs_device *device; struct btrfs_fs_devices *seed_devs; - if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { + if (dev_args_match_fs_devices(args, fs_devices)) { list_for_each_entry(device, &fs_devices->devices, dev_list) { - if (device->devid == devid && - (!uuid || memcmp(device->uuid, uuid, - BTRFS_UUID_SIZE) == 0)) + if (dev_args_match_device(args, device)) return device; } } list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { - if (!fsid || - !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { - list_for_each_entry(device, &seed_devs->devices, - dev_list) { - if (device->devid == devid && - (!uuid || memcmp(device->uuid, uuid, - BTRFS_UUID_SIZE) == 0)) - return device; - } + if (!dev_args_match_fs_devices(args, seed_devs)) + continue; + list_for_each_entry(device, &seed_devs->devices, dev_list) { + if (dev_args_match_device(args, device)) + return device; } } @@@ -6952,6 -7025,7 +7025,7 @@@ static void warn_32bit_meta_chunk(struc static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, struct btrfs_chunk *chunk) { + BTRFS_DEV_LOOKUP_ARGS(args); struct btrfs_fs_info *fs_info = leaf->fs_info; struct extent_map_tree *map_tree = &fs_info->mapping_tree; struct map_lookup *map; @@@ -7029,11 -7103,12 +7103,12 @@@ map->stripes[i].physical = btrfs_stripe_offset_nr(leaf, chunk, i); devid = btrfs_stripe_devid_nr(leaf, chunk, i); + args.devid = devid; read_extent_buffer(leaf, uuid, (unsigned long) btrfs_stripe_dev_uuid_nr(chunk, i), BTRFS_UUID_SIZE); - map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, - devid, uuid, NULL); + args.uuid = uuid; + map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, &args); if (!map->stripes[i].dev && !btrfs_test_opt(fs_info, DEGRADED)) { free_extent_map(em); @@@ -7151,6 -7226,7 +7226,7 @@@ static struct btrfs_fs_devices *open_se static int read_one_dev(struct extent_buffer *leaf, struct btrfs_dev_item *dev_item) { + BTRFS_DEV_LOOKUP_ARGS(args); struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; 
@@@ -7159,11 -7235,13 +7235,13 @@@ u8 fs_uuid[BTRFS_FSID_SIZE]; u8 dev_uuid[BTRFS_UUID_SIZE]; - devid = btrfs_device_id(leaf, dev_item); + devid = args.devid = btrfs_device_id(leaf, dev_item); read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), BTRFS_UUID_SIZE); read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), BTRFS_FSID_SIZE); + args.uuid = dev_uuid; + args.fsid = fs_uuid; if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) { fs_devices = open_seed_devices(fs_info, fs_uuid); @@@ -7171,8 -7249,7 +7249,7 @@@ return PTR_ERR(fs_devices); } - device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, - fs_uuid); + device = btrfs_find_device(fs_info->fs_devices, &args); if (!device) { if (!btrfs_test_opt(fs_info, DEGRADED)) { btrfs_report_missing_device(fs_info, devid, @@@ -7236,7 -7313,7 +7313,7 @@@ fill_device_from_item(leaf, dev_item, device); if (device->bdev) { - u64 max_total_bytes = i_size_read(device->bdev->bd_inode); + u64 max_total_bytes = bdev_nr_bytes(device->bdev); if (device->total_bytes > max_total_bytes) { btrfs_err(fs_info, @@@ -7841,12 -7918,14 +7918,14 @@@ static void btrfs_dev_stat_print_on_loa int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_get_dev_stats *stats) { + BTRFS_DEV_LOOKUP_ARGS(args); struct btrfs_device *dev; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; int i; mutex_lock(&fs_devices->device_list_mutex); - dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL); + args.devid = stats->devid; + dev = btrfs_find_device(fs_info->fs_devices, &args); mutex_unlock(&fs_devices->device_list_mutex); if (!dev) { @@@ -7922,6 -8001,7 +8001,7 @@@ static int verify_one_dev_extent(struc u64 chunk_offset, u64 devid, u64 physical_offset, u64 physical_len) { + struct btrfs_dev_lookup_args args = { .devid = devid }; struct extent_map_tree *em_tree = &fs_info->mapping_tree; struct extent_map *em; struct map_lookup *map; @@@ -7977,7 -8057,7 +8057,7 @@@ } /* Make sure no dev extent is beyond device boundary */ - dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL); + dev = btrfs_find_device(fs_info->fs_devices, &args); if (!dev) { btrfs_err(fs_info, "failed to find devid %llu", devid); ret = -EUCLEAN; diff --combined include/linux/fs.h index 0dcb902,56eba72..f3cfca5 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@@ -48,7 -48,6 +48,7 @@@ struct backing_dev_info; struct bdi_writeback; struct bio; +struct io_comp_batch; struct export_operations; struct fiemap_extent_info; struct hd_geometry; @@@ -330,12 -329,16 +330,12 @@@ struct kiocb randomized_struct_fields_start loff_t ki_pos; - void (*ki_complete)(struct kiocb *iocb, long ret, long ret2); + void (*ki_complete)(struct kiocb *iocb, long ret); void *private; int ki_flags; u16 ki_hint; u16 ki_ioprio; /* See linux/ioprio.h */ - union { - unsigned int ki_cookie; /* for ->iopoll */ - struct wait_page_queue *ki_waitq; /* for async buffered IO */ - }; - + struct wait_page_queue *ki_waitq; /* for async buffered IO */ randomized_struct_fields_end }; @@@ -2072,8 -2075,7 +2072,8 @@@ struct file_operations ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); ssize_t (*read_iter) (struct kiocb *, struct iov_iter *); ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); - int (*iopoll)(struct kiocb *kiocb, bool spin); + int (*iopoll)(struct kiocb *kiocb, struct io_comp_batch *, + unsigned int flags); int (*iterate) (struct file *, struct dir_context *); int (*iterate_shared) (struct file *, struct dir_context *); 
__poll_t (*poll) (struct file *, struct poll_table_struct *); @@@ -2496,6 -2498,8 +2496,8 @@@ enum file_time_flags extern bool atime_needs_update(const struct path *, struct inode *); extern void touch_atime(const struct path *); + int inode_update_time(struct inode *inode, struct timespec64 *time, int flags); + static inline void file_accessed(struct file *file) { if (!(file->f_flags & O_NOATIME))