btrfs: add BTRFS_IOC_ENCODED_READ ioctl
author		Omar Sandoval <osandov@fb.com>
		Thu, 10 Oct 2019 00:59:07 +0000 (17:59 -0700)
committer	David Sterba <dsterba@suse.com>
		Mon, 14 Mar 2022 12:13:51 +0000 (13:13 +0100)
There are 4 main cases when reading a file range in encoded form:

1. Inline extents: we copy the data straight out of the extent buffer.
2. Hole/preallocated extents: we fill in zeroes.
3. Regular, uncompressed extents: we read the sectors we need directly
   from disk.
4. Regular, compressed extents: we read the entire compressed extent
   from disk and indicate what subset of the decompressed extent is in
   the file.
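
For illustration only (not part of this patch), here is a minimal
userspace sketch of driving the new ioctl. It assumes the UAPI
definitions of struct btrfs_ioctl_encoded_io_args and
BTRFS_IOC_ENCODED_READ from <linux/btrfs.h>, and the caller needs
CAP_SYS_ADMIN:

  #include <string.h>
  #include <sys/ioctl.h>
  #include <sys/types.h>
  #include <sys/uio.h>
  #include <linux/btrfs.h>

  /*
   * Read one encoded extent starting at file offset "offset" into "buf".
   * The buffer must be large enough for the entire encoded extent, or
   * the ioctl fails with ENOBUFS. Returns the number of encoded bytes
   * read, or -1 with errno set.
   */
  static ssize_t encoded_pread(int fd, void *buf, size_t len, off_t offset,
                               struct btrfs_ioctl_encoded_io_args *args)
  {
          struct iovec iov = { .iov_base = buf, .iov_len = len };

          memset(args, 0, sizeof(*args));
          args->iov = &iov;
          args->iovcnt = 1;
          args->offset = offset;
          return ioctl(fd, BTRFS_IOC_ENCODED_READ, args);
  }

On success, args.len is the amount of decoded file data the extent
covers (so the next encoded read should start at offset + args.len),
args.compression is one of the BTRFS_ENCODED_IO_COMPRESSION_* values,
and for a compressed extent the file data is the args.len bytes
starting args.unencoded_offset bytes into the args.unencoded_len bytes
that the extent decompresses to.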

This initial implementation simplifies a few things that can be improved
in the future:

- Cases 1, 3, and 4 allocate temporary memory to read into before
  copying out to userspace.
- We don't do read repair, because it turns out that read repair is
  currently broken for compressed data.
- We hold the inode lock during the operation.

Note that we don't need to hold the mmap lock. We may race with
btrfs_page_mkwrite():

btrfs_page_mkwrite         btrfs_encoded_read
---------------------------------------------------
(enter)                    (enter)
                           btrfs_wait_ordered_range
lock_extent_bits
btrfs_page_set_dirty
unlock_extent_cached
(exit)
                           lock_extent_bits
                           read extent (dirty page hasn't been flushed,
                                        so this is the old data)
                           unlock_extent_cached
                           (exit)

Here we read the old data from before the page was dirtied. But that's
true even if we were to hold the mmap lock:

btrfs_page_mkwrite               btrfs_encoded_read
-------------------------------------------------------------------
(enter)                          (enter)
                                 btrfs_inode_lock(BTRFS_ILOCK_MMAP)
down_read(i_mmap_lock) (blocked)
                                 btrfs_wait_ordered_range
                                 lock_extent_bits
                                 read extent (page hasn't been dirtied,
                                              so this is the old data)
                                 unlock_extent_cached
                                 btrfs_inode_unlock(BTRFS_ILOCK_MMAP)
down_read(i_mmap_lock) returns
lock_extent_bits
btrfs_page_set_dirty
unlock_extent_cached

In other words, this is inherently racy, so it's fine that we return the
old data in this tiny window.

Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
fs/btrfs/ctree.h
fs/btrfs/inode.c
fs/btrfs/ioctl.c

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 91b27d5..22a3fef 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -49,6 +49,7 @@ extern struct kmem_cache *btrfs_free_space_bitmap_cachep;
 struct btrfs_ordered_sum;
 struct btrfs_ref;
 struct btrfs_bio;
+struct btrfs_ioctl_encoded_io_args;
 
 #define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */
 
@@ -3305,6 +3306,9 @@ int btrfs_writepage_cow_fixup(struct page *page);
 void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
                                          struct page *page, u64 start,
                                          u64 end, bool uptodate);
+ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
+                          struct btrfs_ioctl_encoded_io_args *encoded);
+
 extern const struct dentry_operations btrfs_dentry_operations;
 extern const struct iomap_ops btrfs_dio_iomap_ops;
 extern const struct iomap_dio_ops btrfs_dio_ops;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 004d2dd..505cd20 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -10156,6 +10156,504 @@ void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
        }
 }
 
+static int btrfs_encoded_io_compression_from_extent(
+                               struct btrfs_fs_info *fs_info,
+                               int compress_type)
+{
+       switch (compress_type) {
+       case BTRFS_COMPRESS_NONE:
+               return BTRFS_ENCODED_IO_COMPRESSION_NONE;
+       case BTRFS_COMPRESS_ZLIB:
+               return BTRFS_ENCODED_IO_COMPRESSION_ZLIB;
+       case BTRFS_COMPRESS_LZO:
+               /*
+                * The LZO format depends on the sector size. 64K is the maximum
+                * sector size that we support.
+                */
+               if (fs_info->sectorsize < SZ_4K || fs_info->sectorsize > SZ_64K)
+                       return -EINVAL;
+               return BTRFS_ENCODED_IO_COMPRESSION_LZO_4K +
+                      (fs_info->sectorsize_bits - 12);
+       case BTRFS_COMPRESS_ZSTD:
+               return BTRFS_ENCODED_IO_COMPRESSION_ZSTD;
+       default:
+               return -EUCLEAN;
+       }
+}
+
+static ssize_t btrfs_encoded_read_inline(
+                               struct kiocb *iocb,
+                               struct iov_iter *iter, u64 start,
+                               u64 lockend,
+                               struct extent_state **cached_state,
+                               u64 extent_start, size_t count,
+                               struct btrfs_ioctl_encoded_io_args *encoded,
+                               bool *unlocked)
+{
+       struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
+       struct btrfs_root *root = inode->root;
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct extent_io_tree *io_tree = &inode->io_tree;
+       struct btrfs_path *path;
+       struct extent_buffer *leaf;
+       struct btrfs_file_extent_item *item;
+       u64 ram_bytes;
+       unsigned long ptr;
+       void *tmp;
+       ssize_t ret;
+
+       path = btrfs_alloc_path();
+       if (!path) {
+               ret = -ENOMEM;
+               goto out;
+       }
+       ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
+                                      extent_start, 0);
+       if (ret) {
+               if (ret > 0) {
+                       /* The extent item disappeared? */
+                       ret = -EIO;
+               }
+               goto out;
+       }
+       leaf = path->nodes[0];
+       item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
+
+       ram_bytes = btrfs_file_extent_ram_bytes(leaf, item);
+       ptr = btrfs_file_extent_inline_start(item);
+
+       encoded->len = min_t(u64, extent_start + ram_bytes,
+                            inode->vfs_inode.i_size) - iocb->ki_pos;
+       ret = btrfs_encoded_io_compression_from_extent(fs_info,
+                                btrfs_file_extent_compression(leaf, item));
+       if (ret < 0)
+               goto out;
+       encoded->compression = ret;
+       if (encoded->compression) {
+               size_t inline_size;
+
+               inline_size = btrfs_file_extent_inline_item_len(leaf,
+                                                               path->slots[0]);
+               if (inline_size > count) {
+                       ret = -ENOBUFS;
+                       goto out;
+               }
+               count = inline_size;
+               encoded->unencoded_len = ram_bytes;
+               encoded->unencoded_offset = iocb->ki_pos - extent_start;
+       } else {
+               count = min_t(u64, count, encoded->len);
+               encoded->len = count;
+               encoded->unencoded_len = count;
+               ptr += iocb->ki_pos - extent_start;
+       }
+
+       tmp = kmalloc(count, GFP_NOFS);
+       if (!tmp) {
+               ret = -ENOMEM;
+               goto out;
+       }
+       read_extent_buffer(leaf, tmp, ptr, count);
+       btrfs_release_path(path);
+       unlock_extent_cached(io_tree, start, lockend, cached_state);
+       btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+       *unlocked = true;
+
+       ret = copy_to_iter(tmp, count, iter);
+       if (ret != count)
+               ret = -EFAULT;
+       kfree(tmp);
+out:
+       btrfs_free_path(path);
+       return ret;
+}
+
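+/*
+ * State shared by all bios submitted for one encoded read: "pending" counts
+ * the bios still in flight plus one reference held by the submitter, "status"
+ * records any I/O or checksum error observed, and "skip_csum" is set for
+ * NODATASUM inodes.
+ */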
+struct btrfs_encoded_read_private {
+       struct btrfs_inode *inode;
+       u64 file_offset;
+       wait_queue_head_t wait;
+       atomic_t pending;
+       blk_status_t status;
+       bool skip_csum;
+};
+
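+/*
+ * Look up the data checksums for the bio unless the inode skips checksums,
+ * route its completion through the endio workqueue, and map and submit it.
+ * priv->pending is elevated while the bio is in flight.
+ */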
+static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode,
+                                           struct bio *bio, int mirror_num)
+{
+       struct btrfs_encoded_read_private *priv = bio->bi_private;
+       struct btrfs_bio *bbio = btrfs_bio(bio);
+       struct btrfs_fs_info *fs_info = inode->root->fs_info;
+       blk_status_t ret;
+
+       if (!priv->skip_csum) {
+               ret = btrfs_lookup_bio_sums(&inode->vfs_inode, bio, NULL);
+               if (ret)
+                       return ret;
+       }
+
+       ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
+       if (ret) {
+               btrfs_bio_free_csum(bbio);
+               return ret;
+       }
+
+       atomic_inc(&priv->pending);
+       ret = btrfs_map_bio(fs_info, bio, mirror_num);
+       if (ret) {
+               atomic_dec(&priv->pending);
+               btrfs_bio_free_csum(bbio);
+       }
+       return ret;
+}
+
+static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio)
+{
+       const bool uptodate = (bbio->bio.bi_status == BLK_STS_OK);
+       struct btrfs_encoded_read_private *priv = bbio->bio.bi_private;
+       struct btrfs_inode *inode = priv->inode;
+       struct btrfs_fs_info *fs_info = inode->root->fs_info;
+       u32 sectorsize = fs_info->sectorsize;
+       struct bio_vec *bvec;
+       struct bvec_iter_all iter_all;
+       u64 start = priv->file_offset;
+       u32 bio_offset = 0;
+
+       if (priv->skip_csum || !uptodate)
+               return bbio->bio.bi_status;
+
+       bio_for_each_segment_all(bvec, &bbio->bio, iter_all) {
+               unsigned int i, nr_sectors, pgoff;
+
+               nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len);
+               pgoff = bvec->bv_offset;
+               for (i = 0; i < nr_sectors; i++) {
+                       ASSERT(pgoff < PAGE_SIZE);
+                       if (check_data_csum(&inode->vfs_inode, bbio, bio_offset,
+                                           bvec->bv_page, pgoff, start))
+                               return BLK_STS_IOERR;
+                       start += sectorsize;
+                       bio_offset += sectorsize;
+                       pgoff += sectorsize;
+               }
+       }
+       return BLK_STS_OK;
+}
+
+static void btrfs_encoded_read_endio(struct bio *bio)
+{
+       struct btrfs_encoded_read_private *priv = bio->bi_private;
+       struct btrfs_bio *bbio = btrfs_bio(bio);
+       blk_status_t status;
+
+       status = btrfs_encoded_read_verify_csum(bbio);
+       if (status) {
+               /*
+                * The memory barrier implied by the atomic_dec_return() here
+                * pairs with the memory barrier implied by the
+                * atomic_dec_return() or io_wait_event() in
+                * btrfs_encoded_read_regular_fill_pages() to ensure that this
+                * write is observed before the load of status in
+                * btrfs_encoded_read_regular_fill_pages().
+                */
+               WRITE_ONCE(priv->status, status);
+       }
+       if (!atomic_dec_return(&priv->pending))
+               wake_up(&priv->wait);
+       btrfs_bio_free_csum(bbio);
+       bio_put(bio);
+}
+
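+/*
+ * Read the disk_io_size bytes of the (possibly compressed) extent at
+ * disk_bytenr into the given array of full pages, submitting bios split
+ * along bio and stripe boundaries and waiting for them all to complete.
+ */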
+static int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
+                                                u64 file_offset,
+                                                u64 disk_bytenr,
+                                                u64 disk_io_size,
+                                                struct page **pages)
+{
+       struct btrfs_fs_info *fs_info = inode->root->fs_info;
+       struct btrfs_encoded_read_private priv = {
+               .inode = inode,
+               .file_offset = file_offset,
+               .pending = ATOMIC_INIT(1),
+               .skip_csum = (inode->flags & BTRFS_INODE_NODATASUM),
+       };
+       unsigned long i = 0;
+       u64 cur = 0;
+       int ret;
+
+       init_waitqueue_head(&priv.wait);
+       /*
+        * Submit bios for the extent, splitting due to bio or stripe limits as
+        * necessary.
+        */
+       while (cur < disk_io_size) {
+               struct extent_map *em;
+               struct btrfs_io_geometry geom;
+               struct bio *bio = NULL;
+               u64 remaining;
+
+               em = btrfs_get_chunk_map(fs_info, disk_bytenr + cur,
+                                        disk_io_size - cur);
+               if (IS_ERR(em)) {
+                       ret = PTR_ERR(em);
+               } else {
+                       ret = btrfs_get_io_geometry(fs_info, em, BTRFS_MAP_READ,
+                                                   disk_bytenr + cur, &geom);
+                       free_extent_map(em);
+               }
+               if (ret) {
+                       WRITE_ONCE(priv.status, errno_to_blk_status(ret));
+                       break;
+               }
+               remaining = min(geom.len, disk_io_size - cur);
+               while (bio || remaining) {
+                       size_t bytes = min_t(u64, remaining, PAGE_SIZE);
+
+                       if (!bio) {
+                               bio = btrfs_bio_alloc(BIO_MAX_VECS);
+                               bio->bi_iter.bi_sector =
+                                       (disk_bytenr + cur) >> SECTOR_SHIFT;
+                               bio->bi_end_io = btrfs_encoded_read_endio;
+                               bio->bi_private = &priv;
+                               bio->bi_opf = REQ_OP_READ;
+                       }
+
+                       if (!bytes ||
+                           bio_add_page(bio, pages[i], bytes, 0) < bytes) {
+                               blk_status_t status;
+
+                               status = submit_encoded_read_bio(inode, bio, 0);
+                               if (status) {
+                                       WRITE_ONCE(priv.status, status);
+                                       bio_put(bio);
+                                       goto out;
+                               }
+                               bio = NULL;
+                               continue;
+                       }
+
+                       i++;
+                       cur += bytes;
+                       remaining -= bytes;
+               }
+       }
+
+out:
+       if (atomic_dec_return(&priv.pending))
+               io_wait_event(priv.wait, !atomic_read(&priv.pending));
+       /* See btrfs_encoded_read_endio() for ordering. */
+       return blk_status_to_errno(READ_ONCE(priv.status));
+}
+
+static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
+                                         struct iov_iter *iter,
+                                         u64 start, u64 lockend,
+                                         struct extent_state **cached_state,
+                                         u64 disk_bytenr, u64 disk_io_size,
+                                         size_t count, bool compressed,
+                                         bool *unlocked)
+{
+       struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
+       struct extent_io_tree *io_tree = &inode->io_tree;
+       struct page **pages;
+       unsigned long nr_pages, i;
+       u64 cur;
+       size_t page_offset;
+       ssize_t ret;
+
+       nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
+       pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
+       if (!pages)
+               return -ENOMEM;
+       for (i = 0; i < nr_pages; i++) {
+               pages[i] = alloc_page(GFP_NOFS);
+               if (!pages[i]) {
+                       ret = -ENOMEM;
+                       goto out;
+               }
+       }
+
+       ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr,
+                                                   disk_io_size, pages);
+       if (ret)
+               goto out;
+
+       unlock_extent_cached(io_tree, start, lockend, cached_state);
+       btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+       *unlocked = true;
+
+       if (compressed) {
+               i = 0;
+               page_offset = 0;
+       } else {
+               i = (iocb->ki_pos - start) >> PAGE_SHIFT;
+               page_offset = (iocb->ki_pos - start) & (PAGE_SIZE - 1);
+       }
+       cur = 0;
+       while (cur < count) {
+               size_t bytes = min_t(size_t, count - cur,
+                                    PAGE_SIZE - page_offset);
+
+               if (copy_page_to_iter(pages[i], page_offset, bytes,
+                                     iter) != bytes) {
+                       ret = -EFAULT;
+                       goto out;
+               }
+               i++;
+               cur += bytes;
+               page_offset = 0;
+       }
+       ret = count;
+out:
+       for (i = 0; i < nr_pages; i++) {
+               if (pages[i])
+                       __free_page(pages[i]);
+       }
+       kfree(pages);
+       return ret;
+}
+
+ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
+                          struct btrfs_ioctl_encoded_io_args *encoded)
+{
+       struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
+       struct btrfs_fs_info *fs_info = inode->root->fs_info;
+       struct extent_io_tree *io_tree = &inode->io_tree;
+       ssize_t ret;
+       size_t count = iov_iter_count(iter);
+       u64 start, lockend, disk_bytenr, disk_io_size;
+       struct extent_state *cached_state = NULL;
+       struct extent_map *em;
+       bool unlocked = false;
+
+       file_accessed(iocb->ki_filp);
+
+       btrfs_inode_lock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+
+       if (iocb->ki_pos >= inode->vfs_inode.i_size) {
+               btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+               return 0;
+       }
+       start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize);
+       /*
+        * We don't know how long the extent containing iocb->ki_pos is, but if
+        * it's compressed we know that it won't be longer than this.
+        */
+       lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
+
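+       /*
+        * Wait for any ordered extents in the range to finish, then lock the
+        * range in the io tree. Loop because a new ordered extent could be
+        * created after we wait but before we take the extent lock.
+        */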
+       for (;;) {
+               struct btrfs_ordered_extent *ordered;
+
+               ret = btrfs_wait_ordered_range(&inode->vfs_inode, start,
+                                              lockend - start + 1);
+               if (ret)
+                       goto out_unlock_inode;
+               lock_extent_bits(io_tree, start, lockend, &cached_state);
+               ordered = btrfs_lookup_ordered_range(inode, start,
+                                                    lockend - start + 1);
+               if (!ordered)
+                       break;
+               btrfs_put_ordered_extent(ordered);
+               unlock_extent_cached(io_tree, start, lockend, &cached_state);
+               cond_resched();
+       }
+
+       em = btrfs_get_extent(inode, NULL, 0, start, lockend - start + 1);
+       if (IS_ERR(em)) {
+               ret = PTR_ERR(em);
+               goto out_unlock_extent;
+       }
+
+       if (em->block_start == EXTENT_MAP_INLINE) {
+               u64 extent_start = em->start;
+
+               /*
+                * For inline extents we get everything we need out of the
+                * extent item.
+                */
+               free_extent_map(em);
+               em = NULL;
+               ret = btrfs_encoded_read_inline(iocb, iter, start, lockend,
+                                               &cached_state, extent_start,
+                                               count, encoded, &unlocked);
+               goto out;
+       }
+
+       /*
+        * We only want to return up to EOF even if the extent extends beyond
+        * that.
+        */
+       encoded->len = min_t(u64, extent_map_end(em),
+                            inode->vfs_inode.i_size) - iocb->ki_pos;
+       if (em->block_start == EXTENT_MAP_HOLE ||
+           test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+               disk_bytenr = EXTENT_MAP_HOLE;
+               count = min_t(u64, count, encoded->len);
+               encoded->len = count;
+               encoded->unencoded_len = count;
+       } else if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+               disk_bytenr = em->block_start;
+               /*
+                * Bail if the buffer isn't large enough to return the whole
+                * compressed extent.
+                */
+               if (em->block_len > count) {
+                       ret = -ENOBUFS;
+                       goto out_em;
+               }
+               disk_io_size = count = em->block_len;
+               encoded->unencoded_len = em->ram_bytes;
+               encoded->unencoded_offset = iocb->ki_pos - em->orig_start;
+               ret = btrfs_encoded_io_compression_from_extent(fs_info,
+                                                            em->compress_type);
+               if (ret < 0)
+                       goto out_em;
+               encoded->compression = ret;
+       } else {
+               disk_bytenr = em->block_start + (start - em->start);
+               if (encoded->len > count)
+                       encoded->len = count;
+               /*
+                * Don't read beyond what we locked. This also limits the page
+                * allocations that we'll do.
+                */
+               disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start;
+               count = start + disk_io_size - iocb->ki_pos;
+               encoded->len = count;
+               encoded->unencoded_len = count;
+               disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize);
+       }
+       free_extent_map(em);
+       em = NULL;
+
+       if (disk_bytenr == EXTENT_MAP_HOLE) {
+               unlock_extent_cached(io_tree, start, lockend, &cached_state);
+               btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+               unlocked = true;
+               ret = iov_iter_zero(count, iter);
+               if (ret != count)
+                       ret = -EFAULT;
+       } else {
+               ret = btrfs_encoded_read_regular(iocb, iter, start, lockend,
+                                                &cached_state, disk_bytenr,
+                                                disk_io_size, count,
+                                                encoded->compression,
+                                                &unlocked);
+       }
+
+out:
+       if (ret >= 0)
+               iocb->ki_pos += encoded->len;
+out_em:
+       free_extent_map(em);
+out_unlock_extent:
+       if (!unlocked)
+               unlock_extent_cached(io_tree, start, lockend, &cached_state);
+out_unlock_inode:
+       if (!unlocked)
+               btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+       return ret;
+}
+
 #ifdef CONFIG_SWAP
 /*
  * Add an entry indicating a block group or device which is pinned by a
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 413e2e7..a6994d2 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -28,6 +28,7 @@
 #include <linux/iversion.h>
 #include <linux/fileattr.h>
 #include <linux/fsverity.h>
+#include <linux/sched/xacct.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "export.h"
@@ -88,6 +89,22 @@ struct btrfs_ioctl_send_args_32 {
 
 #define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \
                               struct btrfs_ioctl_send_args_32)
+
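+/*
+ * 32-bit layout of struct btrfs_ioctl_encoded_io_args, for the compat ioctl
+ * on 64-bit kernels.
+ */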
+struct btrfs_ioctl_encoded_io_args_32 {
+       compat_uptr_t iov;
+       compat_ulong_t iovcnt;
+       __s64 offset;
+       __u64 flags;
+       __u64 len;
+       __u64 unencoded_len;
+       __u64 unencoded_offset;
+       __u32 compression;
+       __u32 encryption;
+       __u8 reserved[64];
+};
+
+#define BTRFS_IOC_ENCODED_READ_32 _IOR(BTRFS_IOCTL_MAGIC, 64, \
+                                      struct btrfs_ioctl_encoded_io_args_32)
 #endif
 
 /* Mask out flags that are inappropriate for the given type of inode. */
@@ -5195,6 +5212,89 @@ static int _btrfs_ioctl_send(struct inode *inode, void __user *argp, bool compat
        return ret;
 }
 
+static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp,
+                                   bool compat)
+{
+       struct btrfs_ioctl_encoded_io_args args = { 0 };
+       size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args,
+                                            flags);
+       size_t copy_end;
+       struct iovec iovstack[UIO_FASTIOV];
+       struct iovec *iov = iovstack;
+       struct iov_iter iter;
+       loff_t pos;
+       struct kiocb kiocb;
+       ssize_t ret;
+
+       if (!capable(CAP_SYS_ADMIN)) {
+               ret = -EPERM;
+               goto out_acct;
+       }
+
+       if (compat) {
+#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+               struct btrfs_ioctl_encoded_io_args_32 args32;
+
+               copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32,
+                                      flags);
+               if (copy_from_user(&args32, argp, copy_end)) {
+                       ret = -EFAULT;
+                       goto out_acct;
+               }
+               args.iov = compat_ptr(args32.iov);
+               args.iovcnt = args32.iovcnt;
+               args.offset = args32.offset;
+               args.flags = args32.flags;
+#else
+               return -ENOTTY;
+#endif
+       } else {
+               copy_end = copy_end_kernel;
+               if (copy_from_user(&args, argp, copy_end)) {
+                       ret = -EFAULT;
+                       goto out_acct;
+               }
+       }
+       if (args.flags != 0) {
+               ret = -EINVAL;
+               goto out_acct;
+       }
+
+       ret = import_iovec(READ, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
+                          &iov, &iter);
+       if (ret < 0)
+               goto out_acct;
+
+       if (iov_iter_count(&iter) == 0) {
+               ret = 0;
+               goto out_iov;
+       }
+       pos = args.offset;
+       ret = rw_verify_area(READ, file, &pos, args.len);
+       if (ret < 0)
+               goto out_iov;
+
+       init_sync_kiocb(&kiocb, file);
+       kiocb.ki_pos = pos;
+
+       ret = btrfs_encoded_read(&kiocb, &iter, &args);
+       if (ret >= 0) {
+               fsnotify_access(file);
+               if (copy_to_user(argp + copy_end,
+                                (char *)&args + copy_end_kernel,
+                                sizeof(args) - copy_end_kernel))
+                       ret = -EFAULT;
+       }
+
+out_iov:
+       kfree(iov);
+out_acct:
+       if (ret > 0)
+               add_rchar(current, ret);
+       inc_syscr(current);
+       return ret;
+}
+
 long btrfs_ioctl(struct file *file, unsigned int
                cmd, unsigned long arg)
 {
@@ -5339,6 +5439,12 @@ long btrfs_ioctl(struct file *file, unsigned int
                return fsverity_ioctl_enable(file, (const void __user *)argp);
        case FS_IOC_MEASURE_VERITY:
                return fsverity_ioctl_measure(file, argp);
+       case BTRFS_IOC_ENCODED_READ:
+               return btrfs_ioctl_encoded_read(file, argp, false);
+#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+       case BTRFS_IOC_ENCODED_READ_32:
+               return btrfs_ioctl_encoded_read(file, argp, true);
+#endif
        }
 
        return -ENOTTY;