ceph: plumb in decryption during reads
author Jeff Layton <jlayton@kernel.org>
Thu, 25 Aug 2022 13:31:22 +0000 (09:31 -0400)
committer Ilya Dryomov <idryomov@gmail.com>
Thu, 24 Aug 2023 09:24:36 +0000 (11:24 +0200)
Force the use of sparse reads when the inode is encrypted, and add the
appropriate code to decrypt the extents described by the sparse extent
map once the read data has been received.

Note that the crypto block may be smaller than a page, but the reverse
cannot be true.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Reviewed-and-tested-by: Luís Henriques <lhenriques@suse.de>
Reviewed-by: Milind Changire <mchangir@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
fs/ceph/addr.c
fs/ceph/file.c

index 89ca278..95ff849 100644 (file)
@@ -18,6 +18,7 @@
 #include "mds_client.h"
 #include "cache.h"
 #include "metric.h"
+#include "crypto.h"
 #include <linux/ceph/osd_client.h>
 #include <linux/ceph/striper.h>
 
@@ -242,7 +243,8 @@ static bool ceph_netfs_clamp_length(struct netfs_io_subrequest *subreq)
 
 static void finish_netfs_read(struct ceph_osd_request *req)
 {
-       struct ceph_fs_client *fsc = ceph_inode_to_client(req->r_inode);
+       struct inode *inode = req->r_inode;
+       struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
        struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
        struct netfs_io_subrequest *subreq = req->r_priv;
        struct ceph_osd_req_op *op = &req->r_ops[0];
@@ -256,16 +258,31 @@ static void finish_netfs_read(struct ceph_osd_request *req)
             subreq->len, i_size_read(req->r_inode));
 
        /* no object means success but no data */
-       if (sparse && err >= 0)
-               err = ceph_sparse_ext_map_end(op);
-       else if (err == -ENOENT)
+       if (err == -ENOENT)
                err = 0;
        else if (err == -EBLOCKLISTED)
                fsc->blocklisted = true;
 
-       if (err >= 0 && err < subreq->len)
-               __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+       if (err >= 0) {
+               if (sparse && err > 0)
+                       err = ceph_sparse_ext_map_end(op);
+               if (err < subreq->len)
+                       __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+               if (IS_ENCRYPTED(inode) && err > 0) {
+                       err = ceph_fscrypt_decrypt_extents(inode,
+                                       osd_data->pages, subreq->start,
+                                       op->extent.sparse_ext,
+                                       op->extent.sparse_ext_cnt);
+                       if (err > subreq->len)
+                               err = subreq->len;
+               }
+       }
 
+       if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) {
+               ceph_put_page_vector(osd_data->pages,
+                                    calc_pages_for(osd_data->alignment,
+                                       osd_data->length), false);
+       }
        netfs_subreq_terminated(subreq, err, false);
        iput(req->r_inode);
 }
@@ -336,7 +353,8 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
        struct iov_iter iter;
        int err = 0;
        u64 len = subreq->len;
-       bool sparse = ceph_test_mount_opt(fsc, SPARSEREAD);
+       bool sparse = IS_ENCRYPTED(inode) || ceph_test_mount_opt(fsc, SPARSEREAD);
+       u64 off = subreq->start;
 
        if (ceph_inode_is_shutdown(inode)) {
                err = -EIO;
@@ -346,8 +364,10 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
        if (ceph_has_inline_data(ci) && ceph_netfs_issue_op_inline(subreq))
                return;
 
-       req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, subreq->start, &len,
-                       0, 1, sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ,
+       ceph_fscrypt_adjust_off_and_len(inode, &off, &len);
+
+       req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino,
+                       off, &len, 0, 1, sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ,
                        CEPH_OSD_FLAG_READ | fsc->client->osdc.client->options->read_from_replica,
                        NULL, ci->i_truncate_seq, ci->i_truncate_size, false);
        if (IS_ERR(req)) {
@@ -363,8 +383,37 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
        }
 
        dout("%s: pos=%llu orig_len=%zu len=%llu\n", __func__, subreq->start, subreq->len, len);
+
        iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len);
-       osd_req_op_extent_osd_iter(req, 0, &iter);
+
+       /*
+        * FIXME: For now, use CEPH_OSD_DATA_TYPE_PAGES instead of _ITER for
+        * encrypted inodes. We'd need infrastructure that handles an iov_iter
+        * instead of page arrays, and we don't have that as of yet. Once the
+        * dust settles on the write helpers and encrypt/decrypt routines for
+        * netfs, we should be able to rework this.
+        */
+       if (IS_ENCRYPTED(inode)) {
+               struct page **pages;
+               size_t page_off;
+
+               err = iov_iter_get_pages_alloc2(&iter, &pages, len, &page_off);
+               if (err < 0) {
+                       dout("%s: iov_ter_get_pages_alloc returned %d\n",
+                            __func__, err);
+                       goto out;
+               }
+
+               /* should always give us a page-aligned read */
+               WARN_ON_ONCE(page_off);
+               len = err;
+               err = 0;
+
+               osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false,
+                                                false);
+       } else {
+               osd_req_op_extent_osd_iter(req, 0, &iter);
+       }
        req->r_callback = finish_netfs_read;
        req->r_priv = subreq;
        req->r_inode = inode;
index 45e00e4..9d1a77c 100644 (file)
@@ -970,7 +970,7 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
        u64 off = *ki_pos;
        u64 len = iov_iter_count(to);
        u64 i_size = i_size_read(inode);
-       bool sparse = ceph_test_mount_opt(fsc, SPARSEREAD);
+       bool sparse = IS_ENCRYPTED(inode) || ceph_test_mount_opt(fsc, SPARSEREAD);
        u64 objver = 0;
 
        dout("sync_read on inode %p %llx~%llx\n", inode, *ki_pos, len);
@@ -1001,10 +1001,19 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
                int idx;
                size_t left;
                struct ceph_osd_req_op *op;
+               u64 read_off = off;
+               u64 read_len = len;
+
+               /* determine new offset/length if encrypted */
+               ceph_fscrypt_adjust_off_and_len(inode, &read_off, &read_len);
+
+               dout("sync_read orig %llu~%llu reading %llu~%llu",
+                    off, len, read_off, read_len);
 
                req = ceph_osdc_new_request(osdc, &ci->i_layout,
-                                       ci->i_vino, off, &len, 0, 1,
-                                       sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ,
+                                       ci->i_vino, read_off, &read_len, 0, 1,
+                                       sparse ? CEPH_OSD_OP_SPARSE_READ :
+                                                CEPH_OSD_OP_READ,
                                        CEPH_OSD_FLAG_READ,
                                        NULL, ci->i_truncate_seq,
                                        ci->i_truncate_size, false);
@@ -1013,10 +1022,13 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
                        break;
                }
 
+               /* adjust len downward if the request truncated the len */
+               if (off + len > read_off + read_len)
+                       len = read_off + read_len - off;
                more = len < iov_iter_count(to);
 
-               num_pages = calc_pages_for(off, len);
-               page_off = off & ~PAGE_MASK;
+               num_pages = calc_pages_for(read_off, read_len);
+               page_off = offset_in_page(off);
                pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
                if (IS_ERR(pages)) {
                        ceph_osdc_put_request(req);
@@ -1024,7 +1036,8 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
                        break;
                }
 
-               osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off,
+               osd_req_op_extent_osd_data_pages(req, 0, pages, read_len,
+                                                offset_in_page(read_off),
                                                 false, false);
 
                op = &req->r_ops[0];
@@ -1042,7 +1055,7 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
                ceph_update_read_metrics(&fsc->mdsc->metric,
                                         req->r_start_latency,
                                         req->r_end_latency,
-                                        len, ret);
+                                        read_len, ret);
 
                if (ret > 0)
                        objver = req->r_version;
@@ -1057,8 +1070,35 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
                else if (ret == -ENOENT)
                        ret = 0;
 
+               if (ret > 0 && IS_ENCRYPTED(inode)) {
+                       int fret;
+
+                       fret = ceph_fscrypt_decrypt_extents(inode, pages,
+                                       read_off, op->extent.sparse_ext,
+                                       op->extent.sparse_ext_cnt);
+                       if (fret < 0) {
+                               ret = fret;
+                               ceph_osdc_put_request(req);
+                               break;
+                       }
+
+                       /* account for any partial block at the beginning */
+                       fret -= (off - read_off);
+
+                       /*
+                        * Short read after big offset adjustment?
+                        * Nothing is usable, just call it a zero
+                        * len read.
+                        */
+                       fret = max(fret, 0);
+
+                       /* account for partial block at the end */
+                       ret = min_t(ssize_t, fret, len);
+               }
+
                ceph_osdc_put_request(req);
 
+               /* Short read but not EOF? Zero out the remainder. */
                if (ret >= 0 && ret < len && (off + ret < i_size)) {
                        int zlen = min(len - ret, i_size - off - ret);
                        int zoff = page_off + ret;
@@ -1072,15 +1112,16 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
                idx = 0;
                left = ret > 0 ? ret : 0;
                while (left > 0) {
-                       size_t len, copied;
-                       page_off = off & ~PAGE_MASK;
-                       len = min_t(size_t, left, PAGE_SIZE - page_off);
+                       size_t plen, copied;
+
+                       plen = min_t(size_t, left, PAGE_SIZE - page_off);
                        SetPageUptodate(pages[idx]);
                        copied = copy_page_to_iter(pages[idx++],
-                                                  page_off, len, to);
+                                                  page_off, plen, to);
                        off += copied;
                        left -= copied;
-                       if (copied < len) {
+                       page_off = 0;
+                       if (copied < plen) {
                                ret = -EFAULT;
                                break;
                        }
@@ -1097,20 +1138,21 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
                        break;
        }
 
-       if (off > *ki_pos) {
-               if (off >= i_size) {
-                       *retry_op = CHECK_EOF;
-                       ret = i_size - *ki_pos;
-                       *ki_pos = i_size;
-               } else {
-                       ret = off - *ki_pos;
-                       *ki_pos = off;
+       if (ret > 0) {
+               if (off > *ki_pos) {
+                       if (off >= i_size) {
+                               *retry_op = CHECK_EOF;
+                               ret = i_size - *ki_pos;
+                               *ki_pos = i_size;
+                       } else {
+                               ret = off - *ki_pos;
+                               *ki_pos = off;
+                       }
                }
-       }
-
-       if (last_objver && ret > 0)
-               *last_objver = objver;
 
+               if (last_objver)
+                       *last_objver = objver;
+       }
        dout("sync_read result %zd retry_op %d\n", ret, *retry_op);
        return ret;
 }