Merge tag 'ceph-for-6.6-rc1' of https://github.com/ceph/ceph-client
[platform/kernel/linux-rpi.git] / fs / ceph / inode.c
index fd05d68..800ab79 100644 (file)
 #include <linux/random.h>
 #include <linux/sort.h>
 #include <linux/iversion.h>
+#include <linux/fscrypt.h>
 
 #include "super.h"
 #include "mds_client.h"
 #include "cache.h"
+#include "crypto.h"
 #include <linux/ceph/decode.h>
 
 /*
@@ -33,6 +35,7 @@
  */
 
 static const struct inode_operations ceph_symlink_iops;
+static const struct inode_operations ceph_encrypted_symlink_iops;
 
 static void ceph_inode_work(struct work_struct *work);
 
@@ -52,17 +55,99 @@ static int ceph_set_ino_cb(struct inode *inode, void *data)
        return 0;
 }
 
-struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
+/**
+ * ceph_new_inode - allocate a new inode in advance of an expected create
+ * @dir: parent directory for new inode
+ * @dentry: dentry that may eventually point to new inode
+ * @mode: mode of new inode
+ * @as_ctx: pointer to inherited security context
+ *
+ * Allocate a new inode in advance of an operation to create a new inode.
+ * This allocates the inode and sets up the acl_sec_ctx with appropriate
+ * info for the new inode.
+ *
+ * Returns a pointer to the new inode or an ERR_PTR.
+ */
+struct inode *ceph_new_inode(struct inode *dir, struct dentry *dentry,
+                            umode_t *mode, struct ceph_acl_sec_ctx *as_ctx)
+{
+       int err;
+       struct inode *inode;
+
+       inode = new_inode(dir->i_sb);
+       if (!inode)
+               return ERR_PTR(-ENOMEM);
+
+       if (!S_ISLNK(*mode)) {
+               err = ceph_pre_init_acls(dir, mode, as_ctx);
+               if (err < 0)
+                       goto out_err;
+       }
+
+       inode->i_state = 0;
+       inode->i_mode = *mode;
+
+       err = ceph_security_init_secctx(dentry, *mode, as_ctx);
+       if (err < 0)
+               goto out_err;
+
+       /*
+        * Skip setting the fscrypt context for snapshots; that is left
+        * to handle_reply().
+        */
+       if (ceph_snap(dir) != CEPH_SNAPDIR) {
+               err = ceph_fscrypt_prepare_context(dir, inode, as_ctx);
+               if (err)
+                       goto out_err;
+       }
+
+       return inode;
+out_err:
+       iput(inode);
+       return ERR_PTR(err);
+}
+
+void ceph_as_ctx_to_req(struct ceph_mds_request *req,
+                       struct ceph_acl_sec_ctx *as_ctx)
+{
+       if (as_ctx->pagelist) {
+               req->r_pagelist = as_ctx->pagelist;
+               as_ctx->pagelist = NULL;
+       }
+       ceph_fscrypt_as_ctx_to_req(req, as_ctx);
+}
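For illustration, a minimal sketch of how a create-type path is expected to pair these two helpers. The function name, the pre-allocated MDS request and the later cleanup of the acl_sec_ctx are assumptions for the sketch, not part of this patch:

static int example_prepare_create(struct inode *dir, struct dentry *dentry,
                                  struct ceph_mds_request *req,
                                  struct inode **newino)
{
        struct ceph_acl_sec_ctx as_ctx = {};
        umode_t mode = S_IFREG | 0644;

        *newino = ceph_new_inode(dir, dentry, &mode, &as_ctx);
        if (IS_ERR(*newino))
                return PTR_ERR(*newino);

        /* hand the prepared ACL/security/fscrypt blobs to the MDS request */
        ceph_as_ctx_to_req(req, &as_ctx);

        /* releasing as_ctx and hashing *newino on reply are elided here */
        return 0;
}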
+
+/**
+ * ceph_get_inode - find or create/hash a new inode
+ * @sb: superblock to search and allocate in
+ * @vino: vino to search for
+ * @newino: optional new inode to insert if one isn't found (may be NULL)
+ *
+ * Search for or insert a new inode into the hash for the given vino, and
+ * return a reference to it. If @newino is non-NULL, its reference is consumed.
+ */
+struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino,
+                            struct inode *newino)
 {
        struct inode *inode;
 
        if (ceph_vino_is_reserved(vino))
                return ERR_PTR(-EREMOTEIO);
 
-       inode = iget5_locked(sb, (unsigned long)vino.ino, ceph_ino_compare,
-                            ceph_set_ino_cb, &vino);
-       if (!inode)
+       if (newino) {
+               inode = inode_insert5(newino, (unsigned long)vino.ino,
+                                     ceph_ino_compare, ceph_set_ino_cb, &vino);
+               if (inode != newino)
+                       iput(newino);
+       } else {
+               inode = iget5_locked(sb, (unsigned long)vino.ino,
+                                    ceph_ino_compare, ceph_set_ino_cb, &vino);
+       }
+
+       if (!inode) {
+               dout("No inode found for %llx.%llx\n", vino.ino, vino.snap);
                return ERR_PTR(-ENOMEM);
+       }
 
        dout("get_inode on %llu=%llx.%llx got %p new %d\n", ceph_present_inode(inode),
             ceph_vinop(inode), inode, !!(inode->i_state & I_NEW));
@@ -78,8 +163,9 @@ struct inode *ceph_get_snapdir(struct inode *parent)
                .ino = ceph_ino(parent),
                .snap = CEPH_SNAPDIR,
        };
-       struct inode *inode = ceph_get_inode(parent->i_sb, vino);
+       struct inode *inode = ceph_get_inode(parent->i_sb, vino, NULL);
        struct ceph_inode_info *ci = ceph_inode(inode);
+       int ret = -ENOTDIR;
 
        if (IS_ERR(inode))
                return inode;
@@ -105,6 +191,24 @@ struct inode *ceph_get_snapdir(struct inode *parent)
        ci->i_rbytes = 0;
        ci->i_btime = ceph_inode(parent)->i_btime;
 
+#ifdef CONFIG_FS_ENCRYPTION
+       /* if encrypted, just borrow fscrypt_auth from parent */
+       if (IS_ENCRYPTED(parent)) {
+               struct ceph_inode_info *pci = ceph_inode(parent);
+
+               ci->fscrypt_auth = kmemdup(pci->fscrypt_auth,
+                                          pci->fscrypt_auth_len,
+                                          GFP_KERNEL);
+               if (ci->fscrypt_auth) {
+                       inode->i_flags |= S_ENCRYPTED;
+                       ci->fscrypt_auth_len = pci->fscrypt_auth_len;
+               } else {
+                       dout("Failed to alloc snapdir fscrypt_auth\n");
+                       ret = -ENOMEM;
+                       goto err;
+               }
+       }
+#endif
        if (inode->i_state & I_NEW) {
                inode->i_op = &ceph_snapdir_iops;
                inode->i_fop = &ceph_snapdir_fops;
@@ -118,7 +222,7 @@ err:
                discard_new_inode(inode);
        else
                iput(inode);
-       return ERR_PTR(-ENOTDIR);
+       return ERR_PTR(ret);
 }
 
 const struct inode_operations ceph_file_iops = {
@@ -517,6 +621,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
        ci->i_truncate_seq = 0;
        ci->i_truncate_size = 0;
        ci->i_truncate_pending = 0;
+       ci->i_truncate_pagecache_size = 0;
 
        ci->i_max_size = 0;
        ci->i_reported_size = 0;
@@ -547,6 +652,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
        INIT_WORK(&ci->i_work, ceph_inode_work);
        ci->i_work_mask = 0;
        memset(&ci->i_btime, '\0', sizeof(ci->i_btime));
+#ifdef CONFIG_FS_ENCRYPTION
+       ci->fscrypt_auth = NULL;
+       ci->fscrypt_auth_len = 0;
+#endif
        return &ci->netfs.inode;
 }
 
@@ -555,6 +664,10 @@ void ceph_free_inode(struct inode *inode)
        struct ceph_inode_info *ci = ceph_inode(inode);
 
        kfree(ci->i_symlink);
+#ifdef CONFIG_FS_ENCRYPTION
+       kfree(ci->fscrypt_auth);
+#endif
+       fscrypt_free_inode(inode);
        kmem_cache_free(ceph_inode_cachep, ci);
 }
 
@@ -575,6 +688,7 @@ void ceph_evict_inode(struct inode *inode)
        clear_inode(inode);
 
        ceph_fscache_unregister_inode_cookie(ci);
+       fscrypt_put_encryption_info(inode);
 
        __ceph_remove_caps(ci);
 
@@ -650,7 +764,7 @@ int ceph_fill_file_size(struct inode *inode, int issued,
                        ceph_fscache_update(inode);
                ci->i_reported_size = size;
                if (truncate_seq != ci->i_truncate_seq) {
-                       dout("truncate_seq %u -> %u\n",
+                       dout("%s truncate_seq %u -> %u\n", __func__,
                             ci->i_truncate_seq, truncate_seq);
                        ci->i_truncate_seq = truncate_seq;
 
@@ -674,11 +788,26 @@ int ceph_fill_file_size(struct inode *inode, int issued,
                        }
                }
        }
-       if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0 &&
-           ci->i_truncate_size != truncate_size) {
-               dout("truncate_size %lld -> %llu\n", ci->i_truncate_size,
-                    truncate_size);
+
+       /*
+        * Two consecutive truncations may land in the same fscrypt
+        * last block, so the page cache still has to be truncated to
+        * the new logical size even when i_truncate_size itself does
+        * not change.
+        */
+       if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0) {
+               dout("%s truncate_size %lld -> %llu, encrypted %d\n", __func__,
+                    ci->i_truncate_size, truncate_size, !!IS_ENCRYPTED(inode));
+
                ci->i_truncate_size = truncate_size;
+
+               if (IS_ENCRYPTED(inode)) {
+                       dout("%s truncate_pagecache_size %lld -> %llu\n",
+                            __func__, ci->i_truncate_pagecache_size, size);
+                       ci->i_truncate_pagecache_size = size;
+               } else {
+                       ci->i_truncate_pagecache_size = truncate_size;
+               }
        }
        return queue_trunc;
 }
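A worked example of how the two truncate fields diverge on an encrypted inode, assuming the usual 4 KiB CEPH_FSCRYPT_BLOCK_SIZE (illustrative only, not part of the patch):

/*
 * An encrypted file truncated to 6000 bytes is stored as
 * round_up(6000, 4096) == 8192 bytes of ciphertext, so the MDS reports
 * truncate_size == 8192 while ceph_fill_inode() recovers the logical
 * size 6000 from fscrypt_file. The block above therefore ends up with:
 *
 *      ci->i_truncate_size           = 8192;   // block-aligned object size
 *      ci->i_truncate_pagecache_size = 6000;   // logical size, used later by
 *                                              // __ceph_do_pending_vmtruncate()
 *
 * For unencrypted inodes both fields track truncate_size.
 */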
@@ -752,6 +881,34 @@ void ceph_fill_file_time(struct inode *inode, int issued,
                     inode, time_warp_seq, ci->i_time_warp_seq);
 }
 
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+static int decode_encrypted_symlink(const char *encsym, int enclen, u8 **decsym)
+{
+       int declen;
+       u8 *sym;
+
+       sym = kmalloc(enclen + 1, GFP_NOFS);
+       if (!sym)
+               return -ENOMEM;
+
+       declen = ceph_base64_decode(encsym, enclen, sym);
+       if (declen < 0) {
+               pr_err("%s: can't decode symlink (%d). Content: %.*s\n",
+                      __func__, declen, enclen, encsym);
+               kfree(sym);
+               return -EIO;
+       }
+       sym[declen] = '\0';
+       *decsym = sym;
+       return declen;
+}
+#else
+static int decode_encrypted_symlink(const char *encsym, int symlen, u8 **decsym)
+{
+       return -EOPNOTSUPP;
+}
+#endif
+
 /*
  * Populate an inode based on info from mds.  May be called on new or
  * existing inodes.
@@ -857,15 +1014,20 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
        issued |= __ceph_caps_dirty(ci);
        new_issued = ~issued & info_caps;
 
-       /* directories have fl_stripe_unit set to zero */
-       if (le32_to_cpu(info->layout.fl_stripe_unit))
-               inode->i_blkbits =
-                       fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
-       else
-               inode->i_blkbits = CEPH_BLOCK_SHIFT;
-
        __ceph_update_quota(ci, iinfo->max_bytes, iinfo->max_files);
 
+#ifdef CONFIG_FS_ENCRYPTION
+       if (iinfo->fscrypt_auth_len &&
+           ((inode->i_state & I_NEW) || (ci->fscrypt_auth_len == 0))) {
+               kfree(ci->fscrypt_auth);
+               ci->fscrypt_auth_len = iinfo->fscrypt_auth_len;
+               ci->fscrypt_auth = iinfo->fscrypt_auth;
+               iinfo->fscrypt_auth = NULL;
+               iinfo->fscrypt_auth_len = 0;
+               inode_set_flags(inode, S_ENCRYPTED, S_ENCRYPTED);
+       }
+#endif
+
        if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
            (issued & CEPH_CAP_AUTH_EXCL) == 0) {
                inode->i_mode = mode;
@@ -878,6 +1040,15 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
                ceph_decode_timespec64(&ci->i_snap_btime, &iinfo->snap_btime);
        }
 
+       /* directories have fl_stripe_unit set to zero */
+       if (IS_ENCRYPTED(inode))
+               inode->i_blkbits = CEPH_FSCRYPT_BLOCK_SHIFT;
+       else if (le32_to_cpu(info->layout.fl_stripe_unit))
+               inode->i_blkbits =
+                       fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
+       else
+               inode->i_blkbits = CEPH_BLOCK_SHIFT;
+
        if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
            (issued & CEPH_CAP_LINK_EXCL) == 0)
                set_nlink(inode, le32_to_cpu(info->nlink));
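For illustration, the i_blkbits selection that results from the reordered block above (a sketch only; it assumes the usual 4 MiB CEPH_BLOCK default):

static unsigned int example_blkbits(bool encrypted, u32 stripe_unit)
{
        if (encrypted)
                return CEPH_FSCRYPT_BLOCK_SHIFT;        /* fscrypt block granularity */
        if (stripe_unit)
                return fls(stripe_unit) - 1;            /* e.g. 4 MiB stripe unit -> 22 */
        return CEPH_BLOCK_SHIFT;                        /* directories: stripe_unit == 0 */
}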
@@ -899,6 +1070,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
 
        if (new_version ||
            (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
+               u64 size = le64_to_cpu(info->size);
                s64 old_pool = ci->i_layout.pool_id;
                struct ceph_string *old_ns;
 
@@ -912,10 +1084,22 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
 
                pool_ns = old_ns;
 
+               if (IS_ENCRYPTED(inode) && size &&
+                   iinfo->fscrypt_file_len == sizeof(__le64)) {
+                       u64 fsize = __le64_to_cpu(*(__le64 *)iinfo->fscrypt_file);
+
+                       if (size == round_up(fsize, CEPH_FSCRYPT_BLOCK_SIZE)) {
+                               size = fsize;
+                       } else {
+                               pr_warn("fscrypt size mismatch: size=%llu fscrypt_file=%llu, discarding fscrypt_file size.\n",
+                                       size, fsize);
+                       }
+               }
+
                queue_trunc = ceph_fill_file_size(inode, issued,
                                        le32_to_cpu(info->truncate_seq),
                                        le64_to_cpu(info->truncate_size),
-                                       le64_to_cpu(info->size));
+                                       size);
                /* only update max_size on auth cap */
                if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
                    ci->i_max_size != le64_to_cpu(info->max_size)) {
@@ -975,26 +1159,42 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
                inode->i_fop = &ceph_file_fops;
                break;
        case S_IFLNK:
-               inode->i_op = &ceph_symlink_iops;
                if (!ci->i_symlink) {
                        u32 symlen = iinfo->symlink_len;
                        char *sym;
 
                        spin_unlock(&ci->i_ceph_lock);
 
-                       if (symlen != i_size_read(inode)) {
-                               pr_err("%s %llx.%llx BAD symlink "
-                                       "size %lld\n", __func__,
-                                       ceph_vinop(inode),
-                                       i_size_read(inode));
+                       if (IS_ENCRYPTED(inode)) {
+                               if (symlen != i_size_read(inode))
+                                       pr_err("%s %llx.%llx BAD symlink size %lld\n",
+                                               __func__, ceph_vinop(inode),
+                                               i_size_read(inode));
+
+                               err = decode_encrypted_symlink(iinfo->symlink,
+                                                              symlen, (u8 **)&sym);
+                               if (err < 0) {
+                                       pr_err("%s decoding encrypted symlink failed: %d\n",
+                                               __func__, err);
+                                       goto out;
+                               }
+                               symlen = err;
                                i_size_write(inode, symlen);
                                inode->i_blocks = calc_inode_blocks(symlen);
-                       }
+                       } else {
+                               if (symlen != i_size_read(inode)) {
+                                       pr_err("%s %llx.%llx BAD symlink size %lld\n",
+                                               __func__, ceph_vinop(inode),
+                                               i_size_read(inode));
+                                       i_size_write(inode, symlen);
+                                       inode->i_blocks = calc_inode_blocks(symlen);
+                               }
 
-                       err = -ENOMEM;
-                       sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS);
-                       if (!sym)
-                               goto out;
+                               err = -ENOMEM;
+                               sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS);
+                               if (!sym)
+                                       goto out;
+                       }
 
                        spin_lock(&ci->i_ceph_lock);
                        if (!ci->i_symlink)
@@ -1002,7 +1202,17 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
                        else
                                kfree(sym); /* lost a race */
                }
-               inode->i_link = ci->i_symlink;
+
+               if (IS_ENCRYPTED(inode)) {
+                       /*
+                        * Encrypted symlinks need to be decrypted before we can
+                        * cache their targets in i_link. Don't touch it here.
+                        */
+                       inode->i_op = &ceph_encrypted_symlink_iops;
+               } else {
+                       inode->i_link = ci->i_symlink;
+                       inode->i_op = &ceph_symlink_iops;
+               }
                break;
        case S_IFDIR:
                inode->i_op = &ceph_dir_iops;
@@ -1310,8 +1520,15 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
                if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME &&
                    test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
                    !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
+                       bool is_nokey = false;
                        struct qstr dname;
                        struct dentry *dn, *parent;
+                       struct fscrypt_str oname = FSTR_INIT(NULL, 0);
+                       struct ceph_fname fname = { .dir        = dir,
+                                                   .name       = rinfo->dname,
+                                                   .ctext      = rinfo->altname,
+                                                   .name_len   = rinfo->dname_len,
+                                                   .ctext_len  = rinfo->altname_len };
 
                        BUG_ON(!rinfo->head->is_target);
                        BUG_ON(req->r_dentry);
@@ -1319,8 +1536,20 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
                        parent = d_find_any_alias(dir);
                        BUG_ON(!parent);
 
-                       dname.name = rinfo->dname;
-                       dname.len = rinfo->dname_len;
+                       err = ceph_fname_alloc_buffer(dir, &oname);
+                       if (err < 0) {
+                               dput(parent);
+                               goto done;
+                       }
+
+                       err = ceph_fname_to_usr(&fname, NULL, &oname, &is_nokey);
+                       if (err < 0) {
+                               dput(parent);
+                               ceph_fname_free_buffer(dir, &oname);
+                               goto done;
+                       }
+                       dname.name = oname.name;
+                       dname.len = oname.len;
                        dname.hash = full_name_hash(parent, dname.name, dname.len);
                        tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
                        tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
@@ -1335,9 +1564,15 @@ retry_lookup:
                                     dname.len, dname.name, dn);
                                if (!dn) {
                                        dput(parent);
+                                       ceph_fname_free_buffer(dir, &oname);
                                        err = -ENOMEM;
                                        goto done;
                                }
+                               if (is_nokey) {
+                                       spin_lock(&dn->d_lock);
+                                       dn->d_flags |= DCACHE_NOKEY_NAME;
+                                       spin_unlock(&dn->d_lock);
+                               }
                                err = 0;
                        } else if (d_really_is_positive(dn) &&
                                   (ceph_ino(d_inode(dn)) != tvino.ino ||
@@ -1349,6 +1584,7 @@ retry_lookup:
                                dput(dn);
                                goto retry_lookup;
                        }
+                       ceph_fname_free_buffer(dir, &oname);
 
                        req->r_dentry = dn;
                        dput(parent);
@@ -1552,7 +1788,7 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
                vino.ino = le64_to_cpu(rde->inode.in->ino);
                vino.snap = le64_to_cpu(rde->inode.in->snapid);
 
-               in = ceph_get_inode(req->r_dentry->d_sb, vino);
+               in = ceph_get_inode(req->r_dentry->d_sb, vino, NULL);
                if (IS_ERR(in)) {
                        err = PTR_ERR(in);
                        dout("new_inode badness got %d\n", err);
@@ -1630,7 +1866,8 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
                             struct ceph_mds_session *session)
 {
        struct dentry *parent = req->r_dentry;
-       struct ceph_inode_info *ci = ceph_inode(d_inode(parent));
+       struct inode *inode = d_inode(parent);
+       struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
        struct qstr dname;
        struct dentry *dn;
@@ -1704,9 +1941,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
                tvino.snap = le64_to_cpu(rde->inode.in->snapid);
 
                if (rinfo->hash_order) {
-                       u32 hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
-                                                rde->name, rde->name_len);
-                       hash = ceph_frag_value(hash);
+                       u32 hash = ceph_frag_value(rde->raw_hash);
                        if (hash != last_hash)
                                fpos_offset = 2;
                        last_hash = hash;
@@ -1729,6 +1964,11 @@ retry_lookup:
                                err = -ENOMEM;
                                goto out;
                        }
+                       if (rde->is_nokey) {
+                               spin_lock(&dn->d_lock);
+                               dn->d_flags |= DCACHE_NOKEY_NAME;
+                               spin_unlock(&dn->d_lock);
+                       }
                } else if (d_really_is_positive(dn) &&
                           (ceph_ino(d_inode(dn)) != tvino.ino ||
                            ceph_snap(d_inode(dn)) != tvino.snap)) {
@@ -1754,7 +1994,7 @@ retry_lookup:
                if (d_really_is_positive(dn)) {
                        in = d_inode(dn);
                } else {
-                       in = ceph_get_inode(parent->d_sb, tvino);
+                       in = ceph_get_inode(parent->d_sb, tvino, NULL);
                        if (IS_ERR(in)) {
                                dout("new_inode badness\n");
                                d_drop(dn);
@@ -1927,7 +2167,7 @@ void __ceph_do_pending_vmtruncate(struct inode *inode)
 retry:
        spin_lock(&ci->i_ceph_lock);
        if (ci->i_truncate_pending == 0) {
-               dout("__do_pending_vmtruncate %p none pending\n", inode);
+               dout("%s %p none pending\n", __func__, inode);
                spin_unlock(&ci->i_ceph_lock);
                mutex_unlock(&ci->i_truncate_mutex);
                return;
@@ -1939,8 +2179,7 @@ retry:
         */
        if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) {
                spin_unlock(&ci->i_ceph_lock);
-               dout("__do_pending_vmtruncate %p flushing snaps first\n",
-                    inode);
+               dout("%s %p flushing snaps first\n", __func__, inode);
                filemap_write_and_wait_range(&inode->i_data, 0,
                                             inode->i_sb->s_maxbytes);
                goto retry;
@@ -1949,9 +2188,9 @@ retry:
        /* there should be no reader or writer */
        WARN_ON_ONCE(ci->i_rd_ref || ci->i_wr_ref);
 
-       to = ci->i_truncate_size;
+       to = ci->i_truncate_pagecache_size;
        wrbuffer_refs = ci->i_wrbuffer_ref;
-       dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode,
+       dout("%s %p (%d) to %lld\n", __func__, inode,
             ci->i_truncate_pending, to);
        spin_unlock(&ci->i_ceph_lock);
 
@@ -1959,7 +2198,7 @@ retry:
        truncate_pagecache(inode, to);
 
        spin_lock(&ci->i_ceph_lock);
-       if (to == ci->i_truncate_size) {
+       if (to == ci->i_truncate_pagecache_size) {
                ci->i_truncate_pending = 0;
                finish = 1;
        }
@@ -2000,6 +2239,32 @@ static void ceph_inode_work(struct work_struct *work)
        iput(inode);
 }
 
+static const char *ceph_encrypted_get_link(struct dentry *dentry,
+                                          struct inode *inode,
+                                          struct delayed_call *done)
+{
+       struct ceph_inode_info *ci = ceph_inode(inode);
+
+       if (!dentry)
+               return ERR_PTR(-ECHILD);
+
+       return fscrypt_get_symlink(inode, ci->i_symlink, i_size_read(inode),
+                                  done);
+}
+
+static int ceph_encrypted_symlink_getattr(struct mnt_idmap *idmap,
+                                         const struct path *path,
+                                         struct kstat *stat, u32 request_mask,
+                                         unsigned int query_flags)
+{
+       int ret;
+
+       ret = ceph_getattr(idmap, path, stat, request_mask, query_flags);
+       if (ret)
+               return ret;
+       return fscrypt_symlink_getattr(path, stat);
+}
+
 /*
  * symlinks
  */
@@ -2010,20 +2275,173 @@ static const struct inode_operations ceph_symlink_iops = {
        .listxattr = ceph_listxattr,
 };
 
-int __ceph_setattr(struct inode *inode, struct iattr *attr)
+static const struct inode_operations ceph_encrypted_symlink_iops = {
+       .get_link = ceph_encrypted_get_link,
+       .setattr = ceph_setattr,
+       .getattr = ceph_encrypted_symlink_getattr,
+       .listxattr = ceph_listxattr,
+};
+
+/*
+ * Transfer the encrypted last block to the MDS so that the MDS
+ * can update it when truncating to a smaller size.
+ *
+ * A PAGE_SIZE smaller than CEPH_FSCRYPT_BLOCK_SIZE is not
+ * supported.
+ */
+static int fill_fscrypt_truncate(struct inode *inode,
+                                struct ceph_mds_request *req,
+                                struct iattr *attr)
+{
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       int boff = attr->ia_size % CEPH_FSCRYPT_BLOCK_SIZE;
+       loff_t pos, orig_pos = round_down(attr->ia_size,
+                                         CEPH_FSCRYPT_BLOCK_SIZE);
+       u64 block = orig_pos >> CEPH_FSCRYPT_BLOCK_SHIFT;
+       struct ceph_pagelist *pagelist = NULL;
+       struct kvec iov = {0};
+       struct iov_iter iter;
+       struct page *page = NULL;
+       struct ceph_fscrypt_truncate_size_header header;
+       int retry_op = 0;
+       int len = CEPH_FSCRYPT_BLOCK_SIZE;
+       loff_t i_size = i_size_read(inode);
+       int got, ret, issued;
+       u64 objver;
+
+       ret = __ceph_get_caps(inode, NULL, CEPH_CAP_FILE_RD, 0, -1, &got);
+       if (ret < 0)
+               return ret;
+
+       issued = __ceph_caps_issued(ci, NULL);
+
+       dout("%s size %lld -> %lld got cap refs on %s, issued %s\n", __func__,
+            i_size, attr->ia_size, ceph_cap_string(got),
+            ceph_cap_string(issued));
+
+       /* Try to writeback the dirty pagecaches */
+       if (issued & (CEPH_CAP_FILE_BUFFER)) {
+               loff_t lend = orig_pos + CEPH_FSCRYPT_BLOCK_SIZE - 1;
+
+               ret = filemap_write_and_wait_range(inode->i_mapping,
+                                                  orig_pos, lend);
+               if (ret < 0)
+                       goto out;
+       }
+
+       page = __page_cache_alloc(GFP_KERNEL);
+       if (page == NULL) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       pagelist = ceph_pagelist_alloc(GFP_KERNEL);
+       if (!pagelist) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       iov.iov_base = kmap_local_page(page);
+       iov.iov_len = len;
+       iov_iter_kvec(&iter, READ, &iov, 1, len);
+
+       pos = orig_pos;
+       ret = __ceph_sync_read(inode, &pos, &iter, &retry_op, &objver);
+       if (ret < 0)
+               goto out;
+
+       /* Insert the header first */
+       header.ver = 1;
+       header.compat = 1;
+       header.change_attr = cpu_to_le64(inode_peek_iversion_raw(inode));
+
+       /*
+        * Always set the block_size to CEPH_FSCRYPT_BLOCK_SIZE,
+        * because in MDS it may need this to do the truncate.
+        */
+       header.block_size = cpu_to_le32(CEPH_FSCRYPT_BLOCK_SIZE);
+
+       /*
+        * If we hit a hole here, just skip filling the last-block
+        * payload for this request: once fscrypt is enabled the file is
+        * split into CEPH_FSCRYPT_BLOCK_SIZE-sized blocks, so any hole
+        * must be a multiple of the block size and there is nothing to
+        * read-modify-write.
+        *
+        * If the RADOS object doesn't exist, objver will be 0.
+        */
+       if (!objver) {
+               dout("%s hit hole, ppos %lld < size %lld\n", __func__,
+                    pos, i_size);
+
+               header.data_len = cpu_to_le32(8 + 8 + 4);
+               header.file_offset = 0;
+               ret = 0;
+       } else {
+               header.data_len = cpu_to_le32(8 + 8 + 4 + CEPH_FSCRYPT_BLOCK_SIZE);
+               header.file_offset = cpu_to_le64(orig_pos);
+
+               dout("%s encrypt block boff/bsize %d/%lu\n", __func__,
+                    boff, CEPH_FSCRYPT_BLOCK_SIZE);
+
+               /* truncate and zero out the extra contents for the last block */
+               memset(iov.iov_base + boff, 0, PAGE_SIZE - boff);
+
+               /* encrypt the last block */
+               ret = ceph_fscrypt_encrypt_block_inplace(inode, page,
+                                                   CEPH_FSCRYPT_BLOCK_SIZE,
+                                                   0, block,
+                                                   GFP_KERNEL);
+               if (ret)
+                       goto out;
+       }
+
+       /* Insert the header */
+       ret = ceph_pagelist_append(pagelist, &header, sizeof(header));
+       if (ret)
+               goto out;
+
+       if (header.block_size) {
+               /* Append the last block contents to pagelist */
+               ret = ceph_pagelist_append(pagelist, iov.iov_base,
+                                          CEPH_FSCRYPT_BLOCK_SIZE);
+               if (ret)
+                       goto out;
+       }
+       req->r_pagelist = pagelist;
+out:
+       dout("%s %p size dropping cap refs on %s\n", __func__,
+            inode, ceph_cap_string(got));
+       ceph_put_cap_refs(ci, got);
+       if (iov.iov_base)
+               kunmap_local(iov.iov_base);
+       if (page)
+               __free_pages(page, 0);
+       if (ret && pagelist)
+               ceph_pagelist_release(pagelist);
+       return ret;
+}
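For illustration, the payload length arithmetic used above, under the assumption that data_len covers the change_attr, file_offset and block_size fields plus the optional last block (a sketch, not a wire-format definition):

static u32 example_truncate_payload_len(bool have_last_block)
{
        /* 8 + 8 + 4: change_attr + file_offset + block_size */
        u32 len = sizeof(__le64) + sizeof(__le64) + sizeof(__le32);

        if (have_last_block)
                len += CEPH_FSCRYPT_BLOCK_SIZE; /* re-encrypted last block */
        return len;
}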
+
+int __ceph_setattr(struct inode *inode, struct iattr *attr,
+                  struct ceph_iattr *cia)
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
        unsigned int ia_valid = attr->ia_valid;
        struct ceph_mds_request *req;
        struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
        struct ceph_cap_flush *prealloc_cf;
+       loff_t isize = i_size_read(inode);
        int issued;
        int release = 0, dirtied = 0;
        int mask = 0;
        int err = 0;
        int inode_dirty_flags = 0;
        bool lock_snap_rwsem = false;
+       bool fill_fscrypt;
+       int truncate_retry = 20; /* The RMW will take around 50ms */
 
+retry:
        prealloc_cf = ceph_alloc_cap_flush();
        if (!prealloc_cf)
                return -ENOMEM;
@@ -2035,6 +2453,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
                return PTR_ERR(req);
        }
 
+       fill_fscrypt = false;
        spin_lock(&ci->i_ceph_lock);
        issued = __ceph_caps_issued(ci, NULL);
 
@@ -2050,6 +2469,43 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
        }
 
        dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+       if (cia && cia->fscrypt_auth) {
+               u32 len = ceph_fscrypt_auth_len(cia->fscrypt_auth);
+
+               if (len > sizeof(*cia->fscrypt_auth)) {
+                       err = -EINVAL;
+                       spin_unlock(&ci->i_ceph_lock);
+                       goto out;
+               }
+
+               dout("setattr %llx.%llx fscrypt_auth len %u to %u\n",
+                       ceph_vinop(inode), ci->fscrypt_auth_len, len);
+
+               /* It should never be re-set once set */
+               WARN_ON_ONCE(ci->fscrypt_auth);
+
+               if (issued & CEPH_CAP_AUTH_EXCL) {
+                       dirtied |= CEPH_CAP_AUTH_EXCL;
+                       kfree(ci->fscrypt_auth);
+                       ci->fscrypt_auth = (u8 *)cia->fscrypt_auth;
+                       ci->fscrypt_auth_len = len;
+               } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
+                          ci->fscrypt_auth_len != len ||
+                          memcmp(ci->fscrypt_auth, cia->fscrypt_auth, len)) {
+                       req->r_fscrypt_auth = cia->fscrypt_auth;
+                       mask |= CEPH_SETATTR_FSCRYPT_AUTH;
+                       release |= CEPH_CAP_AUTH_SHARED;
+               }
+               cia->fscrypt_auth = NULL;
+       }
+#else
+       if (cia && cia->fscrypt_auth) {
+               err = -EINVAL;
+               spin_unlock(&ci->i_ceph_lock);
+               goto out;
+       }
+#endif /* CONFIG_FS_ENCRYPTION */
 
        if (ia_valid & ATTR_UID) {
                dout("setattr %p uid %d -> %d\n", inode,
@@ -2119,10 +2575,27 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
                }
        }
        if (ia_valid & ATTR_SIZE) {
-               loff_t isize = i_size_read(inode);
-
                dout("setattr %p size %lld -> %lld\n", inode, isize, attr->ia_size);
-               if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size >= isize) {
+               /*
+                * The RMW is only needed when the new size is smaller
+                * and not aligned to CEPH_FSCRYPT_BLOCK_SIZE.
+                */
+               if (IS_ENCRYPTED(inode) && attr->ia_size < isize &&
+                   (attr->ia_size % CEPH_FSCRYPT_BLOCK_SIZE)) {
+                       mask |= CEPH_SETATTR_SIZE;
+                       release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
+                                  CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
+                       set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags);
+                       mask |= CEPH_SETATTR_FSCRYPT_FILE;
+                       req->r_args.setattr.size =
+                               cpu_to_le64(round_up(attr->ia_size,
+                                                    CEPH_FSCRYPT_BLOCK_SIZE));
+                       req->r_args.setattr.old_size =
+                               cpu_to_le64(round_up(isize,
+                                                    CEPH_FSCRYPT_BLOCK_SIZE));
+                       req->r_fscrypt_file = attr->ia_size;
+                       fill_fscrypt = true;
+               } else if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size >= isize) {
                        if (attr->ia_size > isize) {
                                i_size_write(inode, attr->ia_size);
                                inode->i_blocks = calc_inode_blocks(attr->ia_size);
@@ -2132,11 +2605,24 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
                        }
                } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
                           attr->ia_size != isize) {
-                       req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
-                       req->r_args.setattr.old_size = cpu_to_le64(isize);
                        mask |= CEPH_SETATTR_SIZE;
                        release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
                                   CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
+                       if (IS_ENCRYPTED(inode) && attr->ia_size) {
+                               set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags);
+                               mask |= CEPH_SETATTR_FSCRYPT_FILE;
+                               req->r_args.setattr.size =
+                                       cpu_to_le64(round_up(attr->ia_size,
+                                                            CEPH_FSCRYPT_BLOCK_SIZE));
+                               req->r_args.setattr.old_size =
+                                       cpu_to_le64(round_up(isize,
+                                                            CEPH_FSCRYPT_BLOCK_SIZE));
+                               req->r_fscrypt_file = attr->ia_size;
+                       } else {
+                               req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
+                               req->r_args.setattr.old_size = cpu_to_le64(isize);
+                               req->r_fscrypt_file = 0;
+                       }
                }
        }
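A worked example of the sizes sent to the MDS for an encrypted truncate, assuming a 4 KiB CEPH_FSCRYPT_BLOCK_SIZE (illustrative only, not part of the patch):

/*
 * Truncating an encrypted file from 10000 to 6000 bytes: 6000 < 10000 and
 * 6000 % 4096 != 0, so the RMW branch above is taken and the request ends
 * up with
 *
 *      setattr.size     = round_up(6000, 4096)  = 8192
 *      setattr.old_size = round_up(10000, 4096) = 12288
 *      r_fscrypt_file   = 6000   (the real logical size)
 *
 * while fill_fscrypt_truncate() attaches the re-encrypted last block.
 * A block-aligned new size (e.g. 8192) skips the RMW and only rounds the
 * sizes.
 */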
        if (ia_valid & ATTR_MTIME) {
@@ -2199,8 +2685,10 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
 
        release &= issued;
        spin_unlock(&ci->i_ceph_lock);
-       if (lock_snap_rwsem)
+       if (lock_snap_rwsem) {
                up_read(&mdsc->snap_rwsem);
+               lock_snap_rwsem = false;
+       }
 
        if (inode_dirty_flags)
                __mark_inode_dirty(inode, inode_dirty_flags);
@@ -2212,8 +2700,29 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
                req->r_args.setattr.mask = cpu_to_le32(mask);
                req->r_num_caps = 1;
                req->r_stamp = attr->ia_ctime;
+               if (fill_fscrypt) {
+                       err = fill_fscrypt_truncate(inode, req, attr);
+                       if (err)
+                               goto out;
+               }
+
+               /*
+                * The truncate request will return -EAGAIN when the
+                * last block has been updated just before the MDS
+                * successfully gets the xlock for the FILE lock. To
+                * avoid corrupting the file contents we need to retry
+                * it.
+                */
                err = ceph_mdsc_do_request(mdsc, NULL, req);
+               if (err == -EAGAIN && truncate_retry--) {
+                       dout("setattr %p result=%d (%s locally, %d remote), retry it!\n",
+                            inode, err, ceph_cap_string(dirtied), mask);
+                       ceph_mdsc_put_request(req);
+                       ceph_free_cap_flush(prealloc_cf);
+                       goto retry;
+               }
        }
+out:
        dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
             ceph_cap_string(dirtied), mask);
 
@@ -2242,6 +2751,10 @@ int ceph_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
        if (ceph_inode_is_shutdown(inode))
                return -ESTALE;
 
+       err = fscrypt_prepare_setattr(dentry, attr);
+       if (err)
+               return err;
+
        err = setattr_prepare(&nop_mnt_idmap, dentry, attr);
        if (err != 0)
                return err;
@@ -2254,7 +2767,7 @@ int ceph_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
            ceph_quota_is_max_bytes_exceeded(inode, attr->ia_size))
                return -EDQUOT;
 
-       err = __ceph_setattr(inode, attr);
+       err = __ceph_setattr(inode, attr, NULL);
 
        if (err >= 0 && (attr->ia_valid & ATTR_MODE))
                err = posix_acl_chmod(&nop_mnt_idmap, dentry, attr->ia_mode);
@@ -2525,8 +3038,12 @@ int ceph_getattr(struct mnt_idmap *idmap, const struct path *path,
                        stat->nlink = 1 + 1 + ci->i_subdirs;
        }
 
-       stat->attributes_mask |= STATX_ATTR_CHANGE_MONOTONIC;
        stat->attributes |= STATX_ATTR_CHANGE_MONOTONIC;
+       if (IS_ENCRYPTED(inode))
+               stat->attributes |= STATX_ATTR_ENCRYPTED;
+       stat->attributes_mask |= (STATX_ATTR_CHANGE_MONOTONIC |
+                                 STATX_ATTR_ENCRYPTED);
+
        stat->result_mask = request_mask & valid_mask;
        return err;
 }
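For illustration, how userspace can observe the new STATX_ATTR_ENCRYPTED reporting (a sketch; assumes a glibc recent enough to provide the statx() wrapper):

#define _GNU_SOURCE
#include <fcntl.h>              /* AT_FDCWD */
#include <stdio.h>
#include <sys/stat.h>           /* statx(), struct statx, STATX_* */

int main(int argc, char **argv)
{
        struct statx stx;

        if (argc < 2 || statx(AT_FDCWD, argv[1], 0, STATX_BASIC_STATS, &stx) != 0) {
                perror("statx");
                return 1;
        }
        printf("encrypted: %s (advertised in mask: %s)\n",
               (stx.stx_attributes & STATX_ATTR_ENCRYPTED) ? "yes" : "no",
               (stx.stx_attributes_mask & STATX_ATTR_ENCRYPTED) ? "yes" : "no");
        return 0;
}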