ceph: blocklist the kclient when receiving corrupted snap trace
author: Xiubo Li <xiubli@redhat.com>
Wed, 1 Feb 2023 01:36:45 +0000 (09:36 +0800)
committer: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Wed, 22 Feb 2023 11:59:45 +0000 (12:59 +0100)
[ Upstream commit a68e564adcaa69b0930809fb64d9d5f7d9c32ba9 ]

When we receive a corrupted snap trace we don't know what exactly has
happened on the MDS side, so we shouldn't continue I/O and metadata
access to the MDS, which may corrupt data or return incorrect contents.

This patch will just block all the further IO/MDS requests
immediately and then evict the kclient itself.

The reason why we still need to evict the kclient just after
blocking all the further IOs is that the MDS could revoke the caps
faster.

Link: https://tracker.ceph.com/issues/57686
Signed-off-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: Venky Shankar <vshankar@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
fs/ceph/addr.c
fs/ceph/caps.c
fs/ceph/file.c
fs/ceph/mds_client.c
fs/ceph/snap.c
fs/ceph/super.h

index 61f47debec5ac655c33ad519ecc8648c7c37133a..478c03bfba663a17eec5085302dfdb376480ffb6 100644 (file)
@@ -305,7 +305,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
        struct inode *inode = rreq->inode;
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
-       struct ceph_osd_request *req;
+       struct ceph_osd_request *req = NULL;
        struct ceph_vino vino = ceph_vino(inode);
        struct iov_iter iter;
        struct page **pages;
@@ -313,6 +313,11 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
        int err = 0;
        u64 len = subreq->len;
 
+       if (ceph_inode_is_shutdown(inode)) {
+               err = -EIO;
+               goto out;
+       }
+
        if (ceph_has_inline_data(ci) && ceph_netfs_issue_op_inline(subreq))
                return;
 
@@ -563,6 +568,9 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 
        dout("writepage %p idx %lu\n", page, page->index);
 
+       if (ceph_inode_is_shutdown(inode))
+               return -EIO;
+
        /* verify this is a writeable snap context */
        snapc = page_snap_context(page);
        if (!snapc) {
@@ -1643,7 +1651,7 @@ int ceph_uninline_data(struct file *file)
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
        struct ceph_osd_request *req = NULL;
-       struct ceph_cap_flush *prealloc_cf;
+       struct ceph_cap_flush *prealloc_cf = NULL;
        struct folio *folio = NULL;
        u64 inline_version = CEPH_INLINE_NONE;
        struct page *pages[1];
@@ -1657,6 +1665,11 @@ int ceph_uninline_data(struct file *file)
        dout("uninline_data %p %llx.%llx inline_version %llu\n",
             inode, ceph_vinop(inode), inline_version);
 
+       if (ceph_inode_is_shutdown(inode)) {
+               err = -EIO;
+               goto out;
+       }
+
        if (inline_version == CEPH_INLINE_NONE)
                return 0;
 
index cd69bf267d1b10edd3f43d52b921544dd9877e37..795fd6d84bde0178c89d555cd5cf68f0397f5991 100644 (file)
@@ -4081,6 +4081,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
        void *p, *end;
        struct cap_extra_info extra_info = {};
        bool queue_trunc;
+       bool close_sessions = false;
 
        dout("handle_caps from mds%d\n", session->s_mds);
 
@@ -4218,9 +4219,13 @@ void ceph_handle_caps(struct ceph_mds_session *session,
                realm = NULL;
                if (snaptrace_len) {
                        down_write(&mdsc->snap_rwsem);
-                       ceph_update_snap_trace(mdsc, snaptrace,
-                                              snaptrace + snaptrace_len,
-                                              false, &realm);
+                       if (ceph_update_snap_trace(mdsc, snaptrace,
+                                                  snaptrace + snaptrace_len,
+                                                  false, &realm)) {
+                               up_write(&mdsc->snap_rwsem);
+                               close_sessions = true;
+                               goto done;
+                       }
                        downgrade_write(&mdsc->snap_rwsem);
                } else {
                        down_read(&mdsc->snap_rwsem);
@@ -4280,6 +4285,11 @@ done_unlocked:
        iput(inode);
 out:
        ceph_put_string(extra_info.pool_ns);
+
+       /* Defer closing the sessions after s_mutex lock being released */
+       if (close_sessions)
+               ceph_mdsc_close_sessions(mdsc);
+
        return;
 
 flush_cap_releases:
index 6f9580defb2b390784f355a0608a67609f9e2298..5895797f3104a6cb6bf0d54521f7c1d46bdf4a1a 100644 (file)
@@ -2004,6 +2004,9 @@ static int ceph_zero_partial_object(struct inode *inode,
        loff_t zero = 0;
        int op;
 
+       if (ceph_inode_is_shutdown(inode))
+               return -EIO;
+
        if (!length) {
                op = offset ? CEPH_OSD_OP_DELETE : CEPH_OSD_OP_TRUNCATE;
                length = &zero;
index 756560df3bdbd82912e0c1237054d660adc9c36c..27a245d959c0ad691f5ab35511b2e012479c5283 100644 (file)
@@ -806,6 +806,9 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
 {
        struct ceph_mds_session *s;
 
+       if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO)
+               return ERR_PTR(-EIO);
+
        if (mds >= mdsc->mdsmap->possible_max_rank)
                return ERR_PTR(-EINVAL);
 
@@ -1478,6 +1481,9 @@ static int __open_session(struct ceph_mds_client *mdsc,
        int mstate;
        int mds = session->s_mds;
 
+       if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO)
+               return -EIO;
+
        /* wait for mds to go active? */
        mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
        dout("open_session to mds%d (%s)\n", mds,
@@ -2860,6 +2866,11 @@ static void __do_request(struct ceph_mds_client *mdsc,
                return;
        }
 
+       if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO) {
+               dout("do_request metadata corrupted\n");
+               err = -EIO;
+               goto finish;
+       }
        if (req->r_timeout &&
            time_after_eq(jiffies, req->r_started + req->r_timeout)) {
                dout("do_request timed out\n");
@@ -3245,6 +3256,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
        u64 tid;
        int err, result;
        int mds = session->s_mds;
+       bool close_sessions = false;
 
        if (msg->front.iov_len < sizeof(*head)) {
                pr_err("mdsc_handle_reply got corrupt (short) reply\n");
@@ -3351,10 +3363,17 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
        realm = NULL;
        if (rinfo->snapblob_len) {
                down_write(&mdsc->snap_rwsem);
-               ceph_update_snap_trace(mdsc, rinfo->snapblob,
+               err = ceph_update_snap_trace(mdsc, rinfo->snapblob,
                                rinfo->snapblob + rinfo->snapblob_len,
                                le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP,
                                &realm);
+               if (err) {
+                       up_write(&mdsc->snap_rwsem);
+                       close_sessions = true;
+                       if (err == -EIO)
+                               ceph_msg_dump(msg);
+                       goto out_err;
+               }
                downgrade_write(&mdsc->snap_rwsem);
        } else {
                down_read(&mdsc->snap_rwsem);
@@ -3412,6 +3431,10 @@ out_err:
                                     req->r_end_latency, err);
 out:
        ceph_mdsc_put_request(req);
+
+       /* Defer closing the sessions after s_mutex lock being released */
+       if (close_sessions)
+               ceph_mdsc_close_sessions(mdsc);
        return;
 }
 
@@ -5017,7 +5040,7 @@ static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped)
 }
 
 /*
- * called after sb is ro.
+ * called after sb is ro or when metadata corrupted.
  */
 void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
 {
@@ -5307,7 +5330,8 @@ static void mds_peer_reset(struct ceph_connection *con)
        struct ceph_mds_client *mdsc = s->s_mdsc;
 
        pr_warn("mds%d closed our session\n", s->s_mds);
-       send_mds_reconnect(mdsc, s);
+       if (READ_ONCE(mdsc->fsc->mount_state) != CEPH_MOUNT_FENCE_IO)
+               send_mds_reconnect(mdsc, s);
 }
 
 static void mds_dispatch(struct ceph_connection *con, struct ceph_msg *msg)
index e4151852184e04d092b6ff0b1b4d139d3b2f5dad..87007203f130e14abf51cdf54a5c3d388ba9ab97 100644 (file)
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/ceph/ceph_debug.h>
 
+#include <linux/fs.h>
 #include <linux/sort.h>
 #include <linux/slab.h>
 #include <linux/iversion.h>
@@ -766,8 +767,10 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
        struct ceph_snap_realm *realm;
        struct ceph_snap_realm *first_realm = NULL;
        struct ceph_snap_realm *realm_to_rebuild = NULL;
+       struct ceph_client *client = mdsc->fsc->client;
        int rebuild_snapcs;
        int err = -ENOMEM;
+       int ret;
        LIST_HEAD(dirty_realms);
 
        lockdep_assert_held_write(&mdsc->snap_rwsem);
@@ -884,6 +887,27 @@ fail:
        if (first_realm)
                ceph_put_snap_realm(mdsc, first_realm);
        pr_err("%s error %d\n", __func__, err);
+
+       /*
+        * When receiving a corrupted snap trace we don't know what
+        * exactly has happened in MDS side. And we shouldn't continue
+        * writing to OSD, which may corrupt the snapshot contents.
+        *
+        * Just try to blocklist this kclient and then this kclient
+        * must be remounted to continue after the corrupted metadata
+        * fixed in the MDS side.
+        */
+       WRITE_ONCE(mdsc->fsc->mount_state, CEPH_MOUNT_FENCE_IO);
+       ret = ceph_monc_blocklist_add(&client->monc, &client->msgr.inst.addr);
+       if (ret)
+               pr_err("%s failed to blocklist %s: %d\n", __func__,
+                      ceph_pr_addr(&client->msgr.inst.addr), ret);
+
+       WARN(1, "%s: %s%sdo remount to continue%s",
+            __func__, ret ? "" : ceph_pr_addr(&client->msgr.inst.addr),
+            ret ? "" : " was blocklisted, ",
+            err == -EIO ? " after corrupted snaptrace is fixed" : "");
+
        return err;
 }
 
@@ -984,6 +1008,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
        __le64 *split_inos = NULL, *split_realms = NULL;
        int i;
        int locked_rwsem = 0;
+       bool close_sessions = false;
 
        /* decode */
        if (msg->front.iov_len < sizeof(*h))
@@ -1092,8 +1117,12 @@ skip_inode:
         * update using the provided snap trace. if we are deleting a
         * snap, we can avoid queueing cap_snaps.
         */
-       ceph_update_snap_trace(mdsc, p, e,
-                              op == CEPH_SNAP_OP_DESTROY, NULL);
+       if (ceph_update_snap_trace(mdsc, p, e,
+                                  op == CEPH_SNAP_OP_DESTROY,
+                                  NULL)) {
+               close_sessions = true;
+               goto bad;
+       }
 
        if (op == CEPH_SNAP_OP_SPLIT)
                /* we took a reference when we created the realm, above */
@@ -1112,6 +1141,9 @@ bad:
 out:
        if (locked_rwsem)
                up_write(&mdsc->snap_rwsem);
+
+       if (close_sessions)
+               ceph_mdsc_close_sessions(mdsc);
        return;
 }
 
index 735279b2ceb55fef95128e00f0bfdd30f9e51323..3599fefa91f99b3c5622460676e5ef066f746b74 100644 (file)
@@ -108,6 +108,7 @@ enum {
        CEPH_MOUNT_UNMOUNTED,
        CEPH_MOUNT_SHUTDOWN,
        CEPH_MOUNT_RECOVER,
+       CEPH_MOUNT_FENCE_IO,
 };
 
 #define CEPH_ASYNC_CREATE_CONFLICT_BITS 8