ceph: make num_fwd and num_retry to __u32

author Xiubo Li <xiubli@redhat.com>

Tue, 25 Jul 2023 09:51:59 +0000 (17:51 +0800)

committer Ilya Dryomov <idryomov@gmail.com>

Thu, 31 Aug 2023 12:56:27 +0000 (14:56 +0200)
author Xiubo Li <xiubli@redhat.com>
Tue, 25 Jul 2023 09:51:59 +0000 (17:51 +0800)
committer Ilya Dryomov <idryomov@gmail.com>
Thu, 31 Aug 2023 12:56:27 +0000 (14:56 +0200)
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c

index 04a8813..615db14 100644 (file)
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2840,6 +2840,18 @@ static void encode_mclientrequest_tail(void **p,
         }
  }
  
+static struct ceph_mds_request_head_legacy *
+find_legacy_request_head(void *p, u64 features)
+{
+       bool legacy = !(features & CEPH_FEATURE_FS_BTIME);
+       struct ceph_mds_request_head_old *ohead;
+
+       if (legacy)
+               return (struct ceph_mds_request_head_legacy *)p;
+       ohead = (struct ceph_mds_request_head_old *)p;
+       return (struct ceph_mds_request_head_legacy *)&ohead->oldest_client_tid;
+}
+
  /*
   * called under mdsc->mutex
   */
@@ -2850,7 +2862,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
         int mds = session->s_mds;
         struct ceph_mds_client *mdsc = session->s_mdsc;
         struct ceph_msg *msg;
-       struct ceph_mds_request_head_old *head;
+       struct ceph_mds_request_head_legacy *lhead;
         const char *path1 = NULL;
         const char *path2 = NULL;
         u64 ino1 = 0, ino2 = 0;
@@ -2862,6 +2874,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
         void *p, *end;
         int ret;
         bool legacy = !(session->s_con.peer_features & CEPH_FEATURE_FS_BTIME);
+       bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD,
+                                    &session->s_features);
  
         ret = set_request_path_attr(req->r_inode, req->r_dentry,
                               req->r_parent, req->r_path1, req->r_ino1.ino,
@@ -2893,7 +2907,19 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
                 goto out_free2;
         }
  
-       len = legacy ? sizeof(*head) : sizeof(struct ceph_mds_request_head);
+       /*
+        * Old cephs that do not support the 32bit retry/fwd feature
+        * copy the raw memory directly when decoding requests, while
+        * new cephs decode the head depending on the version member.
+        * So we need to make sure the request head is compatible with
+        * both of them.
+        */
+       if (legacy)
+               len = sizeof(struct ceph_mds_request_head_legacy);
+       else if (old_version)
+               len = sizeof(struct ceph_mds_request_head_old);
+       else
+               len = sizeof(struct ceph_mds_request_head);
  
         /* filepaths */
         len += 2 * (1 + sizeof(u32) + sizeof(u64));
@@ -2938,33 +2964,40 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
  
         msg->hdr.tid = cpu_to_le64(req->r_tid);
  
+       lhead = find_legacy_request_head(msg->front.iov_base,
+                                        session->s_con.peer_features);
+
         /*
-        * The old ceph_mds_request_head didn't contain a version field, and
+        * The ceph_mds_request_head_legacy didn't contain a version field, and
          * one was added when we moved the message version from 3->4.
          */
         if (legacy) {
                 msg->hdr.version = cpu_to_le16(3);
-               head = msg->front.iov_base;
-               p = msg->front.iov_base + sizeof(*head);
+               p = msg->front.iov_base + sizeof(*lhead);
+       } else if (old_version) {
+               struct ceph_mds_request_head_old *ohead = msg->front.iov_base;
+
+               msg->hdr.version = cpu_to_le16(4);
+               ohead->version = cpu_to_le16(1);
+               p = msg->front.iov_base + sizeof(*ohead);
         } else {
-               struct ceph_mds_request_head *new_head = msg->front.iov_base;
+               struct ceph_mds_request_head *nhead = msg->front.iov_base;
  
                 msg->hdr.version = cpu_to_le16(6);
-               new_head->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
-               head = (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid;
-               p = msg->front.iov_base + sizeof(*new_head);
+               nhead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
+               p = msg->front.iov_base + sizeof(*nhead);
         }
  
         end = msg->front.iov_base + msg->front.iov_len;
  
-       head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
-       head->op = cpu_to_le32(req->r_op);
-       head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns,
-                                                req->r_cred->fsuid));
-       head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns,
-                                                req->r_cred->fsgid));
-       head->ino = cpu_to_le64(req->r_deleg_ino);
-       head->args = req->r_args;
+       lhead->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
+       lhead->op = cpu_to_le32(req->r_op);
+       lhead->caller_uid = cpu_to_le32(from_kuid(&init_user_ns,
+                                                 req->r_cred->fsuid));
+       lhead->caller_gid = cpu_to_le32(from_kgid(&init_user_ns,
+                                                 req->r_cred->fsgid));
+       lhead->ino = cpu_to_le64(req->r_deleg_ino);
+       lhead->args = req->r_args;
  
         ceph_encode_filepath(&p, end, ino1, path1);
         ceph_encode_filepath(&p, end, ino2, path2);
@@ -3006,7 +3039,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
                 p = msg->front.iov_base + req->r_request_release_offset;
         }
  
-       head->num_releases = cpu_to_le16(releases);
+       lhead->num_releases = cpu_to_le16(releases);
  
         encode_mclientrequest_tail(&p, req);
  
@@ -3057,18 +3090,6 @@ static void complete_request(struct ceph_mds_client *mdsc,
         complete_all(&req->r_completion);
  }
  
-static struct ceph_mds_request_head_old *
-find_old_request_head(void *p, u64 features)
-{
-       bool legacy = !(features & CEPH_FEATURE_FS_BTIME);
-       struct ceph_mds_request_head *new_head;
-
-       if (legacy)
-               return (struct ceph_mds_request_head_old *)p;
-       new_head = (struct ceph_mds_request_head *)p;
-       return (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid;
-}
-
  /*
   * called under mdsc->mutex
   */
@@ -3078,29 +3099,28 @@ static int __prepare_send_request(struct ceph_mds_session *session,
  {
         int mds = session->s_mds;
         struct ceph_mds_client *mdsc = session->s_mdsc;
-       struct ceph_mds_request_head_old *rhead;
+       struct ceph_mds_request_head_legacy *lhead;
+       struct ceph_mds_request_head *nhead;
         struct ceph_msg *msg;
-       int flags = 0, max_retry;
+       int flags = 0, old_max_retry;
+       bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD,
+                                    &session->s_features);
  
         /*
-        * The type of 'r_attempts' in kernel 'ceph_mds_request'
-        * is 'int', while in 'ceph_mds_request_head' the type of
-        * 'num_retry' is '__u8'. So in case the request retries
-        *  exceeding 256 times, the MDS will receive a incorrect
-        *  retry seq.
-        *
-        * In this case it's ususally a bug in MDS and continue
-        * retrying the request makes no sense.
-        *
-        * In future this could be fixed in ceph code, so avoid
-        * using the hardcode here.
+        * Avoid infinite retrying after overflow. The client keeps
+        * increasing the retry count, and if the MDS is an old version
+        * we limit it to retry at most 256 times.
          */
-       max_retry = sizeof_field(struct ceph_mds_request_head, num_retry);
-       max_retry = 1 << (max_retry * BITS_PER_BYTE);
-       if (req->r_attempts >= max_retry) {
-               pr_warn_ratelimited("%s request tid %llu seq overflow\n",
-                                   __func__, req->r_tid);
-               return -EMULTIHOP;
+       if (req->r_attempts) {
+              old_max_retry = sizeof_field(struct ceph_mds_request_head_old,
+                                           num_retry);
+              old_max_retry = 1 << (old_max_retry * BITS_PER_BYTE);
+              if ((old_version && req->r_attempts >= old_max_retry) ||
+                  ((uint32_t)req->r_attempts >= U32_MAX)) {
+                       pr_warn_ratelimited("%s request tid %llu seq overflow\n",
+                                           __func__, req->r_tid);
+                       return -EMULTIHOP;
+              }
         }
  
         req->r_attempts++;
@@ -3126,20 +3146,24 @@ static int __prepare_send_request(struct ceph_mds_session *session,
                  * d_move mangles the src name.
                  */
                 msg = req->r_request;
-               rhead = find_old_request_head(msg->front.iov_base,
-                                             session->s_con.peer_features);
+               lhead = find_legacy_request_head(msg->front.iov_base,
+                                                session->s_con.peer_features);
  
-               flags = le32_to_cpu(rhead->flags);
+               flags = le32_to_cpu(lhead->flags);
                 flags |= CEPH_MDS_FLAG_REPLAY;
-               rhead->flags = cpu_to_le32(flags);
+               lhead->flags = cpu_to_le32(flags);
  
                 if (req->r_target_inode)
-                       rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
+                       lhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
  
-               rhead->num_retry = req->r_attempts - 1;
+               lhead->num_retry = req->r_attempts - 1;
+               if (!old_version) {
+                       nhead = (struct ceph_mds_request_head*)msg->front.iov_base;
+                       nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1);
+               }
  
                 /* remove cap/dentry releases from message */
-               rhead->num_releases = 0;
+               lhead->num_releases = 0;
  
                 p = msg->front.iov_base + req->r_request_release_offset;
                 encode_mclientrequest_tail(&p, req);
@@ -3160,18 +3184,23 @@ static int __prepare_send_request(struct ceph_mds_session *session,
         }
         req->r_request = msg;
  
-       rhead = find_old_request_head(msg->front.iov_base,
-                                     session->s_con.peer_features);
-       rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
+       lhead = find_legacy_request_head(msg->front.iov_base,
+                                        session->s_con.peer_features);
+       lhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
         if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
                 flags |= CEPH_MDS_FLAG_REPLAY;
         if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags))
                 flags |= CEPH_MDS_FLAG_ASYNC;
         if (req->r_parent)
                 flags |= CEPH_MDS_FLAG_WANT_DENTRY;
-       rhead->flags = cpu_to_le32(flags);
-       rhead->num_fwd = req->r_num_fwd;
-       rhead->num_retry = req->r_attempts - 1;
+       lhead->flags = cpu_to_le32(flags);
+       lhead->num_fwd = req->r_num_fwd;
+       lhead->num_retry = req->r_attempts - 1;
+       if (!old_version) {
+               nhead = (struct ceph_mds_request_head*)msg->front.iov_base;
+               nhead->ext_num_fwd = cpu_to_le32(req->r_num_fwd);
+               nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1);
+       }
  
         dout(" r_parent = %p\n", req->r_parent);
         return 0;
@@ -3830,33 +3859,21 @@ static void handle_forward(struct ceph_mds_client *mdsc,
         if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
                 dout("forward tid %llu aborted, unregistering\n", tid);
                 __unregister_request(mdsc, req);
-       } else if (fwd_seq <= req->r_num_fwd) {
+       } else if (fwd_seq <= req->r_num_fwd || (uint32_t)fwd_seq >= U32_MAX) {
                 /*
-                * The type of 'num_fwd' in ceph 'MClientRequestForward'
-                * is 'int32_t', while in 'ceph_mds_request_head' the
-                * type is '__u8'. So in case the request bounces between
-                * MDSes exceeding 256 times, the client will get stuck.
-                *
-                * In this case it's ususally a bug in MDS and continue
-                * bouncing the request makes no sense.
+                * Avoid infinite retrying after overflow.
                  *
-                * In future this could be fixed in ceph code, so avoid
-                * using the hardcode here.
+                * The MDS will increase the fwd count, and on the client
+                * side, if the num_fwd is less than the one saved in the
+                * request, that means the MDS is an old version and its
+                * 8-bit counter has overflowed.
                  */
-               int max = sizeof_field(struct ceph_mds_request_head, num_fwd);
-               max = 1 << (max * BITS_PER_BYTE);
-               if (req->r_num_fwd >= max) {
-                       mutex_lock(&req->r_fill_mutex);
-                       req->r_err = -EMULTIHOP;
-                       set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
-                       mutex_unlock(&req->r_fill_mutex);
-                       aborted = true;
-                       pr_warn_ratelimited("forward tid %llu seq overflow\n",
-                                           tid);
-               } else {
-                       dout("forward tid %llu to mds%d - old seq %d <= %d\n",
-                            tid, next_mds, req->r_num_fwd, fwd_seq);
-               }
+               mutex_lock(&req->r_fill_mutex);
+               req->r_err = -EMULTIHOP;
+               set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
+               mutex_unlock(&req->r_fill_mutex);
+               aborted = true;
+               pr_warn_ratelimited("forward tid %llu seq overflow\n", tid);
         } else {
                 /* resend. forward race not possible; mds would drop */
                 dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h

index 1fa0f78..5a3714b 100644 (file)
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -32,8 +32,9 @@ enum ceph_feature_type {
         CEPHFS_FEATURE_ALTERNATE_NAME,
         CEPHFS_FEATURE_NOTIFY_SESSION_STATE,
         CEPHFS_FEATURE_OP_GETVXATTR,
+       CEPHFS_FEATURE_32BITS_RETRY_FWD,
  
-       CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_OP_GETVXATTR,
+       CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_32BITS_RETRY_FWD,
  };
  
  #define CEPHFS_FEATURES_CLIENT_SUPPORTED {     \
@@ -47,6 +48,7 @@ enum ceph_feature_type {
         CEPHFS_FEATURE_ALTERNATE_NAME,          \
         CEPHFS_FEATURE_NOTIFY_SESSION_STATE,    \
         CEPHFS_FEATURE_OP_GETVXATTR,            \
+       CEPHFS_FEATURE_32BITS_RETRY_FWD,        \
  }
  
  /*
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h

index ae44812..5f2301e 100644 (file)
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -486,7 +486,7 @@ union ceph_mds_request_args_ext {
  #define CEPH_MDS_FLAG_WANT_DENTRY      2 /* want dentry in reply */
  #define CEPH_MDS_FLAG_ASYNC            4 /* request is asynchronous */
  
-struct ceph_mds_request_head_old {
+struct ceph_mds_request_head_legacy {
         __le64 oldest_client_tid;
         __le32 mdsmap_epoch;           /* on client */
         __le32 flags;                  /* CEPH_MDS_FLAG_* */
@@ -499,9 +499,9 @@ struct ceph_mds_request_head_old {
         union ceph_mds_request_args args;
  } __attribute__ ((packed));
  
-#define CEPH_MDS_REQUEST_HEAD_VERSION  1
+#define CEPH_MDS_REQUEST_HEAD_VERSION  2
  
-struct ceph_mds_request_head {
+struct ceph_mds_request_head_old {
         __le16 version;                /* struct version */
         __le64 oldest_client_tid;
         __le32 mdsmap_epoch;           /* on client */
@@ -515,6 +515,23 @@ struct ceph_mds_request_head {
         union ceph_mds_request_args_ext args;
  } __attribute__ ((packed));
  
+struct ceph_mds_request_head {
+       __le16 version;                /* struct version */
+       __le64 oldest_client_tid;
+       __le32 mdsmap_epoch;           /* on client */
+       __le32 flags;                  /* CEPH_MDS_FLAG_* */
+       __u8 num_retry, num_fwd;       /* legacy count retry and fwd attempts */
+       __le16 num_releases;           /* # include cap/lease release records */
+       __le32 op;                     /* mds op code */
+       __le32 caller_uid, caller_gid;
+       __le64 ino;                    /* use this ino for openc, mkdir, mknod,
+                                         etc. (if replaying) */
+       union ceph_mds_request_args_ext args;
+
+       __le32 ext_num_retry;          /* new count retry attempts */
+       __le32 ext_num_fwd;            /* new count fwd attempts */
+} __attribute__ ((packed));
+
  /* cap/lease release record */
  struct ceph_mds_request_release {
         __le64 ino, cap_id;            /* ino and unique cap id */
author	Xiubo Li <xiubli@redhat.com>
	Tue, 25 Jul 2023 09:51:59 +0000 (17:51 +0800)
committer	Ilya Dryomov <idryomov@gmail.com>
	Thu, 31 Aug 2023 12:56:27 +0000 (14:56 +0200)
fs/ceph/mds_client.c		patch \| blob \| history
fs/ceph/mds_client.h		patch \| blob \| history
include/linux/ceph/ceph_fs.h		patch \| blob \| history