md/cluster: fix deadlock when node is doing resync job

author Zhao Heming <heming.zhao@suse.com>

Thu, 19 Nov 2020 11:41:34 +0000 (19:41 +0800)

committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Wed, 30 Dec 2020 10:54:25 +0000 (11:54 +0100)
author Zhao Heming <heming.zhao@suse.com>
Thu, 19 Nov 2020 11:41:34 +0000 (19:41 +0800)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Wed, 30 Dec 2020 10:54:25 +0000 (11:54 +0100)
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c

index 4aaf482..f0e64e7 100644 (file)
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -664,9 +664,27 @@ out:
   * Takes the lock on the TOKEN lock resource so no other
   * node can communicate while the operation is underway.
   */
-static int lock_token(struct md_cluster_info *cinfo, bool mddev_locked)
+static int lock_token(struct md_cluster_info *cinfo)
  {
-       int error, set_bit = 0;
+       int error;
+
+       error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);
+       if (error) {
+               pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n",
+                               __func__, __LINE__, error);
+       } else {
+               /* Lock the receive sequence */
+               mutex_lock(&cinfo->recv_mutex);
+       }
+       return error;
+}
+
+/* lock_comm()
+ * Sets the MD_CLUSTER_SEND_LOCK bit to lock the send channel.
+ */
+static int lock_comm(struct md_cluster_info *cinfo, bool mddev_locked)
+{
+       int rv, set_bit = 0;
         struct mddev *mddev = cinfo->mddev;
  
         /*
@@ -677,34 +695,19 @@ static int lock_token(struct md_cluster_info *cinfo, bool mddev_locked)
          */
         if (mddev_locked && !test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD,
                                       &cinfo->state)) {
-               error = test_and_set_bit_lock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD,
+               rv = test_and_set_bit_lock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD,
                                               &cinfo->state);
-               WARN_ON_ONCE(error);
+               WARN_ON_ONCE(rv);
                 md_wakeup_thread(mddev->thread);
                 set_bit = 1;
         }
-       error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);
-       if (set_bit)
-               clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
  
-       if (error)
-               pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n",
-                               __func__, __LINE__, error);
-
-       /* Lock the receive sequence */
-       mutex_lock(&cinfo->recv_mutex);
-       return error;
-}
-
-/* lock_comm()
- * Sets the MD_CLUSTER_SEND_LOCK bit to lock the send channel.
- */
-static int lock_comm(struct md_cluster_info *cinfo, bool mddev_locked)
-{
         wait_event(cinfo->wait,
                    !test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state));
-
-       return lock_token(cinfo, mddev_locked);
+       rv = lock_token(cinfo);
+       if (set_bit)
+               clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
+       return rv;
  }
  
  static void unlock_comm(struct md_cluster_info *cinfo)
@@ -784,9 +787,11 @@ static int sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg,
  {
         int ret;
  
-       lock_comm(cinfo, mddev_locked);
-       ret = __sendmsg(cinfo, cmsg);
-       unlock_comm(cinfo);
+       ret = lock_comm(cinfo, mddev_locked);
+       if (!ret) {
+               ret = __sendmsg(cinfo, cmsg);
+               unlock_comm(cinfo);
+       }
         return ret;
  }
  
@@ -1061,7 +1066,7 @@ static int metadata_update_start(struct mddev *mddev)
                 return 0;
         }
  
-       ret = lock_token(cinfo, 1);
+       ret = lock_token(cinfo);
         clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
         return ret;
  }
@@ -1255,7 +1260,10 @@ static void update_size(struct mddev *mddev, sector_t old_dev_sectors)
         int raid_slot = -1;
  
         md_update_sb(mddev, 1);
-       lock_comm(cinfo, 1);
+       if (lock_comm(cinfo, 1)) {
+               pr_err("%s: lock_comm failed\n", __func__);
+               return;
+       }
  
         memset(&cmsg, 0, sizeof(cmsg));
         cmsg.type = cpu_to_le32(METADATA_UPDATED);
@@ -1407,7 +1415,8 @@ static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev)
         cmsg.type = cpu_to_le32(NEWDISK);
         memcpy(cmsg.uuid, uuid, 16);
         cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
-       lock_comm(cinfo, 1);
+       if (lock_comm(cinfo, 1))
+               return -EAGAIN;
         ret = __sendmsg(cinfo, &cmsg);
         if (ret) {
                 unlock_comm(cinfo);
diff --git a/drivers/md/md.c b/drivers/md/md.c

index 55f9ddf..3be74cf 100644 (file)
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6948,8 +6948,10 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev)
                 goto busy;
  
  kick_rdev:
-       if (mddev_is_clustered(mddev))
-               md_cluster_ops->remove_disk(mddev, rdev);
+       if (mddev_is_clustered(mddev)) {
+               if (md_cluster_ops->remove_disk(mddev, rdev))
+                       goto busy;
+       }
  
         md_kick_rdev_from_array(rdev);
         set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
author	Zhao Heming <heming.zhao@suse.com>
	Thu, 19 Nov 2020 11:41:34 +0000 (19:41 +0800)
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Wed, 30 Dec 2020 10:54:25 +0000 (11:54 +0100)
drivers/md/md-cluster.c		patch | blob | history
drivers/md/md.c		patch | blob | history