fs: dlm: make new_lockspace() wait until recovery completes
author: Alexander Aring <aahringo@redhat.com>
Wed, 22 Jun 2022 18:45:15 +0000 (14:45 -0400)
committer: David Teigland <teigland@redhat.com>
Fri, 24 Jun 2022 16:57:47 +0000 (11:57 -0500)
Make dlm_new_lockspace() wait until a full recovery completes
successfully or fails. Previously, dlm_new_lockspace() returned
to the caller after dlm_recover_members() finished, which is
only partially through recovery.  The result of the previous
behavior is that the new lockspace would not be usable for some
time (especially with overlapping recoveries), and some errors
in the later part of recovery could not be returned to the caller.

Kernel callers gfs2 and cluster-md have their own wait handling to
wait for recovery to complete after calling dlm_new_lockspace().
This continues to work, but will be unnecessary.

Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
fs/dlm/dlm_internal.h
fs/dlm/lockspace.c
fs/dlm/member.c
fs/dlm/recoverd.c

index 776c3ed519f0231d8b62dab34fb41c5e886caad5..c03388a3875ca5d4ebce1047d8a0aacbf9211b5f 100644 (file)
@@ -606,8 +606,8 @@ struct dlm_ls {
 
        wait_queue_head_t       ls_uevent_wait; /* user part of join/leave */
        int                     ls_uevent_result;
-       struct completion       ls_members_done;
-       int                     ls_members_result;
+       struct completion       ls_recovery_done;
+       int                     ls_recovery_result;
 
        struct miscdevice       ls_device;
 
index 19ed41a5da93cb3687a0caef270095e9b380c71d..0c3613d09c5e3059328035733956f797ebecfe80 100644 (file)
@@ -548,8 +548,8 @@ static int new_lockspace(const char *name, const char *cluster,
 
        init_waitqueue_head(&ls->ls_uevent_wait);
        ls->ls_uevent_result = 0;
-       init_completion(&ls->ls_members_done);
-       ls->ls_members_result = -1;
+       init_completion(&ls->ls_recovery_done);
+       ls->ls_recovery_result = -1;
 
        mutex_init(&ls->ls_cb_mutex);
        INIT_LIST_HEAD(&ls->ls_cb_delay);
@@ -645,8 +645,9 @@ static int new_lockspace(const char *name, const char *cluster,
        if (error)
                goto out_recoverd;
 
-       wait_for_completion(&ls->ls_members_done);
-       error = ls->ls_members_result;
+       /* wait until recovery is successful or failed */
+       wait_for_completion(&ls->ls_recovery_done);
+       error = ls->ls_recovery_result;
        if (error)
                goto out_members;
 
index 67b056634f033a55d6ac3524dbf83803e17c0f0f..2af2ccfe43a9d33b9ba1961a6bbd321a7d068b94 100644 (file)
@@ -587,19 +587,6 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
        *neg_out = neg;
 
        error = ping_members(ls);
-       /* error -EINTR means that a new recovery action is triggered.
-        * We ignore this recovery action and let run the new one which might
-        * have new member configuration.
-        */
-       if (error == -EINTR)
-               error = 0;
-
-       /* new_lockspace() may be waiting to know if the config
-        * is good or bad
-        */
-       ls->ls_members_result = error;
-       complete(&ls->ls_members_done);
-
        log_rinfo(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes);
        return error;
 }
index 2d46e65d0101a5cb041b6fd64d7a80a06f47ff97..ff00b710486b189b24717d0856add46828312011 100644 (file)
@@ -243,6 +243,9 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
                  jiffies_to_msecs(jiffies - start));
        mutex_unlock(&ls->ls_recoverd_active);
 
+       ls->ls_recovery_result = 0;
+       complete(&ls->ls_recovery_done);
+
        dlm_lsop_recover_done(ls);
        return 0;
 
@@ -251,6 +254,16 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
        log_rinfo(ls, "dlm_recover %llu error %d",
                  (unsigned long long)rv->seq, error);
        mutex_unlock(&ls->ls_recoverd_active);
+
+       /* let new_lockspace() get aware of critical error if recovery
+        * was interrupted -EINTR we wait for the next ls_recover()
+        * iteration until it succeeds.
+        */
+       if (error != -EINTR) {
+               ls->ls_recovery_result = error;
+               complete(&ls->ls_recovery_done);
+       }
+
        return error;
 }