drbd: fix race between drbdadm invalidate/verify and finishing resync
author Lars Ellenberg <lars.ellenberg@linbit.com>
Mon, 7 May 2012 10:00:56 +0000 (12:00 +0200)
committer Philipp Reisner <philipp.reisner@linbit.com>
Thu, 8 Nov 2012 15:58:27 +0000 (16:58 +0100)
When a resync or online verify is finished or aborted,
drbd does a bulk write-out of changed bitmap pages.

If *in that very moment* a new verify or resync is triggered,
this can race:
 ASSERT( !test_bit(BITMAP_IO, &mdev->flags) ) in drbd_main.c
 FIXME going to queue 'set_n_write from StartingSync' but 'write from resync_finished' still pending?
and similar.

This can be observed with e.g. tight invalidate loops in test scripts,
and probably has no real-life implication.

Still, that race can be solved by first quiescing the device,
before starting a new resync or verify.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
drivers/block/drbd/drbd_nl.c

index 3a8fa89..cbd45de 100644 (file)
@@ -2372,6 +2372,7 @@ int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info)
 
        /* If there is still bitmap IO pending, probably because of a previous
         * resync just being finished, wait for it before requesting a new resync. */
+       drbd_suspend_io(mdev);
        wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
 
        retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED);
@@ -2390,6 +2391,7 @@ int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info)
 
                retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T));
        }
+       drbd_resume_io(mdev);
 
 out:
        drbd_adm_finish(info, retcode);
@@ -2435,6 +2437,11 @@ int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info)
 
        mdev = adm_ctx.mdev;
 
+       /* If there is still bitmap IO pending, probably because of a previous
+        * resync just being finished, wait for it before requesting a new resync. */
+       drbd_suspend_io(mdev);
+       wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
+
        retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED);
        if (retcode < SS_SUCCESS) {
                if (retcode == SS_NEED_CONNECTION && mdev->state.role == R_PRIMARY) {
@@ -2450,6 +2457,7 @@ int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info)
                } else
                        retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S));
        }
+       drbd_resume_io(mdev);
 
 out:
        drbd_adm_finish(info, retcode);
@@ -2903,8 +2911,10 @@ int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info)
        }
        /* If there is still bitmap IO pending, e.g. previous resync or verify
         * just being finished, wait for it before requesting a new resync. */
+       drbd_suspend_io(mdev);
        wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
        retcode = drbd_request_state(mdev,NS(conn,C_VERIFY_S));
+       drbd_resume_io(mdev);
 out:
        drbd_adm_finish(info, retcode);
        return 0;