raid5-cache: fix a deadlock in superblock write

author Shaohua Li <shli@fb.com>

Thu, 25 Aug 2016 17:09:39 +0000 (10:09 -0700)

committer Shaohua Li <shli@fb.com>

Wed, 31 Aug 2016 16:05:18 +0000 (09:05 -0700)
author Shaohua Li <shli@fb.com>
Thu, 25 Aug 2016 17:09:39 +0000 (10:09 -0700)
committer Shaohua Li <shli@fb.com>
Wed, 31 Aug 2016 16:05:18 +0000 (09:05 -0700)
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c

index 51f76dd..1b1ab4a 100644 (file)
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -96,7 +96,6 @@ struct r5l_log {
         spinlock_t no_space_stripes_lock;
  
         bool need_cache_flush;
-       bool in_teardown;
  };
  
  /*
@@ -704,31 +703,22 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log,
  
         mddev = log->rdev->mddev;
         /*
-        * This is to avoid a deadlock. r5l_quiesce holds reconfig_mutex and
-        * wait for this thread to finish. This thread waits for
-        * MD_CHANGE_PENDING clear, which is supposed to be done in
-        * md_check_recovery(). md_check_recovery() tries to get
-        * reconfig_mutex. Since r5l_quiesce already holds the mutex,
-        * md_check_recovery() fails, so the PENDING never get cleared. The
-        * in_teardown check workaround this issue.
+        * Discard could zero data, so before discard we must make sure
+        * superblock is updated to new log tail. Updating superblock (either
+        * directly call md_update_sb() or depend on md thread) must hold
+        * reconfig mutex. On the other hand, raid5_quiesce is called with
+        * reconfig_mutex hold. The first step of raid5_quiesce() is waitting
+        * for all IO finish, hence waitting for reclaim thread, while reclaim
+        * thread is calling this function and waitting for reconfig mutex. So
+        * there is a deadlock. We workaround this issue with a trylock.
+        * FIXME: we could miss discard if we can't take reconfig mutex
          */
-       if (!log->in_teardown) {
-               set_mask_bits(&mddev->flags, 0,
-                             BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
-               md_wakeup_thread(mddev->thread);
-               wait_event(mddev->sb_wait,
-                       !test_bit(MD_CHANGE_PENDING, &mddev->flags) ||
-                       log->in_teardown);
-               /*
-                * r5l_quiesce could run after in_teardown check and hold
-                * mutex first. Superblock might get updated twice.
-                */
-               if (log->in_teardown)
-                       md_update_sb(mddev, 1);
-       } else {
-               WARN_ON(!mddev_is_locked(mddev));
-               md_update_sb(mddev, 1);
-       }
+       set_mask_bits(&mddev->flags, 0,
+               BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
+       if (!mddev_trylock(mddev))
+               return;
+       md_update_sb(mddev, 1);
+       mddev_unlock(mddev);
  
         /* discard IO error really doesn't matter, ignore it */
         if (log->last_checkpoint < end) {
@@ -827,7 +817,6 @@ void r5l_quiesce(struct r5l_log *log, int state)
         if (!log || state == 2)
                 return;
         if (state == 0) {
-               log->in_teardown = 0;
                 /*
                  * This is a special case for hotadd. In suspend, the array has
                  * no journal. In resume, journal is initialized as well as the
@@ -838,11 +827,6 @@ void r5l_quiesce(struct r5l_log *log, int state)
                 log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
                                         log->rdev->mddev, "reclaim");
         } else if (state == 1) {
-               /*
-                * at this point all stripes are finished, so io_unit is at
-                * least in STRIPE_END state
-                */
-               log->in_teardown = 1;
                 /* make sure r5l_write_super_and_discard_space exits */
                 mddev = log->rdev->mddev;
                 wake_up(&mddev->sb_wait);
author	Shaohua Li <shli@fb.com>
	Thu, 25 Aug 2016 17:09:39 +0000 (10:09 -0700)
committer	Shaohua Li <shli@fb.com>
	Wed, 31 Aug 2016 16:05:18 +0000 (09:05 -0700)