libceph: fix potential hang in ceph_osdc_notify()
author Ilya Dryomov <idryomov@gmail.com>
Tue, 1 Aug 2023 17:14:24 +0000 (19:14 +0200)
committer Ilya Dryomov <idryomov@gmail.com>
Wed, 2 Aug 2023 07:07:34 +0000 (09:07 +0200)
If the cluster becomes unavailable, ceph_osdc_notify() may hang even
with the osd_request_timeout option set because linger_notify_finish_wait()
waits for the MWatchNotify NOTIFY_COMPLETE message with no associated OSD
request in flight -- the wait is completely asynchronous.

Introduce an additional timeout, derived from the specified notify
timeout.  While at it, switch both waits to killable, which is more
correct.

Cc: stable@vger.kernel.org
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Dongsheng Yang <dongsheng.yang@easystack.cn>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 11c04e7..658a6f2 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -3334,17 +3334,24 @@ static int linger_reg_commit_wait(struct ceph_osd_linger_request *lreq)
        int ret;
 
        dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
-       ret = wait_for_completion_interruptible(&lreq->reg_commit_wait);
+       ret = wait_for_completion_killable(&lreq->reg_commit_wait);
        return ret ?: lreq->reg_commit_error;
 }
 
-static int linger_notify_finish_wait(struct ceph_osd_linger_request *lreq)
+static int linger_notify_finish_wait(struct ceph_osd_linger_request *lreq,
+                                    unsigned long timeout)
 {
-       int ret;
+       long left;
 
        dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
-       ret = wait_for_completion_interruptible(&lreq->notify_finish_wait);
-       return ret ?: lreq->notify_finish_error;
+       left = wait_for_completion_killable_timeout(&lreq->notify_finish_wait,
+                                               ceph_timeout_jiffies(timeout));
+       if (left <= 0)
+               left = left ?: -ETIMEDOUT;
+       else
+               left = lreq->notify_finish_error; /* completed */
+
+       return left;
 }
 
 /*
@@ -4896,7 +4903,8 @@ int ceph_osdc_notify(struct ceph_osd_client *osdc,
        linger_submit(lreq);
        ret = linger_reg_commit_wait(lreq);
        if (!ret)
-               ret = linger_notify_finish_wait(lreq);
+               ret = linger_notify_finish_wait(lreq,
+                                msecs_to_jiffies(2 * timeout * MSEC_PER_SEC));
        else
                dout("lreq %p failed to initiate notify %d\n", lreq, ret);