ceph: check availability of mds cluster on mount after wait timeout
author Xiubo Li <xiubli@redhat.com>
Wed, 11 Dec 2019 01:29:40 +0000 (20:29 -0500)
committer Ilya Dryomov <idryomov@gmail.com>
Mon, 27 Jan 2020 15:53:39 +0000 (16:53 +0100)
If all the MDS daemons are down for some reason, then the first mount
attempt will fail with EIO after the mount request times out.  A mount
attempt will also fail with EIO if all of the MDS daemons are laggy.

This patch changes the code to return -EHOSTUNREACH in these situations
and adds a pr_info error message to help the admin determine the cause.

URL: https://tracker.ceph.com/issues/4386
Signed-off-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
fs/ceph/mds_client.c
fs/ceph/super.c

index 18fa8f8..e190266 100644 (file)
@@ -2583,8 +2583,7 @@ static void __do_request(struct ceph_mds_client *mdsc,
                if (!(mdsc->fsc->mount_options->flags &
                      CEPH_MOUNT_OPT_MOUNTWAIT) &&
                    !ceph_mdsmap_is_cluster_available(mdsc->mdsmap)) {
-                       err = -ENOENT;
-                       pr_info("probably no mds server is up\n");
+                       err = -EHOSTUNREACH;
                        goto finish;
                }
        }
index 29a795f..430dcf3 100644 (file)
@@ -1070,6 +1070,11 @@ static int ceph_get_tree(struct fs_context *fc)
        return 0;
 
 out_splat:
+       if (!ceph_mdsmap_is_cluster_available(fsc->mdsc->mdsmap)) {
+               pr_info("No mds server is up or the cluster is laggy\n");
+               err = -EHOSTUNREACH;
+       }
+
        ceph_mdsc_close_sessions(fsc->mdsc);
        deactivate_locked_super(sb);
        goto out_final;