nvme-fabrics: Allow ctrl loss timeout configuration
author Sagi Grimberg <sagi@grimberg.me>
Sat, 18 Mar 2017 18:52:36 +0000 (20:52 +0200)
committer Jens Axboe <axboe@fb.com>
Tue, 4 Apr 2017 15:48:23 +0000 (09:48 -0600)
When a host senses that its controller session is damaged,
it tries to re-establish it periodically (reconnect every
reconnect_delay). It may very well be that the controller
is gone and never coming back; in this case the host will
try to reconnect forever.

Add a ctrl_loss_tmo to bound the number of reconnect attempts
to a specific controller (default to a reasonable 10 minutes).
The timeout is not scheduled as a timer of its own; instead it
is translated into a number of reconnect attempts by dividing it
by reconnect_delay. This is useful to prevent
racing flows of remove and reconnect, and it doesn't really
matter if we remove slightly sooner than what the user requested.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
drivers/nvme/host/fabrics.c
drivers/nvme/host/fabrics.h

index 5b7386f..990e6fb 100644 (file)
@@ -471,6 +471,16 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid)
 }
 EXPORT_SYMBOL_GPL(nvmf_connect_io_queue);
 
+/**
+ * nvmf_should_reconnect() - Decide whether another reconnect attempt is allowed.
+ * @ctrl:      Controller whose fabrics options hold the reconnect counters.
+ *
+ * A max_reconnects of -1 means "reconnect forever". Otherwise a reconnect
+ * is allowed only while nr_reconnects is still below max_reconnects, so a
+ * max_reconnects of zero never allows one.
+ *
+ * Return: true if the caller may attempt another reconnect, false otherwise.
+ */
+bool nvmf_should_reconnect(struct nvme_ctrl *ctrl)
+{
+       /* -1 is the "no limit" sentinel: it must permit the retry, not refuse it */
+       if (ctrl->opts->max_reconnects == -1 ||
+           ctrl->opts->nr_reconnects < ctrl->opts->max_reconnects)
+               return true;
+
+       return false;
+}
+EXPORT_SYMBOL_GPL(nvmf_should_reconnect);
+
 /**
  * nvmf_register_transport() - NVMe Fabrics Library registration function.
  * @ops:       Transport ops instance to be registered to the
@@ -533,6 +543,7 @@ static const match_table_t opt_tokens = {
        { NVMF_OPT_QUEUE_SIZE,          "queue_size=%d"         },
        { NVMF_OPT_NR_IO_QUEUES,        "nr_io_queues=%d"       },
        { NVMF_OPT_RECONNECT_DELAY,     "reconnect_delay=%d"    },
+       { NVMF_OPT_CTRL_LOSS_TMO,       "ctrl_loss_tmo=%d"      },
        { NVMF_OPT_KATO,                "keep_alive_tmo=%d"     },
        { NVMF_OPT_HOSTNQN,             "hostnqn=%s"            },
        { NVMF_OPT_HOST_TRADDR,         "host_traddr=%s"        },
@@ -546,6 +557,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
        char *options, *o, *p;
        int token, ret = 0;
        size_t nqnlen  = 0;
+       int ctrl_loss_tmo = NVMF_DEF_CTRL_LOSS_TMO;
 
        /* Set defaults */
        opts->queue_size = NVMF_DEF_QUEUE_SIZE;
@@ -655,6 +667,16 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
                        }
                        opts->kato = token;
                        break;
+               case NVMF_OPT_CTRL_LOSS_TMO:
+                       if (match_int(args, &token)) {
+                               ret = -EINVAL;
+                               goto out;
+                       }
+
+                       if (token < 0)
+                               pr_warn("ctrl_loss_tmo < 0 will reconnect forever\n");
+                       ctrl_loss_tmo = token;
+                       break;
                case NVMF_OPT_HOSTNQN:
                        if (opts->host) {
                                pr_err("hostnqn already user-assigned: %s\n",
@@ -710,6 +732,12 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
                }
        }
 
+       if (ctrl_loss_tmo < 0)
+               opts->max_reconnects = -1;
+       else
+               opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo,
+                                               opts->reconnect_delay);
+
        if (!opts->host) {
                kref_get(&nvmf_default_host->ref);
                opts->host = nvmf_default_host;
index 1560181..f5a9c1f 100644 (file)
@@ -21,6 +21,8 @@
 #define NVMF_MAX_QUEUE_SIZE    1024
 #define NVMF_DEF_QUEUE_SIZE    128
 #define NVMF_DEF_RECONNECT_DELAY       10
+/* default to 600 seconds of reconnect attempts before giving up */
+#define NVMF_DEF_CTRL_LOSS_TMO         600
 
 /*
  * Define a host as seen by the target.  We allocate one at boot, but also
@@ -53,6 +55,7 @@ enum {
        NVMF_OPT_HOSTNQN        = 1 << 8,
        NVMF_OPT_RECONNECT_DELAY = 1 << 9,
        NVMF_OPT_HOST_TRADDR    = 1 << 10,
+       NVMF_OPT_CTRL_LOSS_TMO  = 1 << 11,
 };
 
 /**
@@ -77,6 +80,10 @@ enum {
  * @discovery_nqn: indicates if the subsysnqn is the well-known discovery NQN.
  * @kato:      Keep-alive timeout.
  * @host:      Virtual NVMe host, contains the NQN and Host ID.
+ * @nr_reconnects: number of reconnects attempted since the last ctrl failure
+ * @max_reconnects: maximum number of allowed reconnect attempts before removing
+ *              the controller, (-1) means reconnect forever, zero means remove
+ *              immediately;
  */
 struct nvmf_ctrl_options {
        unsigned                mask;
@@ -91,6 +98,8 @@ struct nvmf_ctrl_options {
        bool                    discovery_nqn;
        unsigned int            kato;
        struct nvmf_host        *host;
+       int                     nr_reconnects;
+       int                     max_reconnects;
 };
 
 /*
@@ -133,5 +142,6 @@ void nvmf_unregister_transport(struct nvmf_transport_ops *ops);
 void nvmf_free_options(struct nvmf_ctrl_options *opts);
 const char *nvmf_get_subsysnqn(struct nvme_ctrl *ctrl);
 int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size);
+bool nvmf_should_reconnect(struct nvme_ctrl *ctrl);
 
 #endif /* _NVME_FABRICS_H */