platform/kernel/linux-starfive.git: drivers/nvme/host/rdma.c (commit bc20a2442a04256dcdfeffd882bda9043b314bfc)
1 /*
2  * NVMe over Fabrics RDMA host code.
3  * Copyright (c) 2015-2016 HGST, a Western Digital Company.
4  *
5  * This program is free software; you can redistribute it and/or modify it
6  * under the terms and conditions of the GNU General Public License,
7  * version 2, as published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
12  * more details.
13  */
14 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
15 #include <linux/module.h>
16 #include <linux/init.h>
17 #include <linux/slab.h>
18 #include <linux/err.h>
19 #include <linux/string.h>
20 #include <linux/atomic.h>
21 #include <linux/blk-mq.h>
22 #include <linux/types.h>
23 #include <linux/list.h>
24 #include <linux/mutex.h>
25 #include <linux/scatterlist.h>
26 #include <linux/nvme.h>
27 #include <asm/unaligned.h>
28
29 #include <rdma/ib_verbs.h>
30 #include <rdma/rdma_cm.h>
31 #include <linux/nvme-rdma.h>
32
33 #include "nvme.h"
34 #include "fabrics.h"
35
36
37 #define NVME_RDMA_CONNECT_TIMEOUT_MS    1000            /* 1 second */
38
39 #define NVME_RDMA_MAX_SEGMENT_SIZE      0xffffff        /* 24-bit SGL field */
40
41 #define NVME_RDMA_MAX_SEGMENTS          256
42
43 #define NVME_RDMA_MAX_INLINE_SEGMENTS   1
44
45 /*
46  * We handle AEN commands ourselves and don't even let the
47  * block layer know about them.
48  */
49 #define NVME_RDMA_NR_AEN_COMMANDS      1
50 #define NVME_RDMA_AQ_BLKMQ_DEPTH       \
51         (NVMF_AQ_DEPTH - NVME_RDMA_NR_AEN_COMMANDS)
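/*
 * One slot of the admin queue is reserved for the AEN command, so the blk-mq
 * admin tagset is sized NVMF_AQ_DEPTH - 1 and the AEN uses command_id
 * NVME_RDMA_AQ_BLKMQ_DEPTH (see nvme_rdma_submit_async_event() and
 * __nvme_rdma_recv_done()), which can never collide with a blk-mq tag.
 */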
52
53 struct nvme_rdma_device {
54         struct ib_device       *dev;
55         struct ib_pd           *pd;
56         struct kref             ref;
57         struct list_head        entry;
58 };
59
60 struct nvme_rdma_qe {
61         struct ib_cqe           cqe;
62         void                    *data;
63         u64                     dma;
64 };
65
66 struct nvme_rdma_queue;
67 struct nvme_rdma_request {
68         struct nvme_request     req;
69         struct ib_mr            *mr;
70         struct nvme_rdma_qe     sqe;
71         struct ib_sge           sge[1 + NVME_RDMA_MAX_INLINE_SEGMENTS];
72         u32                     num_sge;
73         int                     nents;
74         bool                    inline_data;
75         struct ib_reg_wr        reg_wr;
76         struct ib_cqe           reg_cqe;
77         struct nvme_rdma_queue  *queue;
78         struct sg_table         sg_table;
79         struct scatterlist      first_sgl[];
80 };
81
82 enum nvme_rdma_queue_flags {
83         NVME_RDMA_Q_CONNECTED = (1 << 0),
84         NVME_RDMA_IB_QUEUE_ALLOCATED = (1 << 1),
85         NVME_RDMA_Q_DELETING = (1 << 2),
86         NVME_RDMA_Q_LIVE = (1 << 3),
87 };
88
89 struct nvme_rdma_queue {
90         struct nvme_rdma_qe     *rsp_ring;
91         u8                      sig_count;
92         int                     queue_size;
93         size_t                  cmnd_capsule_len;
94         struct nvme_rdma_ctrl   *ctrl;
95         struct nvme_rdma_device *device;
96         struct ib_cq            *ib_cq;
97         struct ib_qp            *qp;
98
99         unsigned long           flags;
100         struct rdma_cm_id       *cm_id;
101         int                     cm_error;
102         struct completion       cm_done;
103 };
104
105 struct nvme_rdma_ctrl {
106         /* read and written in the hot path */
107         spinlock_t              lock;
108
109         /* read only in the hot path */
110         struct nvme_rdma_queue  *queues;
111         u32                     queue_count;
112
113         /* other member variables */
114         struct blk_mq_tag_set   tag_set;
115         struct work_struct      delete_work;
116         struct work_struct      reset_work;
117         struct work_struct      err_work;
118
119         struct nvme_rdma_qe     async_event_sqe;
120
121         int                     reconnect_delay;
122         struct delayed_work     reconnect_work;
123
124         struct list_head        list;
125
126         struct blk_mq_tag_set   admin_tag_set;
127         struct nvme_rdma_device *device;
128
129         u64                     cap;
130         u32                     max_fr_pages;
131
132         union {
133                 struct sockaddr addr;
134                 struct sockaddr_in addr_in;
135         };
136         union {
137                 struct sockaddr src_addr;
138                 struct sockaddr_in src_addr_in;
139         };
140
141         struct nvme_ctrl        ctrl;
142 };
143
144 static inline struct nvme_rdma_ctrl *to_rdma_ctrl(struct nvme_ctrl *ctrl)
145 {
146         return container_of(ctrl, struct nvme_rdma_ctrl, ctrl);
147 }
148
149 static LIST_HEAD(device_list);
150 static DEFINE_MUTEX(device_list_mutex);
151
152 static LIST_HEAD(nvme_rdma_ctrl_list);
153 static DEFINE_MUTEX(nvme_rdma_ctrl_mutex);
154
155 static struct workqueue_struct *nvme_rdma_wq;
156
157 /*
158  * Disabling this option makes small I/O go faster, but is fundamentally
159  * unsafe.  With it turned off we will have to register a global rkey that
160  * allows read and write access to all physical memory.
161  */
162 static bool register_always = true;
163 module_param(register_always, bool, 0444);
164 MODULE_PARM_DESC(register_always,
165          "Use memory registration even for contiguous memory regions");
166
167 static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
168                 struct rdma_cm_event *event);
169 static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc);
170
171 /* XXX: really should move to a generic header sooner or later.. */
172 static inline void put_unaligned_le24(u32 val, u8 *p)
173 {
174         *p++ = val;
175         *p++ = val >> 8;
176         *p++ = val >> 16;
177 }
178
179 static inline int nvme_rdma_queue_idx(struct nvme_rdma_queue *queue)
180 {
181         return queue - queue->ctrl->queues;
182 }
183
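/*
 * In-capsule data space left after the SQE. For I/O queues the capsule length
 * is ioccsz * 16 (see nvme_rdma_init_queue()), so everything beyond the
 * 64-byte nvme_command is available for inline data.
 */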
184 static inline size_t nvme_rdma_inline_data_size(struct nvme_rdma_queue *queue)
185 {
186         return queue->cmnd_capsule_len - sizeof(struct nvme_command);
187 }
188
189 static void nvme_rdma_free_qe(struct ib_device *ibdev, struct nvme_rdma_qe *qe,
190                 size_t capsule_size, enum dma_data_direction dir)
191 {
192         ib_dma_unmap_single(ibdev, qe->dma, capsule_size, dir);
193         kfree(qe->data);
194 }
195
196 static int nvme_rdma_alloc_qe(struct ib_device *ibdev, struct nvme_rdma_qe *qe,
197                 size_t capsule_size, enum dma_data_direction dir)
198 {
199         qe->data = kzalloc(capsule_size, GFP_KERNEL);
200         if (!qe->data)
201                 return -ENOMEM;
202
203         qe->dma = ib_dma_map_single(ibdev, qe->data, capsule_size, dir);
204         if (ib_dma_mapping_error(ibdev, qe->dma)) {
205                 kfree(qe->data);
206                 return -ENOMEM;
207         }
208
209         return 0;
210 }
211
212 static void nvme_rdma_free_ring(struct ib_device *ibdev,
213                 struct nvme_rdma_qe *ring, size_t ib_queue_size,
214                 size_t capsule_size, enum dma_data_direction dir)
215 {
216         int i;
217
218         for (i = 0; i < ib_queue_size; i++)
219                 nvme_rdma_free_qe(ibdev, &ring[i], capsule_size, dir);
220         kfree(ring);
221 }
222
223 static struct nvme_rdma_qe *nvme_rdma_alloc_ring(struct ib_device *ibdev,
224                 size_t ib_queue_size, size_t capsule_size,
225                 enum dma_data_direction dir)
226 {
227         struct nvme_rdma_qe *ring;
228         int i;
229
230         ring = kcalloc(ib_queue_size, sizeof(struct nvme_rdma_qe), GFP_KERNEL);
231         if (!ring)
232                 return NULL;
233
234         for (i = 0; i < ib_queue_size; i++) {
235                 if (nvme_rdma_alloc_qe(ibdev, &ring[i], capsule_size, dir))
236                         goto out_free_ring;
237         }
238
239         return ring;
240
241 out_free_ring:
242         nvme_rdma_free_ring(ibdev, ring, i, capsule_size, dir);
243         return NULL;
244 }
245
246 static void nvme_rdma_qp_event(struct ib_event *event, void *context)
247 {
248         pr_debug("QP event %s (%d)\n",
249                  ib_event_msg(event->event), event->event);
250
251 }
252
253 static int nvme_rdma_wait_for_cm(struct nvme_rdma_queue *queue)
254 {
255         wait_for_completion_interruptible_timeout(&queue->cm_done,
256                         msecs_to_jiffies(NVME_RDMA_CONNECT_TIMEOUT_MS) + 1);
257         return queue->cm_error;
258 }
259
260 static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor)
261 {
262         struct nvme_rdma_device *dev = queue->device;
263         struct ib_qp_init_attr init_attr;
264         int ret;
265
266         memset(&init_attr, 0, sizeof(init_attr));
267         init_attr.event_handler = nvme_rdma_qp_event;
268         /* +1 for drain */
269         init_attr.cap.max_send_wr = factor * queue->queue_size + 1;
270         /* +1 for drain */
271         init_attr.cap.max_recv_wr = queue->queue_size + 1;
272         init_attr.cap.max_recv_sge = 1;
273         init_attr.cap.max_send_sge = 1 + NVME_RDMA_MAX_INLINE_SEGMENTS;
274         init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
275         init_attr.qp_type = IB_QPT_RC;
276         init_attr.send_cq = queue->ib_cq;
277         init_attr.recv_cq = queue->ib_cq;
278
279         ret = rdma_create_qp(queue->cm_id, dev->pd, &init_attr);
280
281         queue->qp = queue->cm_id->qp;
282         return ret;
283 }
284
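/*
 * Called when blk_mq_reinit_tagset() walks the tags during reconnect: an MR
 * that was left with a pending invalidation is replaced with a freshly
 * allocated one so the request can be reused safely.
 */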
285 static int nvme_rdma_reinit_request(void *data, struct request *rq)
286 {
287         struct nvme_rdma_ctrl *ctrl = data;
288         struct nvme_rdma_device *dev = ctrl->device;
289         struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
290         int ret = 0;
291
292         if (!req->mr->need_inval)
293                 goto out;
294
295         ib_dereg_mr(req->mr);
296
297         req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG,
298                         ctrl->max_fr_pages);
299         if (IS_ERR(req->mr)) {
300                 ret = PTR_ERR(req->mr);
301                 req->mr = NULL;
302                 goto out;
303         }
304
305         req->mr->need_inval = false;
306
307 out:
308         return ret;
309 }
310
311 static void __nvme_rdma_exit_request(struct nvme_rdma_ctrl *ctrl,
312                 struct request *rq, unsigned int queue_idx)
313 {
314         struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
315         struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx];
316         struct nvme_rdma_device *dev = queue->device;
317
318         if (req->mr)
319                 ib_dereg_mr(req->mr);
320
321         nvme_rdma_free_qe(dev->dev, &req->sqe, sizeof(struct nvme_command),
322                         DMA_TO_DEVICE);
323 }
324
325 static void nvme_rdma_exit_request(void *data, struct request *rq,
326                                 unsigned int hctx_idx, unsigned int rq_idx)
327 {
328         return __nvme_rdma_exit_request(data, rq, hctx_idx + 1);
329 }
330
331 static void nvme_rdma_exit_admin_request(void *data, struct request *rq,
332                                 unsigned int hctx_idx, unsigned int rq_idx)
333 {
334         return __nvme_rdma_exit_request(data, rq, 0);
335 }
336
337 static int __nvme_rdma_init_request(struct nvme_rdma_ctrl *ctrl,
338                 struct request *rq, unsigned int queue_idx)
339 {
340         struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
341         struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx];
342         struct nvme_rdma_device *dev = queue->device;
343         struct ib_device *ibdev = dev->dev;
344         int ret;
345
346         BUG_ON(queue_idx >= ctrl->queue_count);
347
348         ret = nvme_rdma_alloc_qe(ibdev, &req->sqe, sizeof(struct nvme_command),
349                         DMA_TO_DEVICE);
350         if (ret)
351                 return ret;
352
353         req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG,
354                         ctrl->max_fr_pages);
355         if (IS_ERR(req->mr)) {
356                 ret = PTR_ERR(req->mr);
357                 goto out_free_qe;
358         }
359
360         req->queue = queue;
361
362         return 0;
363
364 out_free_qe:
365         nvme_rdma_free_qe(dev->dev, &req->sqe, sizeof(struct nvme_command),
366                         DMA_TO_DEVICE);
367         return -ENOMEM;
368 }
369
370 static int nvme_rdma_init_request(void *data, struct request *rq,
371                                 unsigned int hctx_idx, unsigned int rq_idx,
372                                 unsigned int numa_node)
373 {
374         return __nvme_rdma_init_request(data, rq, hctx_idx + 1);
375 }
376
377 static int nvme_rdma_init_admin_request(void *data, struct request *rq,
378                                 unsigned int hctx_idx, unsigned int rq_idx,
379                                 unsigned int numa_node)
380 {
381         return __nvme_rdma_init_request(data, rq, 0);
382 }
383
384 static int nvme_rdma_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
385                 unsigned int hctx_idx)
386 {
387         struct nvme_rdma_ctrl *ctrl = data;
388         struct nvme_rdma_queue *queue = &ctrl->queues[hctx_idx + 1];
389
390         BUG_ON(hctx_idx >= ctrl->queue_count);
391
392         hctx->driver_data = queue;
393         return 0;
394 }
395
396 static int nvme_rdma_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
397                 unsigned int hctx_idx)
398 {
399         struct nvme_rdma_ctrl *ctrl = data;
400         struct nvme_rdma_queue *queue = &ctrl->queues[0];
401
402         BUG_ON(hctx_idx != 0);
403
404         hctx->driver_data = queue;
405         return 0;
406 }
407
408 static void nvme_rdma_free_dev(struct kref *ref)
409 {
410         struct nvme_rdma_device *ndev =
411                 container_of(ref, struct nvme_rdma_device, ref);
412
413         mutex_lock(&device_list_mutex);
414         list_del(&ndev->entry);
415         mutex_unlock(&device_list_mutex);
416
417         ib_dealloc_pd(ndev->pd);
418         kfree(ndev);
419 }
420
421 static void nvme_rdma_dev_put(struct nvme_rdma_device *dev)
422 {
423         kref_put(&dev->ref, nvme_rdma_free_dev);
424 }
425
426 static int nvme_rdma_dev_get(struct nvme_rdma_device *dev)
427 {
428         return kref_get_unless_zero(&dev->ref);
429 }
430
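/*
 * Look up (or create) the per-device state for the ib_device backing this
 * CM ID. Devices are cached in device_list keyed by node GUID; the PD is
 * allocated with IB_PD_UNSAFE_GLOBAL_RKEY only when register_always is off,
 * and devices without IB_DEVICE_MEM_MGT_EXTENSIONS are rejected.
 */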
431 static struct nvme_rdma_device *
432 nvme_rdma_find_get_device(struct rdma_cm_id *cm_id)
433 {
434         struct nvme_rdma_device *ndev;
435
436         mutex_lock(&device_list_mutex);
437         list_for_each_entry(ndev, &device_list, entry) {
438                 if (ndev->dev->node_guid == cm_id->device->node_guid &&
439                     nvme_rdma_dev_get(ndev))
440                         goto out_unlock;
441         }
442
443         ndev = kzalloc(sizeof(*ndev), GFP_KERNEL);
444         if (!ndev)
445                 goto out_err;
446
447         ndev->dev = cm_id->device;
448         kref_init(&ndev->ref);
449
450         ndev->pd = ib_alloc_pd(ndev->dev,
451                 register_always ? 0 : IB_PD_UNSAFE_GLOBAL_RKEY);
452         if (IS_ERR(ndev->pd))
453                 goto out_free_dev;
454
455         if (!(ndev->dev->attrs.device_cap_flags &
456               IB_DEVICE_MEM_MGT_EXTENSIONS)) {
457                 dev_err(&ndev->dev->dev,
458                         "Memory registrations not supported.\n");
459                 goto out_free_pd;
460         }
461
462         list_add(&ndev->entry, &device_list);
463 out_unlock:
464         mutex_unlock(&device_list_mutex);
465         return ndev;
466
467 out_free_pd:
468         ib_dealloc_pd(ndev->pd);
469 out_free_dev:
470         kfree(ndev);
471 out_err:
472         mutex_unlock(&device_list_mutex);
473         return NULL;
474 }
475
476 static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue)
477 {
478         struct nvme_rdma_device *dev;
479         struct ib_device *ibdev;
480
481         if (!test_and_clear_bit(NVME_RDMA_IB_QUEUE_ALLOCATED, &queue->flags))
482                 return;
483
484         dev = queue->device;
485         ibdev = dev->dev;
486         rdma_destroy_qp(queue->cm_id);
487         ib_free_cq(queue->ib_cq);
488
489         nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size,
490                         sizeof(struct nvme_completion), DMA_FROM_DEVICE);
491
492         nvme_rdma_dev_put(dev);
493 }
494
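/*
 * Allocate the IB resources for one queue: a CQ sized for the worst case of
 * MR registration, SEND and LOCAL_INV work requests per command plus the
 * RECVs (hence cq_factor = 4), the RC QP, and the response (completion) ring.
 * Completion vectors are spread across I/O queues by queue index.
 */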
495 static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue,
496                 struct nvme_rdma_device *dev)
497 {
498         struct ib_device *ibdev = dev->dev;
499         const int send_wr_factor = 3;                   /* MR, SEND, INV */
500         const int cq_factor = send_wr_factor + 1;       /* + RECV */
501         int comp_vector, idx = nvme_rdma_queue_idx(queue);
502
503         int ret;
504
505         queue->device = dev;
506
507         /*
508          * The admin queue is barely used once the controller is live, so don't
509          * bother to spread it out.
510          */
511         if (idx == 0)
512                 comp_vector = 0;
513         else
514                 comp_vector = idx % ibdev->num_comp_vectors;
515
516
517         /* +1 for ib_stop_cq */
518         queue->ib_cq = ib_alloc_cq(dev->dev, queue,
519                                 cq_factor * queue->queue_size + 1, comp_vector,
520                                 IB_POLL_SOFTIRQ);
521         if (IS_ERR(queue->ib_cq)) {
522                 ret = PTR_ERR(queue->ib_cq);
523                 goto out;
524         }
525
526         ret = nvme_rdma_create_qp(queue, send_wr_factor);
527         if (ret)
528                 goto out_destroy_ib_cq;
529
530         queue->rsp_ring = nvme_rdma_alloc_ring(ibdev, queue->queue_size,
531                         sizeof(struct nvme_completion), DMA_FROM_DEVICE);
532         if (!queue->rsp_ring) {
533                 ret = -ENOMEM;
534                 goto out_destroy_qp;
535         }
536         set_bit(NVME_RDMA_IB_QUEUE_ALLOCATED, &queue->flags);
537
538         return 0;
539
540 out_destroy_qp:
541         ib_destroy_qp(queue->qp);
542 out_destroy_ib_cq:
543         ib_free_cq(queue->ib_cq);
544 out:
545         return ret;
546 }
547
548 static int nvme_rdma_init_queue(struct nvme_rdma_ctrl *ctrl,
549                 int idx, size_t queue_size)
550 {
551         struct nvme_rdma_queue *queue;
552         struct sockaddr *src_addr = NULL;
553         int ret;
554
555         queue = &ctrl->queues[idx];
556         queue->ctrl = ctrl;
557         init_completion(&queue->cm_done);
558
559         if (idx > 0)
560                 queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16;
561         else
562                 queue->cmnd_capsule_len = sizeof(struct nvme_command);
563
564         queue->queue_size = queue_size;
565
566         queue->cm_id = rdma_create_id(&init_net, nvme_rdma_cm_handler, queue,
567                         RDMA_PS_TCP, IB_QPT_RC);
568         if (IS_ERR(queue->cm_id)) {
569                 dev_info(ctrl->ctrl.device,
570                         "failed to create CM ID: %ld\n", PTR_ERR(queue->cm_id));
571                 return PTR_ERR(queue->cm_id);
572         }
573
574         queue->cm_error = -ETIMEDOUT;
575         if (ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR)
576                 src_addr = &ctrl->src_addr;
577
578         ret = rdma_resolve_addr(queue->cm_id, src_addr, &ctrl->addr,
579                         NVME_RDMA_CONNECT_TIMEOUT_MS);
580         if (ret) {
581                 dev_info(ctrl->ctrl.device,
582                         "rdma_resolve_addr failed (%d).\n", ret);
583                 goto out_destroy_cm_id;
584         }
585
586         ret = nvme_rdma_wait_for_cm(queue);
587         if (ret) {
588                 dev_info(ctrl->ctrl.device,
589                         "rdma_resolve_addr wait failed (%d).\n", ret);
590                 goto out_destroy_cm_id;
591         }
592
593         clear_bit(NVME_RDMA_Q_DELETING, &queue->flags);
594         set_bit(NVME_RDMA_Q_CONNECTED, &queue->flags);
595
596         return 0;
597
598 out_destroy_cm_id:
599         nvme_rdma_destroy_queue_ib(queue);
600         rdma_destroy_id(queue->cm_id);
601         return ret;
602 }
603
604 static void nvme_rdma_stop_queue(struct nvme_rdma_queue *queue)
605 {
606         rdma_disconnect(queue->cm_id);
607         ib_drain_qp(queue->qp);
608 }
609
610 static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue)
611 {
612         nvme_rdma_destroy_queue_ib(queue);
613         rdma_destroy_id(queue->cm_id);
614 }
615
616 static void nvme_rdma_stop_and_free_queue(struct nvme_rdma_queue *queue)
617 {
618         if (test_and_set_bit(NVME_RDMA_Q_DELETING, &queue->flags))
619                 return;
620         nvme_rdma_stop_queue(queue);
621         nvme_rdma_free_queue(queue);
622 }
623
624 static void nvme_rdma_free_io_queues(struct nvme_rdma_ctrl *ctrl)
625 {
626         int i;
627
628         for (i = 1; i < ctrl->queue_count; i++)
629                 nvme_rdma_stop_and_free_queue(&ctrl->queues[i]);
630 }
631
632 static int nvme_rdma_connect_io_queues(struct nvme_rdma_ctrl *ctrl)
633 {
634         int i, ret = 0;
635
636         for (i = 1; i < ctrl->queue_count; i++) {
637                 ret = nvmf_connect_io_queue(&ctrl->ctrl, i);
638                 if (ret) {
639                         dev_info(ctrl->ctrl.device,
640                                 "failed to connect i/o queue: %d\n", ret);
641                         goto out_free_queues;
642                 }
643                 set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[i].flags);
644         }
645
646         return 0;
647
648 out_free_queues:
649         nvme_rdma_free_io_queues(ctrl);
650         return ret;
651 }
652
653 static int nvme_rdma_init_io_queues(struct nvme_rdma_ctrl *ctrl)
654 {
655         int i, ret;
656
657         for (i = 1; i < ctrl->queue_count; i++) {
658                 ret = nvme_rdma_init_queue(ctrl, i,
659                                            ctrl->ctrl.opts->queue_size);
660                 if (ret) {
661                         dev_info(ctrl->ctrl.device,
662                                 "failed to initialize i/o queue: %d\n", ret);
663                         goto out_free_queues;
664                 }
665         }
666
667         return 0;
668
669 out_free_queues:
670         for (i--; i >= 1; i--)
671                 nvme_rdma_stop_and_free_queue(&ctrl->queues[i]);
672
673         return ret;
674 }
675
676 static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl)
677 {
678         nvme_rdma_free_qe(ctrl->queues[0].device->dev, &ctrl->async_event_sqe,
679                         sizeof(struct nvme_command), DMA_TO_DEVICE);
680         nvme_rdma_stop_and_free_queue(&ctrl->queues[0]);
681         blk_cleanup_queue(ctrl->ctrl.admin_q);
682         blk_mq_free_tag_set(&ctrl->admin_tag_set);
683         nvme_rdma_dev_put(ctrl->device);
684 }
685
686 static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl)
687 {
688         struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
689
690         if (list_empty(&ctrl->list))
691                 goto free_ctrl;
692
693         mutex_lock(&nvme_rdma_ctrl_mutex);
694         list_del(&ctrl->list);
695         mutex_unlock(&nvme_rdma_ctrl_mutex);
696
697         kfree(ctrl->queues);
698         nvmf_free_options(nctrl->opts);
699 free_ctrl:
700         kfree(ctrl);
701 }
702
703 static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
704 {
705         struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work),
706                         struct nvme_rdma_ctrl, reconnect_work);
707         bool changed;
708         int ret;
709
710         if (ctrl->queue_count > 1) {
711                 nvme_rdma_free_io_queues(ctrl);
712
713                 ret = blk_mq_reinit_tagset(&ctrl->tag_set);
714                 if (ret)
715                         goto requeue;
716         }
717
718         nvme_rdma_stop_and_free_queue(&ctrl->queues[0]);
719
720         ret = blk_mq_reinit_tagset(&ctrl->admin_tag_set);
721         if (ret)
722                 goto requeue;
723
724         ret = nvme_rdma_init_queue(ctrl, 0, NVMF_AQ_DEPTH);
725         if (ret)
726                 goto requeue;
727
728         blk_mq_start_stopped_hw_queues(ctrl->ctrl.admin_q, true);
729
730         ret = nvmf_connect_admin_queue(&ctrl->ctrl);
731         if (ret)
732                 goto stop_admin_q;
733
734         set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[0].flags);
735
736         ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap);
737         if (ret)
738                 goto stop_admin_q;
739
740         nvme_start_keep_alive(&ctrl->ctrl);
741
742         if (ctrl->queue_count > 1) {
743                 ret = nvme_rdma_init_io_queues(ctrl);
744                 if (ret)
745                         goto stop_admin_q;
746
747                 ret = nvme_rdma_connect_io_queues(ctrl);
748                 if (ret)
749                         goto stop_admin_q;
750         }
751
752         changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
753         WARN_ON_ONCE(!changed);
754
755         if (ctrl->queue_count > 1) {
756                 nvme_start_queues(&ctrl->ctrl);
757                 nvme_queue_scan(&ctrl->ctrl);
758                 nvme_queue_async_events(&ctrl->ctrl);
759         }
760
761         dev_info(ctrl->ctrl.device, "Successfully reconnected\n");
762
763         return;
764
765 stop_admin_q:
766         blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
767 requeue:
768         /* Make sure we are not resetting/deleting */
769         if (ctrl->ctrl.state == NVME_CTRL_RECONNECTING) {
770                 dev_info(ctrl->ctrl.device,
771                         "Failed reconnect attempt, requeueing...\n");
772                 queue_delayed_work(nvme_rdma_wq, &ctrl->reconnect_work,
773                                         ctrl->reconnect_delay * HZ);
774         }
775 }
776
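/*
 * Error recovery: stop keep-alive, mark all queues as not connected/live,
 * quiesce the I/O and admin queues, cancel (fast-fail or requeue) every
 * inflight request via nvme_cancel_request(), and schedule a reconnect
 * attempt after reconnect_delay seconds.
 */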
777 static void nvme_rdma_error_recovery_work(struct work_struct *work)
778 {
779         struct nvme_rdma_ctrl *ctrl = container_of(work,
780                         struct nvme_rdma_ctrl, err_work);
781         int i;
782
783         nvme_stop_keep_alive(&ctrl->ctrl);
784
785         for (i = 0; i < ctrl->queue_count; i++) {
786                 clear_bit(NVME_RDMA_Q_CONNECTED, &ctrl->queues[i].flags);
787                 clear_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[i].flags);
788         }
789
790         if (ctrl->queue_count > 1)
791                 nvme_stop_queues(&ctrl->ctrl);
792         blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
793
794         /* We must fast-fail or requeue all of our inflight requests */
795         if (ctrl->queue_count > 1)
796                 blk_mq_tagset_busy_iter(&ctrl->tag_set,
797                                         nvme_cancel_request, &ctrl->ctrl);
798         blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
799                                 nvme_cancel_request, &ctrl->ctrl);
800
801         dev_info(ctrl->ctrl.device, "reconnecting in %d seconds\n",
802                 ctrl->reconnect_delay);
803
804         queue_delayed_work(nvme_rdma_wq, &ctrl->reconnect_work,
805                                 ctrl->reconnect_delay * HZ);
806 }
807
808 static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl)
809 {
810         if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING))
811                 return;
812
813         queue_work(nvme_rdma_wq, &ctrl->err_work);
814 }
815
816 static void nvme_rdma_wr_error(struct ib_cq *cq, struct ib_wc *wc,
817                 const char *op)
818 {
819         struct nvme_rdma_queue *queue = cq->cq_context;
820         struct nvme_rdma_ctrl *ctrl = queue->ctrl;
821
822         if (ctrl->ctrl.state == NVME_CTRL_LIVE)
823                 dev_info(ctrl->ctrl.device,
824                              "%s for CQE 0x%p failed with status %s (%d)\n",
825                              op, wc->wr_cqe,
826                              ib_wc_status_msg(wc->status), wc->status);
827         nvme_rdma_error_recovery(ctrl);
828 }
829
830 static void nvme_rdma_memreg_done(struct ib_cq *cq, struct ib_wc *wc)
831 {
832         if (unlikely(wc->status != IB_WC_SUCCESS))
833                 nvme_rdma_wr_error(cq, wc, "MEMREG");
834 }
835
836 static void nvme_rdma_inv_rkey_done(struct ib_cq *cq, struct ib_wc *wc)
837 {
838         if (unlikely(wc->status != IB_WC_SUCCESS))
839                 nvme_rdma_wr_error(cq, wc, "LOCAL_INV");
840 }
841
842 static int nvme_rdma_inv_rkey(struct nvme_rdma_queue *queue,
843                 struct nvme_rdma_request *req)
844 {
845         struct ib_send_wr *bad_wr;
846         struct ib_send_wr wr = {
847                 .opcode             = IB_WR_LOCAL_INV,
848                 .next               = NULL,
849                 .num_sge            = 0,
850                 .send_flags         = 0,
851                 .ex.invalidate_rkey = req->mr->rkey,
852         };
853
854         req->reg_cqe.done = nvme_rdma_inv_rkey_done;
855         wr.wr_cqe = &req->reg_cqe;
856
857         return ib_post_send(queue->qp, &wr, &bad_wr);
858 }
859
860 static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue,
861                 struct request *rq)
862 {
863         struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
864         struct nvme_rdma_ctrl *ctrl = queue->ctrl;
865         struct nvme_rdma_device *dev = queue->device;
866         struct ib_device *ibdev = dev->dev;
867         int res;
868
869         if (!blk_rq_bytes(rq))
870                 return;
871
872         if (req->mr->need_inval) {
873                 res = nvme_rdma_inv_rkey(queue, req);
874                 if (res < 0) {
875                         dev_err(ctrl->ctrl.device,
876                                 "Queueing INV WR for rkey %#x failed (%d)\n",
877                                 req->mr->rkey, res);
878                         nvme_rdma_error_recovery(queue->ctrl);
879                 }
880         }
881
882         ib_dma_unmap_sg(ibdev, req->sg_table.sgl,
883                         req->nents, rq_data_dir(rq) ==
884                                     WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
885
886         nvme_cleanup_cmd(rq);
887         sg_free_table_chained(&req->sg_table, true);
888 }
889
890 static int nvme_rdma_set_sg_null(struct nvme_command *c)
891 {
892         struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
893
894         sg->addr = 0;
895         put_unaligned_le24(0, sg->length);
896         put_unaligned_le32(0, sg->key);
897         sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4;
898         return 0;
899 }
900
901 static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue,
902                 struct nvme_rdma_request *req, struct nvme_command *c)
903 {
904         struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
905
906         req->sge[1].addr = sg_dma_address(req->sg_table.sgl);
907         req->sge[1].length = sg_dma_len(req->sg_table.sgl);
908         req->sge[1].lkey = queue->device->pd->local_dma_lkey;
909
910         sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
911         sg->length = cpu_to_le32(sg_dma_len(req->sg_table.sgl));
912         sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
913
914         req->inline_data = true;
915         req->num_sge++;
916         return 0;
917 }
918
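/*
 * Single-segment mapping without memory registration: only taken when
 * register_always is disabled, i.e. when the PD carries the unsafe global
 * rkey that exposes all of host memory to the target.
 */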
919 static int nvme_rdma_map_sg_single(struct nvme_rdma_queue *queue,
920                 struct nvme_rdma_request *req, struct nvme_command *c)
921 {
922         struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
923
924         sg->addr = cpu_to_le64(sg_dma_address(req->sg_table.sgl));
925         put_unaligned_le24(sg_dma_len(req->sg_table.sgl), sg->length);
926         put_unaligned_le32(queue->device->pd->unsafe_global_rkey, sg->key);
927         sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4;
928         return 0;
929 }
930
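/*
 * Fast-registration path: map the scatterlist into the MR, bump the rkey,
 * and build an IB_WR_REG_MR work request that nvme_rdma_queue_rq() chains in
 * front of the SEND. The keyed SGL advertises the MR with the INVALIDATE bit
 * set so the target may perform remote invalidation.
 */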
931 static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue,
932                 struct nvme_rdma_request *req, struct nvme_command *c,
933                 int count)
934 {
935         struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
936         int nr;
937
938         nr = ib_map_mr_sg(req->mr, req->sg_table.sgl, count, NULL, PAGE_SIZE);
939         if (nr < count) {
940                 if (nr < 0)
941                         return nr;
942                 return -EINVAL;
943         }
944
945         ib_update_fast_reg_key(req->mr, ib_inc_rkey(req->mr->rkey));
946
947         req->reg_cqe.done = nvme_rdma_memreg_done;
948         memset(&req->reg_wr, 0, sizeof(req->reg_wr));
949         req->reg_wr.wr.opcode = IB_WR_REG_MR;
950         req->reg_wr.wr.wr_cqe = &req->reg_cqe;
951         req->reg_wr.wr.num_sge = 0;
952         req->reg_wr.mr = req->mr;
953         req->reg_wr.key = req->mr->rkey;
954         req->reg_wr.access = IB_ACCESS_LOCAL_WRITE |
955                              IB_ACCESS_REMOTE_READ |
956                              IB_ACCESS_REMOTE_WRITE;
957
958         req->mr->need_inval = true;
959
960         sg->addr = cpu_to_le64(req->mr->iova);
961         put_unaligned_le24(req->mr->length, sg->length);
962         put_unaligned_le32(req->mr->rkey, sg->key);
963         sg->type = (NVME_KEY_SGL_FMT_DATA_DESC << 4) |
964                         NVME_SGL_FMT_INVALIDATE;
965
966         return 0;
967 }
968
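/*
 * Set up the data transfer for a request: no SGL for zero-length commands,
 * in-capsule (inline) data for small single-segment writes on I/O queues,
 * a plain keyed SGL with the global rkey when registration is disabled,
 * and fast registration (nvme_rdma_map_sg_fr) for everything else.
 */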
969 static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
970                 struct request *rq, struct nvme_command *c)
971 {
972         struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
973         struct nvme_rdma_device *dev = queue->device;
974         struct ib_device *ibdev = dev->dev;
975         int count, ret;
976
977         req->num_sge = 1;
978         req->inline_data = false;
979         req->mr->need_inval = false;
980
981         c->common.flags |= NVME_CMD_SGL_METABUF;
982
983         if (!blk_rq_bytes(rq))
984                 return nvme_rdma_set_sg_null(c);
985
986         req->sg_table.sgl = req->first_sgl;
987         ret = sg_alloc_table_chained(&req->sg_table,
988                         blk_rq_nr_phys_segments(rq), req->sg_table.sgl);
989         if (ret)
990                 return -ENOMEM;
991
992         req->nents = blk_rq_map_sg(rq->q, rq, req->sg_table.sgl);
993
994         count = ib_dma_map_sg(ibdev, req->sg_table.sgl, req->nents,
995                     rq_data_dir(rq) == WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
996         if (unlikely(count <= 0)) {
997                 sg_free_table_chained(&req->sg_table, true);
998                 return -EIO;
999         }
1000
1001         if (count == 1) {
1002                 if (rq_data_dir(rq) == WRITE && nvme_rdma_queue_idx(queue) &&
1003                     blk_rq_payload_bytes(rq) <=
1004                                 nvme_rdma_inline_data_size(queue))
1005                         return nvme_rdma_map_sg_inline(queue, req, c);
1006
1007                 if (dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY)
1008                         return nvme_rdma_map_sg_single(queue, req, c);
1009         }
1010
1011         return nvme_rdma_map_sg_fr(queue, req, c, count);
1012 }
1013
1014 static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
1015 {
1016         if (unlikely(wc->status != IB_WC_SUCCESS))
1017                 nvme_rdma_wr_error(cq, wc, "SEND");
1018 }
1019
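/*
 * Post a SEND for one SQE. When @first is non-NULL (in practice the MR
 * registration WR) it is chained in front of the SEND so both are posted
 * with a single ib_post_send() call.
 */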
1020 static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
1021                 struct nvme_rdma_qe *qe, struct ib_sge *sge, u32 num_sge,
1022                 struct ib_send_wr *first, bool flush)
1023 {
1024         struct ib_send_wr wr, *bad_wr;
1025         int ret;
1026
1027         sge->addr   = qe->dma;
1028         sge->length = sizeof(struct nvme_command),
1029         sge->lkey   = queue->device->pd->local_dma_lkey;
1030
1031         qe->cqe.done = nvme_rdma_send_done;
1032
1033         wr.next       = NULL;
1034         wr.wr_cqe     = &qe->cqe;
1035         wr.sg_list    = sge;
1036         wr.num_sge    = num_sge;
1037         wr.opcode     = IB_WR_SEND;
1038         wr.send_flags = 0;
1039
1040         /*
1041          * Unsignalled send completions are another giant disaster in the
1042          * IB Verbs spec:  If we don't regularly post signalled sends
1043          * the send queue will fill up and only a QP reset will rescue us.
1044          * Would have been way too obvious to handle this in hardware or
1045          * at least the RDMA stack..
1046          *
1047          * This messy and racy code snippet is copied and pasted from the
1048          * iSER initiator, and the magic '32' comes from there as well.
1049          *
1050          * Always signal the flushes. The magic request used for the flush
1051          * sequencer is not allocated in our driver's tagset and it's
1052          * triggered to be freed by blk_cleanup_queue(). So we need to
1053          * always mark it as signaled to ensure that the "wr_cqe", which is
1054          * embedded in the request's payload, is not freed when __ib_process_cq()
1055          * calls wr_cqe->done().
1056          */
1057         if ((++queue->sig_count % 32) == 0 || flush)
1058                 wr.send_flags |= IB_SEND_SIGNALED;
1059
1060         if (first)
1061                 first->next = &wr;
1062         else
1063                 first = &wr;
1064
1065         ret = ib_post_send(queue->qp, first, &bad_wr);
1066         if (ret) {
1067                 dev_err(queue->ctrl->ctrl.device,
1068                              "%s failed with error code %d\n", __func__, ret);
1069         }
1070         return ret;
1071 }
1072
1073 static int nvme_rdma_post_recv(struct nvme_rdma_queue *queue,
1074                 struct nvme_rdma_qe *qe)
1075 {
1076         struct ib_recv_wr wr, *bad_wr;
1077         struct ib_sge list;
1078         int ret;
1079
1080         list.addr   = qe->dma;
1081         list.length = sizeof(struct nvme_completion);
1082         list.lkey   = queue->device->pd->local_dma_lkey;
1083
1084         qe->cqe.done = nvme_rdma_recv_done;
1085
1086         wr.next     = NULL;
1087         wr.wr_cqe   = &qe->cqe;
1088         wr.sg_list  = &list;
1089         wr.num_sge  = 1;
1090
1091         ret = ib_post_recv(queue->qp, &wr, &bad_wr);
1092         if (ret) {
1093                 dev_err(queue->ctrl->ctrl.device,
1094                         "%s failed with error code %d\n", __func__, ret);
1095         }
1096         return ret;
1097 }
1098
1099 static struct blk_mq_tags *nvme_rdma_tagset(struct nvme_rdma_queue *queue)
1100 {
1101         u32 queue_idx = nvme_rdma_queue_idx(queue);
1102
1103         if (queue_idx == 0)
1104                 return queue->ctrl->admin_tag_set.tags[queue_idx];
1105         return queue->ctrl->tag_set.tags[queue_idx - 1];
1106 }
1107
1108 static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg, int aer_idx)
1109 {
1110         struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(arg);
1111         struct nvme_rdma_queue *queue = &ctrl->queues[0];
1112         struct ib_device *dev = queue->device->dev;
1113         struct nvme_rdma_qe *sqe = &ctrl->async_event_sqe;
1114         struct nvme_command *cmd = sqe->data;
1115         struct ib_sge sge;
1116         int ret;
1117
1118         if (WARN_ON_ONCE(aer_idx != 0))
1119                 return;
1120
1121         ib_dma_sync_single_for_cpu(dev, sqe->dma, sizeof(*cmd), DMA_TO_DEVICE);
1122
1123         memset(cmd, 0, sizeof(*cmd));
1124         cmd->common.opcode = nvme_admin_async_event;
1125         cmd->common.command_id = NVME_RDMA_AQ_BLKMQ_DEPTH;
1126         cmd->common.flags |= NVME_CMD_SGL_METABUF;
1127         nvme_rdma_set_sg_null(cmd);
1128
1129         ib_dma_sync_single_for_device(dev, sqe->dma, sizeof(*cmd),
1130                         DMA_TO_DEVICE);
1131
1132         ret = nvme_rdma_post_send(queue, sqe, &sge, 1, NULL, false);
1133         WARN_ON_ONCE(ret);
1134 }
1135
1136 static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
1137                 struct nvme_completion *cqe, struct ib_wc *wc, int tag)
1138 {
1139         struct request *rq;
1140         struct nvme_rdma_request *req;
1141         int ret = 0;
1142
1143         rq = blk_mq_tag_to_rq(nvme_rdma_tagset(queue), cqe->command_id);
1144         if (!rq) {
1145                 dev_err(queue->ctrl->ctrl.device,
1146                         "tag 0x%x on QP %#x not found\n",
1147                         cqe->command_id, queue->qp->qp_num);
1148                 nvme_rdma_error_recovery(queue->ctrl);
1149                 return ret;
1150         }
1151         req = blk_mq_rq_to_pdu(rq);
1152
1153         if (rq->tag == tag)
1154                 ret = 1;
1155
1156         if ((wc->wc_flags & IB_WC_WITH_INVALIDATE) &&
1157             wc->ex.invalidate_rkey == req->mr->rkey)
1158                 req->mr->need_inval = false;
1159
1160         req->req.result = cqe->result;
1161         blk_mq_complete_request(rq, le16_to_cpu(cqe->status) >> 1);
1162         return ret;
1163 }
1164
1165 static int __nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc, int tag)
1166 {
1167         struct nvme_rdma_qe *qe =
1168                 container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe);
1169         struct nvme_rdma_queue *queue = cq->cq_context;
1170         struct ib_device *ibdev = queue->device->dev;
1171         struct nvme_completion *cqe = qe->data;
1172         const size_t len = sizeof(struct nvme_completion);
1173         int ret = 0;
1174
1175         if (unlikely(wc->status != IB_WC_SUCCESS)) {
1176                 nvme_rdma_wr_error(cq, wc, "RECV");
1177                 return 0;
1178         }
1179
1180         ib_dma_sync_single_for_cpu(ibdev, qe->dma, len, DMA_FROM_DEVICE);
1181         /*
1182          * AEN requests are special as they don't time out and can
1183          * survive any kind of queue freeze and often don't respond to
1184          * aborts.  We don't even bother to allocate a struct request
1185          * for them but rather special case them here.
1186          */
1187         if (unlikely(nvme_rdma_queue_idx(queue) == 0 &&
1188                         cqe->command_id >= NVME_RDMA_AQ_BLKMQ_DEPTH))
1189                 nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
1190                                 &cqe->result);
1191         else
1192                 ret = nvme_rdma_process_nvme_rsp(queue, cqe, wc, tag);
1193         ib_dma_sync_single_for_device(ibdev, qe->dma, len, DMA_FROM_DEVICE);
1194
1195         nvme_rdma_post_recv(queue, qe);
1196         return ret;
1197 }
1198
1199 static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
1200 {
1201         __nvme_rdma_recv_done(cq, wc, -1);
1202 }
1203
1204 static int nvme_rdma_conn_established(struct nvme_rdma_queue *queue)
1205 {
1206         int ret, i;
1207
1208         for (i = 0; i < queue->queue_size; i++) {
1209                 ret = nvme_rdma_post_recv(queue, &queue->rsp_ring[i]);
1210                 if (ret)
1211                         goto out_destroy_queue_ib;
1212         }
1213
1214         return 0;
1215
1216 out_destroy_queue_ib:
1217         nvme_rdma_destroy_queue_ib(queue);
1218         return ret;
1219 }
1220
1221 static int nvme_rdma_conn_rejected(struct nvme_rdma_queue *queue,
1222                 struct rdma_cm_event *ev)
1223 {
1224         struct rdma_cm_id *cm_id = queue->cm_id;
1225         int status = ev->status;
1226         const char *rej_msg;
1227         const struct nvme_rdma_cm_rej *rej_data;
1228         u8 rej_data_len;
1229
1230         rej_msg = rdma_reject_msg(cm_id, status);
1231         rej_data = rdma_consumer_reject_data(cm_id, ev, &rej_data_len);
1232
1233         if (rej_data && rej_data_len >= sizeof(u16)) {
1234                 u16 sts = le16_to_cpu(rej_data->sts);
1235
1236                 dev_err(queue->ctrl->ctrl.device,
1237                       "Connect rejected: status %d (%s) nvme status %d (%s).\n",
1238                       status, rej_msg, sts, nvme_rdma_cm_msg(sts));
1239         } else {
1240                 dev_err(queue->ctrl->ctrl.device,
1241                         "Connect rejected: status %d (%s).\n", status, rej_msg);
1242         }
1243
1244         return -ECONNRESET;
1245 }
1246
1247 static int nvme_rdma_addr_resolved(struct nvme_rdma_queue *queue)
1248 {
1249         struct nvme_rdma_device *dev;
1250         int ret;
1251
1252         dev = nvme_rdma_find_get_device(queue->cm_id);
1253         if (!dev) {
1254                 dev_err(queue->cm_id->device->dev.parent,
1255                         "no client data found!\n");
1256                 return -ECONNREFUSED;
1257         }
1258
1259         ret = nvme_rdma_create_queue_ib(queue, dev);
1260         if (ret) {
1261                 nvme_rdma_dev_put(dev);
1262                 goto out;
1263         }
1264
1265         ret = rdma_resolve_route(queue->cm_id, NVME_RDMA_CONNECT_TIMEOUT_MS);
1266         if (ret) {
1267                 dev_err(queue->ctrl->ctrl.device,
1268                         "rdma_resolve_route failed (%d).\n",
1269                         queue->cm_error);
1270                 goto out_destroy_queue;
1271         }
1272
1273         return 0;
1274
1275 out_destroy_queue:
1276         nvme_rdma_destroy_queue_ib(queue);
1277 out:
1278         return ret;
1279 }
1280
1281 static int nvme_rdma_route_resolved(struct nvme_rdma_queue *queue)
1282 {
1283         struct nvme_rdma_ctrl *ctrl = queue->ctrl;
1284         struct rdma_conn_param param = { };
1285         struct nvme_rdma_cm_req priv = { };
1286         int ret;
1287
1288         param.qp_num = queue->qp->qp_num;
1289         param.flow_control = 1;
1290
1291         param.responder_resources = queue->device->dev->attrs.max_qp_rd_atom;
1292         /* maximum retry count */
1293         param.retry_count = 7;
1294         param.rnr_retry_count = 7;
1295         param.private_data = &priv;
1296         param.private_data_len = sizeof(priv);
1297
1298         priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
1299         priv.qid = cpu_to_le16(nvme_rdma_queue_idx(queue));
1300         /*
1301          * set the admin queue depth to the minimum size
1302          * specified by the Fabrics standard.
1303          */
1304         if (priv.qid == 0) {
1305                 priv.hrqsize = cpu_to_le16(NVMF_AQ_DEPTH);
1306                 priv.hsqsize = cpu_to_le16(NVMF_AQ_DEPTH - 1);
1307         } else {
1308                 /*
1309                  * The current interpretation of the fabrics spec is that,
1310                  * at a minimum, hrqsize should be sqsize + 1, i.e. the
1311                  * 1's-based representation of sqsize.
1312                  */
1313                 priv.hrqsize = cpu_to_le16(queue->queue_size);
1314                 priv.hsqsize = cpu_to_le16(queue->ctrl->ctrl.sqsize);
1315         }
1316
1317         ret = rdma_connect(queue->cm_id, &param);
1318         if (ret) {
1319                 dev_err(ctrl->ctrl.device,
1320                         "rdma_connect failed (%d).\n", ret);
1321                 goto out_destroy_queue_ib;
1322         }
1323
1324         return 0;
1325
1326 out_destroy_queue_ib:
1327         nvme_rdma_destroy_queue_ib(queue);
1328         return ret;
1329 }
1330
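/*
 * RDMA CM state machine: drives address resolution, route resolution and the
 * actual connect. Fatal events record cm_error and complete cm_done so that
 * nvme_rdma_wait_for_cm() in nvme_rdma_init_queue() can return the result.
 */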
1331 static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
1332                 struct rdma_cm_event *ev)
1333 {
1334         struct nvme_rdma_queue *queue = cm_id->context;
1335         int cm_error = 0;
1336
1337         dev_dbg(queue->ctrl->ctrl.device, "%s (%d): status %d id %p\n",
1338                 rdma_event_msg(ev->event), ev->event,
1339                 ev->status, cm_id);
1340
1341         switch (ev->event) {
1342         case RDMA_CM_EVENT_ADDR_RESOLVED:
1343                 cm_error = nvme_rdma_addr_resolved(queue);
1344                 break;
1345         case RDMA_CM_EVENT_ROUTE_RESOLVED:
1346                 cm_error = nvme_rdma_route_resolved(queue);
1347                 break;
1348         case RDMA_CM_EVENT_ESTABLISHED:
1349                 queue->cm_error = nvme_rdma_conn_established(queue);
1350                 /* complete cm_done regardless of success/failure */
1351                 complete(&queue->cm_done);
1352                 return 0;
1353         case RDMA_CM_EVENT_REJECTED:
1354                 cm_error = nvme_rdma_conn_rejected(queue, ev);
1355                 break;
1356         case RDMA_CM_EVENT_ADDR_ERROR:
1357         case RDMA_CM_EVENT_ROUTE_ERROR:
1358         case RDMA_CM_EVENT_CONNECT_ERROR:
1359         case RDMA_CM_EVENT_UNREACHABLE:
1360                 dev_dbg(queue->ctrl->ctrl.device,
1361                         "CM error event %d\n", ev->event);
1362                 cm_error = -ECONNRESET;
1363                 break;
1364         case RDMA_CM_EVENT_DISCONNECTED:
1365         case RDMA_CM_EVENT_ADDR_CHANGE:
1366         case RDMA_CM_EVENT_TIMEWAIT_EXIT:
1367                 dev_dbg(queue->ctrl->ctrl.device,
1368                         "disconnect received - connection closed\n");
1369                 nvme_rdma_error_recovery(queue->ctrl);
1370                 break;
1371         case RDMA_CM_EVENT_DEVICE_REMOVAL:
1372                 /* device removal is handled via the ib_client API */
1373                 break;
1374         default:
1375                 dev_err(queue->ctrl->ctrl.device,
1376                         "Unexpected RDMA CM event (%d)\n", ev->event);
1377                 nvme_rdma_error_recovery(queue->ctrl);
1378                 break;
1379         }
1380
1381         if (cm_error) {
1382                 queue->cm_error = cm_error;
1383                 complete(&queue->cm_done);
1384         }
1385
1386         return 0;
1387 }
1388
1389 static enum blk_eh_timer_return
1390 nvme_rdma_timeout(struct request *rq, bool reserved)
1391 {
1392         struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
1393
1394         /* queue error recovery */
1395         nvme_rdma_error_recovery(req->queue->ctrl);
1396
1397         /* fail with DNR on cmd timeout */
1398         rq->errors = NVME_SC_ABORT_REQ | NVME_SC_DNR;
1399
1400         return BLK_EH_HANDLED;
1401 }
1402
1403 /*
1404  * We cannot accept any other command until the Connect command has completed.
1405  */
1406 static inline bool nvme_rdma_queue_is_ready(struct nvme_rdma_queue *queue,
1407                 struct request *rq)
1408 {
1409         if (unlikely(!test_bit(NVME_RDMA_Q_LIVE, &queue->flags))) {
1410                 struct nvme_command *cmd = nvme_req(rq)->cmd;
1411
1412                 if (!blk_rq_is_passthrough(rq) ||
1413                     cmd->common.opcode != nvme_fabrics_command ||
1414                     cmd->fabrics.fctype != nvme_fabrics_type_connect)
1415                         return false;
1416         }
1417
1418         return true;
1419 }
1420
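/*
 * .queue_rq: return BLK_MQ_RQ_QUEUE_BUSY for anything but the fabrics Connect
 * command until the queue is live, build the SQE in the pre-mapped buffer,
 * map the data (inline, global rkey or MR), and post the SEND, chaining the
 * IB_WR_REG_MR in front of it when memory registration is required. FLUSH
 * requests are always posted signalled.
 */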
1421 static int nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
1422                 const struct blk_mq_queue_data *bd)
1423 {
1424         struct nvme_ns *ns = hctx->queue->queuedata;
1425         struct nvme_rdma_queue *queue = hctx->driver_data;
1426         struct request *rq = bd->rq;
1427         struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
1428         struct nvme_rdma_qe *sqe = &req->sqe;
1429         struct nvme_command *c = sqe->data;
1430         bool flush = false;
1431         struct ib_device *dev;
1432         int ret;
1433
1434         WARN_ON_ONCE(rq->tag < 0);
1435
1436         if (!nvme_rdma_queue_is_ready(queue, rq))
1437                 return BLK_MQ_RQ_QUEUE_BUSY;
1438
1439         dev = queue->device->dev;
1440         ib_dma_sync_single_for_cpu(dev, sqe->dma,
1441                         sizeof(struct nvme_command), DMA_TO_DEVICE);
1442
1443         ret = nvme_setup_cmd(ns, rq, c);
1444         if (ret != BLK_MQ_RQ_QUEUE_OK)
1445                 return ret;
1446
1447         blk_mq_start_request(rq);
1448
1449         ret = nvme_rdma_map_data(queue, rq, c);
1450         if (ret < 0) {
1451                 dev_err(queue->ctrl->ctrl.device,
1452                              "Failed to map data (%d)\n", ret);
1453                 nvme_cleanup_cmd(rq);
1454                 goto err;
1455         }
1456
1457         ib_dma_sync_single_for_device(dev, sqe->dma,
1458                         sizeof(struct nvme_command), DMA_TO_DEVICE);
1459
1460         if (req_op(rq) == REQ_OP_FLUSH)
1461                 flush = true;
1462         ret = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge,
1463                         req->mr->need_inval ? &req->reg_wr.wr : NULL, flush);
1464         if (ret) {
1465                 nvme_rdma_unmap_data(queue, rq);
1466                 goto err;
1467         }
1468
1469         return BLK_MQ_RQ_QUEUE_OK;
1470 err:
1471         return (ret == -ENOMEM || ret == -EAGAIN) ?
1472                 BLK_MQ_RQ_QUEUE_BUSY : BLK_MQ_RQ_QUEUE_ERROR;
1473 }
1474
1475 static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
1476 {
1477         struct nvme_rdma_queue *queue = hctx->driver_data;
1478         struct ib_cq *cq = queue->ib_cq;
1479         struct ib_wc wc;
1480         int found = 0;
1481
1482         ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
1483         while (ib_poll_cq(cq, 1, &wc) > 0) {
1484                 struct ib_cqe *cqe = wc.wr_cqe;
1485
1486                 if (cqe) {
1487                         if (cqe->done == nvme_rdma_recv_done)
1488                                 found |= __nvme_rdma_recv_done(cq, &wc, tag);
1489                         else
1490                                 cqe->done(cq, &wc);
1491                 }
1492         }
1493
1494         return found;
1495 }
1496
1497 static void nvme_rdma_complete_rq(struct request *rq)
1498 {
1499         struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
1500         struct nvme_rdma_queue *queue = req->queue;
1501         int error = 0;
1502
1503         nvme_rdma_unmap_data(queue, rq);
1504
1505         if (unlikely(rq->errors)) {
1506                 if (nvme_req_needs_retry(rq, rq->errors)) {
1507                         nvme_requeue_req(rq);
1508                         return;
1509                 }
1510
1511                 if (blk_rq_is_passthrough(rq))
1512                         error = rq->errors;
1513                 else
1514                         error = nvme_error_status(rq->errors);
1515         }
1516
1517         blk_mq_end_request(rq, error);
1518 }
1519
1520 static struct blk_mq_ops nvme_rdma_mq_ops = {
1521         .queue_rq       = nvme_rdma_queue_rq,
1522         .complete       = nvme_rdma_complete_rq,
1523         .init_request   = nvme_rdma_init_request,
1524         .exit_request   = nvme_rdma_exit_request,
1525         .reinit_request = nvme_rdma_reinit_request,
1526         .init_hctx      = nvme_rdma_init_hctx,
1527         .poll           = nvme_rdma_poll,
1528         .timeout        = nvme_rdma_timeout,
1529 };
1530
1531 static struct blk_mq_ops nvme_rdma_admin_mq_ops = {
1532         .queue_rq       = nvme_rdma_queue_rq,
1533         .complete       = nvme_rdma_complete_rq,
1534         .init_request   = nvme_rdma_init_admin_request,
1535         .exit_request   = nvme_rdma_exit_admin_request,
1536         .reinit_request = nvme_rdma_reinit_request,
1537         .init_hctx      = nvme_rdma_init_admin_hctx,
1538         .timeout        = nvme_rdma_timeout,
1539 };
1540
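/*
 * Bring up the admin queue: create queue 0 and take a reference on the
 * device, size max_fr_pages, allocate the admin tagset and request queue,
 * issue the fabrics Connect, read CAP, enable the controller, identify it,
 * allocate the AEN SQE and start keep-alive.
 */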
1541 static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl)
1542 {
1543         int error;
1544
1545         error = nvme_rdma_init_queue(ctrl, 0, NVMF_AQ_DEPTH);
1546         if (error)
1547                 return error;
1548
1549         ctrl->device = ctrl->queues[0].device;
1550
1551         /*
1552          * We need a reference on the device as long as the tag_set is alive,
1553          * as the MRs in the request structures need a valid ib_device.
1554          */
1555         error = -EINVAL;
1556         if (!nvme_rdma_dev_get(ctrl->device))
1557                 goto out_free_queue;
1558
1559         ctrl->max_fr_pages = min_t(u32, NVME_RDMA_MAX_SEGMENTS,
1560                 ctrl->device->dev->attrs.max_fast_reg_page_list_len);
1561
1562         memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set));
1563         ctrl->admin_tag_set.ops = &nvme_rdma_admin_mq_ops;
1564         ctrl->admin_tag_set.queue_depth = NVME_RDMA_AQ_BLKMQ_DEPTH;
1565         ctrl->admin_tag_set.reserved_tags = 2; /* connect + keep-alive */
1566         ctrl->admin_tag_set.numa_node = NUMA_NO_NODE;
1567         ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_rdma_request) +
1568                 SG_CHUNK_SIZE * sizeof(struct scatterlist);
1569         ctrl->admin_tag_set.driver_data = ctrl;
1570         ctrl->admin_tag_set.nr_hw_queues = 1;
1571         ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT;
1572
1573         error = blk_mq_alloc_tag_set(&ctrl->admin_tag_set);
1574         if (error)
1575                 goto out_put_dev;
1576
1577         ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set);
1578         if (IS_ERR(ctrl->ctrl.admin_q)) {
1579                 error = PTR_ERR(ctrl->ctrl.admin_q);
1580                 goto out_free_tagset;
1581         }
1582
1583         error = nvmf_connect_admin_queue(&ctrl->ctrl);
1584         if (error)
1585                 goto out_cleanup_queue;
1586
1587         set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[0].flags);
1588
1589         error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->cap);
1590         if (error) {
1591                 dev_err(ctrl->ctrl.device,
1592                         "prop_get NVME_REG_CAP failed\n");
1593                 goto out_cleanup_queue;
1594         }
1595
1596         ctrl->ctrl.sqsize =
1597                 min_t(int, NVME_CAP_MQES(ctrl->cap) + 1, ctrl->ctrl.sqsize);
1598
1599         error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap);
1600         if (error)
1601                 goto out_cleanup_queue;
1602
1603         ctrl->ctrl.max_hw_sectors =
1604                 (ctrl->max_fr_pages - 1) << (PAGE_SHIFT - 9);
1605
1606         error = nvme_init_identify(&ctrl->ctrl);
1607         if (error)
1608                 goto out_cleanup_queue;
1609
1610         error = nvme_rdma_alloc_qe(ctrl->queues[0].device->dev,
1611                         &ctrl->async_event_sqe, sizeof(struct nvme_command),
1612                         DMA_TO_DEVICE);
1613         if (error)
1614                 goto out_cleanup_queue;
1615
1616         nvme_start_keep_alive(&ctrl->ctrl);
1617
1618         return 0;
1619
1620 out_cleanup_queue:
1621         blk_cleanup_queue(ctrl->ctrl.admin_q);
1622 out_free_tagset:
1623         /* disconnect and drain the queue before freeing the tagset */
1624         nvme_rdma_stop_queue(&ctrl->queues[0]);
1625         blk_mq_free_tag_set(&ctrl->admin_tag_set);
1626 out_put_dev:
1627         nvme_rdma_dev_put(ctrl->device);
1628 out_free_queue:
1629         nvme_rdma_free_queue(&ctrl->queues[0]);
1630         return error;
1631 }
1632
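/*
 * Orderly teardown: stop keep-alive and the error/reconnect work,
 * cancel outstanding I/O and admin requests, shut down the controller
 * if the admin queue is still connected, and free the queues.
 */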
1633 static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl)
1634 {
1635         nvme_stop_keep_alive(&ctrl->ctrl);
1636         cancel_work_sync(&ctrl->err_work);
1637         cancel_delayed_work_sync(&ctrl->reconnect_work);
1638
1639         if (ctrl->queue_count > 1) {
1640                 nvme_stop_queues(&ctrl->ctrl);
1641                 blk_mq_tagset_busy_iter(&ctrl->tag_set,
1642                                         nvme_cancel_request, &ctrl->ctrl);
1643                 nvme_rdma_free_io_queues(ctrl);
1644         }
1645
1646         if (test_bit(NVME_RDMA_Q_CONNECTED, &ctrl->queues[0].flags))
1647                 nvme_shutdown_ctrl(&ctrl->ctrl);
1648
1649         blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
1650         blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
1651                                 nvme_cancel_request, &ctrl->ctrl);
1652         nvme_rdma_destroy_admin_queue(ctrl);
1653 }
1654
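/*
 * Common removal path: uninit the core controller, optionally run the
 * full shutdown sequence, release the I/O tag set and its device
 * reference, and drop a controller reference.
 */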
1655 static void __nvme_rdma_remove_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
1656 {
1657         nvme_uninit_ctrl(&ctrl->ctrl);
1658         if (shutdown)
1659                 nvme_rdma_shutdown_ctrl(ctrl);
1660
1661         if (ctrl->ctrl.tagset) {
1662                 blk_cleanup_queue(ctrl->ctrl.connect_q);
1663                 blk_mq_free_tag_set(&ctrl->tag_set);
1664                 nvme_rdma_dev_put(ctrl->device);
1665         }
1666
1667         nvme_put_ctrl(&ctrl->ctrl);
1668 }
1669
1670 static void nvme_rdma_del_ctrl_work(struct work_struct *work)
1671 {
1672         struct nvme_rdma_ctrl *ctrl = container_of(work,
1673                                 struct nvme_rdma_ctrl, delete_work);
1674
1675         __nvme_rdma_remove_ctrl(ctrl, true);
1676 }
1677
1678 static int __nvme_rdma_del_ctrl(struct nvme_rdma_ctrl *ctrl)
1679 {
1680         if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING))
1681                 return -EBUSY;
1682
1683         if (!queue_work(nvme_rdma_wq, &ctrl->delete_work))
1684                 return -EBUSY;
1685
1686         return 0;
1687 }
1688
1689 static int nvme_rdma_del_ctrl(struct nvme_ctrl *nctrl)
1690 {
1691         struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
1692         int ret = 0;
1693
1694         /*
1695          * Keep a reference until all work is flushed since
1696          * __nvme_rdma_del_ctrl can free the ctrl memory
1697          */
1698         if (!kref_get_unless_zero(&ctrl->ctrl.kref))
1699                 return -EBUSY;
1700         ret = __nvme_rdma_del_ctrl(ctrl);
1701         if (!ret)
1702                 flush_work(&ctrl->delete_work);
1703         nvme_put_ctrl(&ctrl->ctrl);
1704         return ret;
1705 }
1706
1707 static void nvme_rdma_remove_ctrl_work(struct work_struct *work)
1708 {
1709         struct nvme_rdma_ctrl *ctrl = container_of(work,
1710                                 struct nvme_rdma_ctrl, delete_work);
1711
1712         __nvme_rdma_remove_ctrl(ctrl, false);
1713 }
1714
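/*
 * Controller reset: shut everything down, then rebuild the admin and
 * I/O queues.  If the rebuild fails the controller is scheduled for
 * removal instead.
 */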
1715 static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
1716 {
1717         struct nvme_rdma_ctrl *ctrl = container_of(work,
1718                                         struct nvme_rdma_ctrl, reset_work);
1719         int ret;
1720         bool changed;
1721
1722         nvme_rdma_shutdown_ctrl(ctrl);
1723
1724         ret = nvme_rdma_configure_admin_queue(ctrl);
1725         if (ret) {
1726                 /* ctrl is already shut down, just remove it */
1727                 INIT_WORK(&ctrl->delete_work, nvme_rdma_remove_ctrl_work);
1728                 goto del_dead_ctrl;
1729         }
1730
1731         if (ctrl->queue_count > 1) {
1732                 ret = blk_mq_reinit_tagset(&ctrl->tag_set);
1733                 if (ret)
1734                         goto del_dead_ctrl;
1735
1736                 ret = nvme_rdma_init_io_queues(ctrl);
1737                 if (ret)
1738                         goto del_dead_ctrl;
1739
1740                 ret = nvme_rdma_connect_io_queues(ctrl);
1741                 if (ret)
1742                         goto del_dead_ctrl;
1743         }
1744
1745         changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
1746         WARN_ON_ONCE(!changed);
1747
1748         if (ctrl->queue_count > 1) {
1749                 nvme_start_queues(&ctrl->ctrl);
1750                 nvme_queue_scan(&ctrl->ctrl);
1751                 nvme_queue_async_events(&ctrl->ctrl);
1752         }
1753
1754         return;
1755
1756 del_dead_ctrl:
1757         /* Deleting this dead controller... */
1758         dev_warn(ctrl->ctrl.device, "Removing after reset failure\n");
1759         WARN_ON(!queue_work(nvme_rdma_wq, &ctrl->delete_work));
1760 }
1761
1762 static int nvme_rdma_reset_ctrl(struct nvme_ctrl *nctrl)
1763 {
1764         struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
1765
1766         if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING))
1767                 return -EBUSY;
1768
1769         if (!queue_work(nvme_rdma_wq, &ctrl->reset_work))
1770                 return -EBUSY;
1771
1772         flush_work(&ctrl->reset_work);
1773
1774         return 0;
1775 }
1776
1777 static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
1778         .name                   = "rdma",
1779         .module                 = THIS_MODULE,
1780         .is_fabrics             = true,
1781         .reg_read32             = nvmf_reg_read32,
1782         .reg_read64             = nvmf_reg_read64,
1783         .reg_write32            = nvmf_reg_write32,
1784         .reset_ctrl             = nvme_rdma_reset_ctrl,
1785         .free_ctrl              = nvme_rdma_free_ctrl,
1786         .submit_async_event     = nvme_rdma_submit_async_event,
1787         .delete_ctrl            = nvme_rdma_del_ctrl,
1788         .get_subsysnqn          = nvmf_get_subsysnqn,
1789         .get_address            = nvmf_get_address,
1790 };
1791
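/*
 * Negotiate the I/O queue count with the target, allocate the RDMA
 * queues and the I/O tag set, and issue a fabrics Connect on every
 * I/O queue.
 */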
1792 static int nvme_rdma_create_io_queues(struct nvme_rdma_ctrl *ctrl)
1793 {
1794         struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
1795         int ret;
1796
1797         ret = nvme_set_queue_count(&ctrl->ctrl, &opts->nr_io_queues);
1798         if (ret)
1799                 return ret;
1800
1801         ctrl->queue_count = opts->nr_io_queues + 1;
1802         if (ctrl->queue_count < 2)
1803                 return 0;
1804
1805         dev_info(ctrl->ctrl.device,
1806                 "creating %d I/O queues.\n", opts->nr_io_queues);
1807
1808         ret = nvme_rdma_init_io_queues(ctrl);
1809         if (ret)
1810                 return ret;
1811
1812         /*
1813          * We need a reference on the device as long as the tag_set is alive,
1814          * as the MRs in the request structures need a valid ib_device.
1815          */
1816         ret = -EINVAL;
1817         if (!nvme_rdma_dev_get(ctrl->device))
1818                 goto out_free_io_queues;
1819
1820         memset(&ctrl->tag_set, 0, sizeof(ctrl->tag_set));
1821         ctrl->tag_set.ops = &nvme_rdma_mq_ops;
1822         ctrl->tag_set.queue_depth = ctrl->ctrl.opts->queue_size;
1823         ctrl->tag_set.reserved_tags = 1; /* fabric connect */
1824         ctrl->tag_set.numa_node = NUMA_NO_NODE;
1825         ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
1826         ctrl->tag_set.cmd_size = sizeof(struct nvme_rdma_request) +
1827                 SG_CHUNK_SIZE * sizeof(struct scatterlist);
1828         ctrl->tag_set.driver_data = ctrl;
1829         ctrl->tag_set.nr_hw_queues = ctrl->queue_count - 1;
1830         ctrl->tag_set.timeout = NVME_IO_TIMEOUT;
1831
1832         ret = blk_mq_alloc_tag_set(&ctrl->tag_set);
1833         if (ret)
1834                 goto out_put_dev;
1835         ctrl->ctrl.tagset = &ctrl->tag_set;
1836
1837         ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set);
1838         if (IS_ERR(ctrl->ctrl.connect_q)) {
1839                 ret = PTR_ERR(ctrl->ctrl.connect_q);
1840                 goto out_free_tag_set;
1841         }
1842
1843         ret = nvme_rdma_connect_io_queues(ctrl);
1844         if (ret)
1845                 goto out_cleanup_connect_q;
1846
1847         return 0;
1848
1849 out_cleanup_connect_q:
1850         blk_cleanup_queue(ctrl->ctrl.connect_q);
1851 out_free_tag_set:
1852         blk_mq_free_tag_set(&ctrl->tag_set);
1853 out_put_dev:
1854         nvme_rdma_dev_put(ctrl->device);
1855 out_free_io_queues:
1856         nvme_rdma_free_io_queues(ctrl);
1857         return ret;
1858 }
1859
1860 static int nvme_rdma_parse_ipaddr(struct sockaddr_in *in_addr, char *p)
1861 {
1862         u8 *addr = (u8 *)&in_addr->sin_addr.s_addr;
1863         size_t buflen = strlen(p);
1864
1865         /* XXX: handle IPv6 addresses */
1866
1867         if (buflen > INET_ADDRSTRLEN)
1868                 return -EINVAL;
1869         if (in4_pton(p, buflen, addr, '\0', NULL) == 0)
1870                 return -EINVAL;
1871         in_addr->sin_family = AF_INET;
1872         return 0;
1873 }
1874
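/*
 * .create_ctrl entry point: parse the transport address(es), bring up
 * the admin queue, sanity-check the controller against the connect
 * options, create the I/O queues and mark the controller live.
 */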
1875 static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
1876                 struct nvmf_ctrl_options *opts)
1877 {
1878         struct nvme_rdma_ctrl *ctrl;
1879         int ret;
1880         bool changed;
1881
1882         ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
1883         if (!ctrl)
1884                 return ERR_PTR(-ENOMEM);
1885         ctrl->ctrl.opts = opts;
1886         INIT_LIST_HEAD(&ctrl->list);
1887
1888         ret = nvme_rdma_parse_ipaddr(&ctrl->addr_in, opts->traddr);
1889         if (ret) {
1890                 pr_err("malformed IP address passed: %s\n", opts->traddr);
1891                 goto out_free_ctrl;
1892         }
1893
1894         if (opts->mask & NVMF_OPT_HOST_TRADDR) {
1895                 ret = nvme_rdma_parse_ipaddr(&ctrl->src_addr_in,
1896                                 opts->host_traddr);
1897                 if (ret) {
1898                         pr_err("malformed src IP address passed: %s\n",
1899                                opts->host_traddr);
1900                         goto out_free_ctrl;
1901                 }
1902         }
1903
1904         if (opts->mask & NVMF_OPT_TRSVCID) {
1905                 u16 port;
1906
1907                 ret = kstrtou16(opts->trsvcid, 0, &port);
1908                 if (ret)
1909                         goto out_free_ctrl;
1910
1911                 ctrl->addr_in.sin_port = cpu_to_be16(port);
1912         } else {
1913                 ctrl->addr_in.sin_port = cpu_to_be16(NVME_RDMA_IP_PORT);
1914         }
1915
1916         ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_rdma_ctrl_ops,
1917                                 0 /* no quirks, we're perfect! */);
1918         if (ret)
1919                 goto out_free_ctrl;
1920
1921         ctrl->reconnect_delay = opts->reconnect_delay;
1922         INIT_DELAYED_WORK(&ctrl->reconnect_work,
1923                         nvme_rdma_reconnect_ctrl_work);
1924         INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work);
1925         INIT_WORK(&ctrl->delete_work, nvme_rdma_del_ctrl_work);
1926         INIT_WORK(&ctrl->reset_work, nvme_rdma_reset_ctrl_work);
1927         spin_lock_init(&ctrl->lock);
1928
1929         ctrl->queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */
1930         ctrl->ctrl.sqsize = opts->queue_size - 1;
1931         ctrl->ctrl.kato = opts->kato;
1932
1933         ret = -ENOMEM;
1934         ctrl->queues = kcalloc(ctrl->queue_count, sizeof(*ctrl->queues),
1935                                 GFP_KERNEL);
1936         if (!ctrl->queues)
1937                 goto out_uninit_ctrl;
1938
1939         ret = nvme_rdma_configure_admin_queue(ctrl);
1940         if (ret)
1941                 goto out_kfree_queues;
1942
1943         /* sanity check icdoff */
1944         if (ctrl->ctrl.icdoff) {
1945                 dev_err(ctrl->ctrl.device, "icdoff is not supported!\n");
1946                 goto out_remove_admin_queue;
1947         }
1948
1949         /* sanity check keyed sgls */
1950         if (!(ctrl->ctrl.sgls & (1 << 20))) {
1951                 dev_err(ctrl->ctrl.device, "Mandatory keyed sgls are not supported\n");
1952                 goto out_remove_admin_queue;
1953         }
1954
1955         if (opts->queue_size > ctrl->ctrl.maxcmd) {
1956                 /* warn if maxcmd is lower than queue_size */
1957                 dev_warn(ctrl->ctrl.device,
1958                         "queue_size %zu > ctrl maxcmd %u, clamping down\n",
1959                         opts->queue_size, ctrl->ctrl.maxcmd);
1960                 opts->queue_size = ctrl->ctrl.maxcmd;
1961         }
1962
1963         if (opts->queue_size > ctrl->ctrl.sqsize + 1) {
1964                 /* warn if sqsize is lower than queue_size */
1965                 dev_warn(ctrl->ctrl.device,
1966                         "queue_size %zu > ctrl sqsize %u, clamping down\n",
1967                         opts->queue_size, ctrl->ctrl.sqsize + 1);
1968                 opts->queue_size = ctrl->ctrl.sqsize + 1;
1969         }
1970
1971         if (opts->nr_io_queues) {
1972                 ret = nvme_rdma_create_io_queues(ctrl);
1973                 if (ret)
1974                         goto out_remove_admin_queue;
1975         }
1976
1977         changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
1978         WARN_ON_ONCE(!changed);
1979
1980         dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n",
1981                 ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
1982
1983         kref_get(&ctrl->ctrl.kref);
1984
1985         mutex_lock(&nvme_rdma_ctrl_mutex);
1986         list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list);
1987         mutex_unlock(&nvme_rdma_ctrl_mutex);
1988
1989         if (opts->nr_io_queues) {
1990                 nvme_queue_scan(&ctrl->ctrl);
1991                 nvme_queue_async_events(&ctrl->ctrl);
1992         }
1993
1994         return &ctrl->ctrl;
1995
1996 out_remove_admin_queue:
1997         nvme_stop_keep_alive(&ctrl->ctrl);
1998         nvme_rdma_destroy_admin_queue(ctrl);
1999 out_kfree_queues:
2000         kfree(ctrl->queues);
2001 out_uninit_ctrl:
2002         nvme_uninit_ctrl(&ctrl->ctrl);
2003         nvme_put_ctrl(&ctrl->ctrl);
2004         if (ret > 0)
2005                 ret = -EIO;
2006         return ERR_PTR(ret);
2007 out_free_ctrl:
2008         kfree(ctrl);
2009         return ERR_PTR(ret);
2010 }
2011
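/*
 * Transport registration; the fabrics layer invokes .create_ctrl for
 * connect requests that specify transport=rdma.
 */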
2012 static struct nvmf_transport_ops nvme_rdma_transport = {
2013         .name           = "rdma",
2014         .required_opts  = NVMF_OPT_TRADDR,
2015         .allowed_opts   = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
2016                           NVMF_OPT_HOST_TRADDR,
2017         .create_ctrl    = nvme_rdma_create_ctrl,
2018 };
2019
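/*
 * The IB client is registered only so that nvme_rdma_remove_one() can
 * delete controllers when their underlying device is unplugged; device
 * addition needs no action.
 */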
2020 static void nvme_rdma_add_one(struct ib_device *ib_device)
2021 {
2022 }
2023
2024 static void nvme_rdma_remove_one(struct ib_device *ib_device, void *client_data)
2025 {
2026         struct nvme_rdma_ctrl *ctrl;
2027
2028         /* Delete all controllers using this device */
2029         mutex_lock(&nvme_rdma_ctrl_mutex);
2030         list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) {
2031                 if (ctrl->device->dev != ib_device)
2032                         continue;
2033                 dev_info(ctrl->ctrl.device,
2034                         "Removing ctrl: NQN \"%s\", addr %pISp\n",
2035                         ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
2036                 __nvme_rdma_del_ctrl(ctrl);
2037         }
2038         mutex_unlock(&nvme_rdma_ctrl_mutex);
2039
2040         flush_workqueue(nvme_rdma_wq);
2041 }
2042
2043 static struct ib_client nvme_rdma_ib_client = {
2044         .name   = "nvme_rdma",
2045         .add = nvme_rdma_add_one,
2046         .remove = nvme_rdma_remove_one
2047 };
2048
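/*
 * Module init: create the driver workqueue and register the IB client
 * before registering the "rdma" fabrics transport; cleanup runs in the
 * reverse order.
 */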
2049 static int __init nvme_rdma_init_module(void)
2050 {
2051         int ret;
2052
2053         nvme_rdma_wq = create_workqueue("nvme_rdma_wq");
2054         if (!nvme_rdma_wq)
2055                 return -ENOMEM;
2056
2057         ret = ib_register_client(&nvme_rdma_ib_client);
2058         if (ret) {
2059                 destroy_workqueue(nvme_rdma_wq);
2060                 return ret;
2061         }
2062
2063         return nvmf_register_transport(&nvme_rdma_transport);
2064 }
2065
2066 static void __exit nvme_rdma_cleanup_module(void)
2067 {
2068         nvmf_unregister_transport(&nvme_rdma_transport);
2069         ib_unregister_client(&nvme_rdma_ib_client);
2070         destroy_workqueue(nvme_rdma_wq);
2071 }
2072
2073 module_init(nvme_rdma_init_module);
2074 module_exit(nvme_rdma_cleanup_module);
2075
2076 MODULE_LICENSE("GPL v2");