[platform/kernel/linux-rpi.git] drivers/nvme/host/tcp.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * NVMe over Fabrics TCP host.
4  * Copyright (c) 2018 Lightbits Labs. All rights reserved.
5  */
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7 #include <linux/module.h>
8 #include <linux/init.h>
9 #include <linux/slab.h>
10 #include <linux/err.h>
11 #include <linux/nvme-tcp.h>
12 #include <net/sock.h>
13 #include <net/tcp.h>
14 #include <linux/blk-mq.h>
15 #include <crypto/hash.h>
16 #include <net/busy_poll.h>
17
18 #include "nvme.h"
19 #include "fabrics.h"
20
21 struct nvme_tcp_queue;
22
23 /* Define the socket priority to use for connections where it is desirable
24  * that the NIC consider performing optimized packet processing or filtering.
25  * A non-zero value is sufficient to indicate general consideration of any
26  * possible optimization.  Making it a module param allows for alternative
27  * values that may be unique for some NIC implementations.
28  */
29 static int so_priority;
30 module_param(so_priority, int, 0644);
31 MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority");
32
33 enum nvme_tcp_send_state {
34         NVME_TCP_SEND_CMD_PDU = 0,
35         NVME_TCP_SEND_H2C_PDU,
36         NVME_TCP_SEND_DATA,
37         NVME_TCP_SEND_DDGST,
38 };
39
40 struct nvme_tcp_request {
41         struct nvme_request     req;
42         void                    *pdu;
43         struct nvme_tcp_queue   *queue;
44         u32                     data_len;
45         u32                     pdu_len;
46         u32                     pdu_sent;
47         u16                     ttag;
48         struct list_head        entry;
49         struct llist_node       lentry;
50         __le32                  ddgst;
51
52         struct bio              *curr_bio;
53         struct iov_iter         iter;
54
55         /* send state */
56         size_t                  offset;
57         size_t                  data_sent;
58         enum nvme_tcp_send_state state;
59 };
60
61 enum nvme_tcp_queue_flags {
62         NVME_TCP_Q_ALLOCATED    = 0,
63         NVME_TCP_Q_LIVE         = 1,
64         NVME_TCP_Q_POLLING      = 2,
65 };
66
67 enum nvme_tcp_recv_state {
68         NVME_TCP_RECV_PDU = 0,
69         NVME_TCP_RECV_DATA,
70         NVME_TCP_RECV_DDGST,
71 };
72
73 struct nvme_tcp_ctrl;
74 struct nvme_tcp_queue {
75         struct socket           *sock;
76         struct work_struct      io_work;
77         int                     io_cpu;
78
79         struct mutex            queue_lock;
80         struct mutex            send_mutex;
81         struct llist_head       req_list;
82         struct list_head        send_list;
83         bool                    more_requests;
84
85         /* recv state */
86         void                    *pdu;
87         int                     pdu_remaining;
88         int                     pdu_offset;
89         size_t                  data_remaining;
90         size_t                  ddgst_remaining;
91         unsigned int            nr_cqe;
92
93         /* send state */
94         struct nvme_tcp_request *request;
95
96         int                     queue_size;
97         size_t                  cmnd_capsule_len;
98         struct nvme_tcp_ctrl    *ctrl;
99         unsigned long           flags;
100         bool                    rd_enabled;
101
102         bool                    hdr_digest;
103         bool                    data_digest;
104         struct ahash_request    *rcv_hash;
105         struct ahash_request    *snd_hash;
106         __le32                  exp_ddgst;
107         __le32                  recv_ddgst;
108
109         struct page_frag_cache  pf_cache;
110
111         void (*state_change)(struct sock *);
112         void (*data_ready)(struct sock *);
113         void (*write_space)(struct sock *);
114 };
115
116 struct nvme_tcp_ctrl {
117         /* read only in the hot path */
118         struct nvme_tcp_queue   *queues;
119         struct blk_mq_tag_set   tag_set;
120
121         /* other member variables */
122         struct list_head        list;
123         struct blk_mq_tag_set   admin_tag_set;
124         struct sockaddr_storage addr;
125         struct sockaddr_storage src_addr;
126         struct net_device       *ndev;
127         struct nvme_ctrl        ctrl;
128
129         struct work_struct      err_work;
130         struct delayed_work     connect_work;
131         struct nvme_tcp_request async_req;
132         u32                     io_queues[HCTX_MAX_TYPES];
133 };
134
135 static LIST_HEAD(nvme_tcp_ctrl_list);
136 static DEFINE_MUTEX(nvme_tcp_ctrl_mutex);
137 static struct workqueue_struct *nvme_tcp_wq;
138 static const struct blk_mq_ops nvme_tcp_mq_ops;
139 static const struct blk_mq_ops nvme_tcp_admin_mq_ops;
140 static int nvme_tcp_try_send(struct nvme_tcp_queue *queue);
141
142 static inline struct nvme_tcp_ctrl *to_tcp_ctrl(struct nvme_ctrl *ctrl)
143 {
144         return container_of(ctrl, struct nvme_tcp_ctrl, ctrl);
145 }
146
147 static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue)
148 {
149         return queue - queue->ctrl->queues;
150 }
151
152 static inline struct blk_mq_tags *nvme_tcp_tagset(struct nvme_tcp_queue *queue)
153 {
154         u32 queue_idx = nvme_tcp_queue_id(queue);
155
156         if (queue_idx == 0)
157                 return queue->ctrl->admin_tag_set.tags[queue_idx];
158         return queue->ctrl->tag_set.tags[queue_idx - 1];
159 }
160
161 static inline u8 nvme_tcp_hdgst_len(struct nvme_tcp_queue *queue)
162 {
163         return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
164 }
165
166 static inline u8 nvme_tcp_ddgst_len(struct nvme_tcp_queue *queue)
167 {
168         return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
169 }
170
171 static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_queue *queue)
172 {
173         return queue->cmnd_capsule_len - sizeof(struct nvme_command);
174 }
175
176 static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req)
177 {
178         return req == &req->queue->ctrl->async_req;
179 }
180
181 static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req)
182 {
183         struct request *rq;
184
185         if (unlikely(nvme_tcp_async_req(req)))
186                 return false; /* async events don't have a request */
187
188         rq = blk_mq_rq_from_pdu(req);
189
190         return rq_data_dir(rq) == WRITE && req->data_len &&
191                 req->data_len <= nvme_tcp_inline_data_size(req->queue);
192 }
193
194 static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req)
195 {
196         return req->iter.bvec->bv_page;
197 }
198
199 static inline size_t nvme_tcp_req_cur_offset(struct nvme_tcp_request *req)
200 {
201         return req->iter.bvec->bv_offset + req->iter.iov_offset;
202 }
203
204 static inline size_t nvme_tcp_req_cur_length(struct nvme_tcp_request *req)
205 {
206         return min_t(size_t, iov_iter_single_seg_count(&req->iter),
207                         req->pdu_len - req->pdu_sent);
208 }
209
210 static inline size_t nvme_tcp_pdu_data_left(struct nvme_tcp_request *req)
211 {
212         return rq_data_dir(blk_mq_rq_from_pdu(req)) == WRITE ?
213                         req->pdu_len - req->pdu_sent : 0;
214 }
215
216 static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req,
217                 int len)
218 {
219         return nvme_tcp_pdu_data_left(req) <= len;
220 }
221
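/*
 * Point req->iter at the data of req->curr_bio (or at the request's
 * special payload), starting at the bio's current offset, so the send
 * and receive paths can walk it with the iov_iter helpers.
 */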
222 static void nvme_tcp_init_iter(struct nvme_tcp_request *req,
223                 unsigned int dir)
224 {
225         struct request *rq = blk_mq_rq_from_pdu(req);
226         struct bio_vec *vec;
227         unsigned int size;
228         int nr_bvec;
229         size_t offset;
230
231         if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) {
232                 vec = &rq->special_vec;
233                 nr_bvec = 1;
234                 size = blk_rq_payload_bytes(rq);
235                 offset = 0;
236         } else {
237                 struct bio *bio = req->curr_bio;
238                 struct bvec_iter bi;
239                 struct bio_vec bv;
240
241                 vec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
242                 nr_bvec = 0;
243                 bio_for_each_bvec(bv, bio, bi) {
244                         nr_bvec++;
245                 }
246                 size = bio->bi_iter.bi_size;
247                 offset = bio->bi_iter.bi_bvec_done;
248         }
249
250         iov_iter_bvec(&req->iter, dir, vec, nr_bvec, size);
251         req->iter.iov_offset = offset;
252 }
253
254 static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req,
255                 int len)
256 {
257         req->data_sent += len;
258         req->pdu_sent += len;
259         iov_iter_advance(&req->iter, len);
260         if (!iov_iter_count(&req->iter) &&
261             req->data_sent < req->data_len) {
262                 req->curr_bio = req->curr_bio->bi_next;
263                 nvme_tcp_init_iter(req, WRITE);
264         }
265 }
266
267 static inline void nvme_tcp_send_all(struct nvme_tcp_queue *queue)
268 {
269         int ret;
270
271         /* drain the send queue as much as we can... */
272         do {
273                 ret = nvme_tcp_try_send(queue);
274         } while (ret > 0);
275 }
276
277 static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
278                 bool sync, bool last)
279 {
280         struct nvme_tcp_queue *queue = req->queue;
281         bool empty;
282
283         empty = llist_add(&req->lentry, &queue->req_list) &&
284                 list_empty(&queue->send_list) && !queue->request;
285
286          /*
287           * If we are the first on the send_list, try to send directly;
288           * otherwise queue io_work. Also, only do that if we are on the
289           * same cpu, so we don't introduce contention.
290           */
291         if (queue->io_cpu == raw_smp_processor_id() &&
292             sync && empty && mutex_trylock(&queue->send_mutex)) {
293                 queue->more_requests = !last;
294                 nvme_tcp_send_all(queue);
295                 queue->more_requests = false;
296                 mutex_unlock(&queue->send_mutex);
297         } else if (last) {
298                 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
299         }
300 }
301
302 static void nvme_tcp_process_req_list(struct nvme_tcp_queue *queue)
303 {
304         struct nvme_tcp_request *req;
305         struct llist_node *node;
306
307         for (node = llist_del_all(&queue->req_list); node; node = node->next) {
308                 req = llist_entry(node, struct nvme_tcp_request, lentry);
309                 list_add(&req->entry, &queue->send_list);
310         }
311 }
312
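/*
 * Pick the next request to transmit: take the head of send_list, and
 * when it runs dry splice over everything queued on the lockless
 * req_list first.
 */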
313 static inline struct nvme_tcp_request *
314 nvme_tcp_fetch_request(struct nvme_tcp_queue *queue)
315 {
316         struct nvme_tcp_request *req;
317
318         req = list_first_entry_or_null(&queue->send_list,
319                         struct nvme_tcp_request, entry);
320         if (!req) {
321                 nvme_tcp_process_req_list(queue);
322                 req = list_first_entry_or_null(&queue->send_list,
323                                 struct nvme_tcp_request, entry);
324                 if (unlikely(!req))
325                         return NULL;
326         }
327
328         list_del(&req->entry);
329         return req;
330 }
331
332 static inline void nvme_tcp_ddgst_final(struct ahash_request *hash,
333                 __le32 *dgst)
334 {
335         ahash_request_set_crypt(hash, NULL, (u8 *)dgst, 0);
336         crypto_ahash_final(hash);
337 }
338
339 static inline void nvme_tcp_ddgst_update(struct ahash_request *hash,
340                 struct page *page, off_t off, size_t len)
341 {
342         struct scatterlist sg;
343
344         sg_init_marker(&sg, 1);
345         sg_set_page(&sg, page, len, off);
346         ahash_request_set_crypt(hash, &sg, NULL, len);
347         crypto_ahash_update(hash);
348 }
349
350 static inline void nvme_tcp_hdgst(struct ahash_request *hash,
351                 void *pdu, size_t len)
352 {
353         struct scatterlist sg;
354
355         sg_init_one(&sg, pdu, len);
356         ahash_request_set_crypt(hash, &sg, pdu + len, len);
357         crypto_ahash_digest(hash);
358 }
359
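/*
 * With header digests enabled, require the HDGST flag and recompute the
 * digest over the received header, comparing it against the digest the
 * controller sent right after the header.
 */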
360 static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue,
361                 void *pdu, size_t pdu_len)
362 {
363         struct nvme_tcp_hdr *hdr = pdu;
364         __le32 recv_digest;
365         __le32 exp_digest;
366
367         if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) {
368                 dev_err(queue->ctrl->ctrl.device,
369                         "queue %d: header digest flag is cleared\n",
370                         nvme_tcp_queue_id(queue));
371                 return -EPROTO;
372         }
373
374         recv_digest = *(__le32 *)(pdu + hdr->hlen);
375         nvme_tcp_hdgst(queue->rcv_hash, pdu, pdu_len);
376         exp_digest = *(__le32 *)(pdu + hdr->hlen);
377         if (recv_digest != exp_digest) {
378                 dev_err(queue->ctrl->ctrl.device,
379                         "header digest error: recv %#x expected %#x\n",
380                         le32_to_cpu(recv_digest), le32_to_cpu(exp_digest));
381                 return -EIO;
382         }
383
384         return 0;
385 }
386
387 static int nvme_tcp_check_ddgst(struct nvme_tcp_queue *queue, void *pdu)
388 {
389         struct nvme_tcp_hdr *hdr = pdu;
390         u8 digest_len = nvme_tcp_hdgst_len(queue);
391         u32 len;
392
393         len = le32_to_cpu(hdr->plen) - hdr->hlen -
394                 ((hdr->flags & NVME_TCP_F_HDGST) ? digest_len : 0);
395
396         if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) {
397                 dev_err(queue->ctrl->ctrl.device,
398                         "queue %d: data digest flag is cleared\n",
399                 nvme_tcp_queue_id(queue));
400                 return -EPROTO;
401         }
402         crypto_ahash_init(queue->rcv_hash);
403
404         return 0;
405 }
406
407 static void nvme_tcp_exit_request(struct blk_mq_tag_set *set,
408                 struct request *rq, unsigned int hctx_idx)
409 {
410         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
411
412         page_frag_free(req->pdu);
413 }
414
415 static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
416                 struct request *rq, unsigned int hctx_idx,
417                 unsigned int numa_node)
418 {
419         struct nvme_tcp_ctrl *ctrl = set->driver_data;
420         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
421         struct nvme_tcp_cmd_pdu *pdu;
422         int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
423         struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx];
424         u8 hdgst = nvme_tcp_hdgst_len(queue);
425
426         req->pdu = page_frag_alloc(&queue->pf_cache,
427                 sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
428                 GFP_KERNEL | __GFP_ZERO);
429         if (!req->pdu)
430                 return -ENOMEM;
431
432         pdu = req->pdu;
433         req->queue = queue;
434         nvme_req(rq)->ctrl = &ctrl->ctrl;
435         nvme_req(rq)->cmd = &pdu->cmd;
436
437         return 0;
438 }
439
440 static int nvme_tcp_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
441                 unsigned int hctx_idx)
442 {
443         struct nvme_tcp_ctrl *ctrl = data;
444         struct nvme_tcp_queue *queue = &ctrl->queues[hctx_idx + 1];
445
446         hctx->driver_data = queue;
447         return 0;
448 }
449
450 static int nvme_tcp_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
451                 unsigned int hctx_idx)
452 {
453         struct nvme_tcp_ctrl *ctrl = data;
454         struct nvme_tcp_queue *queue = &ctrl->queues[0];
455
456         hctx->driver_data = queue;
457         return 0;
458 }
459
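/*
 * The receive side is a small state machine driven by the remaining
 * byte counters: first the PDU header, then (for C2H data) the payload,
 * and finally the data digest when one is expected.
 */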
460 static enum nvme_tcp_recv_state
461 nvme_tcp_recv_state(struct nvme_tcp_queue *queue)
462 {
463         return  (queue->pdu_remaining) ? NVME_TCP_RECV_PDU :
464                 (queue->ddgst_remaining) ? NVME_TCP_RECV_DDGST :
465                 NVME_TCP_RECV_DATA;
466 }
467
468 static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue)
469 {
470         queue->pdu_remaining = sizeof(struct nvme_tcp_rsp_pdu) +
471                                 nvme_tcp_hdgst_len(queue);
472         queue->pdu_offset = 0;
473         queue->data_remaining = -1;
474         queue->ddgst_remaining = 0;
475 }
476
477 static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
478 {
479         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
480                 return;
481
482         dev_warn(ctrl->device, "starting error recovery\n");
483         queue_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work);
484 }
485
486 static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
487                 struct nvme_completion *cqe)
488 {
489         struct request *rq;
490
491         rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), cqe->command_id);
492         if (!rq) {
493                 dev_err(queue->ctrl->ctrl.device,
494                         "queue %d tag 0x%x not found\n",
495                         nvme_tcp_queue_id(queue), cqe->command_id);
496                 nvme_tcp_error_recovery(&queue->ctrl->ctrl);
497                 return -EINVAL;
498         }
499
500         if (!nvme_try_complete_req(rq, cqe->status, cqe->result))
501                 nvme_complete_rq(rq);
502         queue->nr_cqe++;
503
504         return 0;
505 }
506
507 static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue,
508                 struct nvme_tcp_data_pdu *pdu)
509 {
510         struct request *rq;
511
512         rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
513         if (!rq) {
514                 dev_err(queue->ctrl->ctrl.device,
515                         "queue %d tag %#x not found\n",
516                         nvme_tcp_queue_id(queue), pdu->command_id);
517                 return -ENOENT;
518         }
519
520         if (!blk_rq_payload_bytes(rq)) {
521                 dev_err(queue->ctrl->ctrl.device,
522                         "queue %d tag %#x unexpected data\n",
523                         nvme_tcp_queue_id(queue), rq->tag);
524                 return -EIO;
525         }
526
527         queue->data_remaining = le32_to_cpu(pdu->data_length);
528
529         if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS &&
530             unlikely(!(pdu->hdr.flags & NVME_TCP_F_DATA_LAST))) {
531                 dev_err(queue->ctrl->ctrl.device,
532                         "queue %d tag %#x SUCCESS set but not last PDU\n",
533                         nvme_tcp_queue_id(queue), rq->tag);
534                 nvme_tcp_error_recovery(&queue->ctrl->ctrl);
535                 return -EPROTO;
536         }
537
538         return 0;
539 }
540
541 static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue,
542                 struct nvme_tcp_rsp_pdu *pdu)
543 {
544         struct nvme_completion *cqe = &pdu->cqe;
545         int ret = 0;
546
547         /*
548          * AEN requests are special as they don't time out and can
549          * survive any kind of queue freeze and often don't respond to
550          * aborts.  We don't even bother to allocate a struct request
551          * for them but rather special case them here.
552          */
553         if (unlikely(nvme_is_aen_req(nvme_tcp_queue_id(queue),
554                                      cqe->command_id)))
555                 nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
556                                 &cqe->result);
557         else
558                 ret = nvme_tcp_process_nvme_cqe(queue, cqe);
559
560         return ret;
561 }
562
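/*
 * Validate an incoming R2T against what has already been sent for this
 * request and prepare the H2C data PDU header (ttag, offset, length,
 * digest flags) for the data transfer it solicits.
 */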
563 static int nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req,
564                 struct nvme_tcp_r2t_pdu *pdu)
565 {
566         struct nvme_tcp_data_pdu *data = req->pdu;
567         struct nvme_tcp_queue *queue = req->queue;
568         struct request *rq = blk_mq_rq_from_pdu(req);
569         u8 hdgst = nvme_tcp_hdgst_len(queue);
570         u8 ddgst = nvme_tcp_ddgst_len(queue);
571
572         req->pdu_len = le32_to_cpu(pdu->r2t_length);
573         req->pdu_sent = 0;
574
575         if (unlikely(!req->pdu_len)) {
576                 dev_err(queue->ctrl->ctrl.device,
577                         "req %d r2t len is %u, probably a bug...\n",
578                         rq->tag, req->pdu_len);
579                 return -EPROTO;
580         }
581
582         if (unlikely(req->data_sent + req->pdu_len > req->data_len)) {
583                 dev_err(queue->ctrl->ctrl.device,
584                         "req %d r2t len %u exceeded data len %u (%zu sent)\n",
585                         rq->tag, req->pdu_len, req->data_len,
586                         req->data_sent);
587                 return -EPROTO;
588         }
589
590         if (unlikely(le32_to_cpu(pdu->r2t_offset) < req->data_sent)) {
591                 dev_err(queue->ctrl->ctrl.device,
592                         "req %d unexpected r2t offset %u (expected %zu)\n",
593                         rq->tag, le32_to_cpu(pdu->r2t_offset),
594                         req->data_sent);
595                 return -EPROTO;
596         }
597
598         memset(data, 0, sizeof(*data));
599         data->hdr.type = nvme_tcp_h2c_data;
600         data->hdr.flags = NVME_TCP_F_DATA_LAST;
601         if (queue->hdr_digest)
602                 data->hdr.flags |= NVME_TCP_F_HDGST;
603         if (queue->data_digest)
604                 data->hdr.flags |= NVME_TCP_F_DDGST;
605         data->hdr.hlen = sizeof(*data);
606         data->hdr.pdo = data->hdr.hlen + hdgst;
607         data->hdr.plen =
608                 cpu_to_le32(data->hdr.hlen + hdgst + req->pdu_len + ddgst);
609         data->ttag = pdu->ttag;
610         data->command_id = rq->tag;
611         data->data_offset = cpu_to_le32(req->data_sent);
612         data->data_length = cpu_to_le32(req->pdu_len);
613         return 0;
614 }
615
616 static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
617                 struct nvme_tcp_r2t_pdu *pdu)
618 {
619         struct nvme_tcp_request *req;
620         struct request *rq;
621         int ret;
622
623         rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
624         if (!rq) {
625                 dev_err(queue->ctrl->ctrl.device,
626                         "queue %d tag %#x not found\n",
627                         nvme_tcp_queue_id(queue), pdu->command_id);
628                 return -ENOENT;
629         }
630         req = blk_mq_rq_to_pdu(rq);
631
632         ret = nvme_tcp_setup_h2c_data_pdu(req, pdu);
633         if (unlikely(ret))
634                 return ret;
635
636         req->state = NVME_TCP_SEND_H2C_PDU;
637         req->offset = 0;
638
639         nvme_tcp_queue_request(req, false, true);
640
641         return 0;
642 }
643
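/*
 * Reassemble a PDU header from the skb (it may arrive in pieces), verify
 * the header digest and prime the data digest if enabled, then dispatch
 * on the PDU type.
 */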
644 static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
645                 unsigned int *offset, size_t *len)
646 {
647         struct nvme_tcp_hdr *hdr;
648         char *pdu = queue->pdu;
649         size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
650         int ret;
651
652         ret = skb_copy_bits(skb, *offset,
653                 &pdu[queue->pdu_offset], rcv_len);
654         if (unlikely(ret))
655                 return ret;
656
657         queue->pdu_remaining -= rcv_len;
658         queue->pdu_offset += rcv_len;
659         *offset += rcv_len;
660         *len -= rcv_len;
661         if (queue->pdu_remaining)
662                 return 0;
663
664         hdr = queue->pdu;
665         if (queue->hdr_digest) {
666                 ret = nvme_tcp_verify_hdgst(queue, queue->pdu, hdr->hlen);
667                 if (unlikely(ret))
668                         return ret;
669         }
670
671
672         if (queue->data_digest) {
673                 ret = nvme_tcp_check_ddgst(queue, queue->pdu);
674                 if (unlikely(ret))
675                         return ret;
676         }
677
678         switch (hdr->type) {
679         case nvme_tcp_c2h_data:
680                 return nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu);
681         case nvme_tcp_rsp:
682                 nvme_tcp_init_recv_ctx(queue);
683                 return nvme_tcp_handle_comp(queue, (void *)queue->pdu);
684         case nvme_tcp_r2t:
685                 nvme_tcp_init_recv_ctx(queue);
686                 return nvme_tcp_handle_r2t(queue, (void *)queue->pdu);
687         default:
688                 dev_err(queue->ctrl->ctrl.device,
689                         "unsupported pdu type (%d)\n", hdr->type);
690                 return -EINVAL;
691         }
692 }
693
694 static inline void nvme_tcp_end_request(struct request *rq, u16 status)
695 {
696         union nvme_result res = {};
697
698         if (!nvme_try_complete_req(rq, cpu_to_le16(status << 1), res))
699                 nvme_complete_rq(rq);
700 }
701
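/*
 * Copy C2H data from the skb into the request's bio pages, moving on to
 * the next bio when the current iterator is exhausted and feeding the
 * data digest as we go when it is enabled.
 */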
702 static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
703                               unsigned int *offset, size_t *len)
704 {
705         struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
706         struct nvme_tcp_request *req;
707         struct request *rq;
708
709         rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
710         if (!rq) {
711                 dev_err(queue->ctrl->ctrl.device,
712                         "queue %d tag %#x not found\n",
713                         nvme_tcp_queue_id(queue), pdu->command_id);
714                 return -ENOENT;
715         }
716         req = blk_mq_rq_to_pdu(rq);
717
718         while (true) {
719                 int recv_len, ret;
720
721                 recv_len = min_t(size_t, *len, queue->data_remaining);
722                 if (!recv_len)
723                         break;
724
725                 if (!iov_iter_count(&req->iter)) {
726                         req->curr_bio = req->curr_bio->bi_next;
727
728                         /*
729                           * If we don't have any bios it means that the controller
730                           * sent more data than we requested, hence error
731                          */
732                         if (!req->curr_bio) {
733                                 dev_err(queue->ctrl->ctrl.device,
734                                         "queue %d no space in request %#x",
735                                         nvme_tcp_queue_id(queue), rq->tag);
736                                 nvme_tcp_init_recv_ctx(queue);
737                                 return -EIO;
738                         }
739                         nvme_tcp_init_iter(req, READ);
740                 }
741
742                 /* we can read only from what is left in this bio */
743                 recv_len = min_t(size_t, recv_len,
744                                 iov_iter_count(&req->iter));
745
746                 if (queue->data_digest)
747                         ret = skb_copy_and_hash_datagram_iter(skb, *offset,
748                                 &req->iter, recv_len, queue->rcv_hash);
749                 else
750                         ret = skb_copy_datagram_iter(skb, *offset,
751                                         &req->iter, recv_len);
752                 if (ret) {
753                         dev_err(queue->ctrl->ctrl.device,
754                                 "queue %d failed to copy request %#x data",
755                                 nvme_tcp_queue_id(queue), rq->tag);
756                         return ret;
757                 }
758
759                 *len -= recv_len;
760                 *offset += recv_len;
761                 queue->data_remaining -= recv_len;
762         }
763
764         if (!queue->data_remaining) {
765                 if (queue->data_digest) {
766                         nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst);
767                         queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH;
768                 } else {
769                         if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
770                                 nvme_tcp_end_request(rq, NVME_SC_SUCCESS);
771                                 queue->nr_cqe++;
772                         }
773                         nvme_tcp_init_recv_ctx(queue);
774                 }
775         }
776
777         return 0;
778 }
779
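/*
 * Collect the trailing data digest and compare it with the digest that
 * was computed while receiving the data; complete the request here if
 * the controller indicated SUCCESS in the data PDU.
 */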
780 static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
781                 struct sk_buff *skb, unsigned int *offset, size_t *len)
782 {
783         struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
784         char *ddgst = (char *)&queue->recv_ddgst;
785         size_t recv_len = min_t(size_t, *len, queue->ddgst_remaining);
786         off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
787         int ret;
788
789         ret = skb_copy_bits(skb, *offset, &ddgst[off], recv_len);
790         if (unlikely(ret))
791                 return ret;
792
793         queue->ddgst_remaining -= recv_len;
794         *offset += recv_len;
795         *len -= recv_len;
796         if (queue->ddgst_remaining)
797                 return 0;
798
799         if (queue->recv_ddgst != queue->exp_ddgst) {
800                 dev_err(queue->ctrl->ctrl.device,
801                         "data digest error: recv %#x expected %#x\n",
802                         le32_to_cpu(queue->recv_ddgst),
803                         le32_to_cpu(queue->exp_ddgst));
804                 return -EIO;
805         }
806
807         if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
808                 struct request *rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue),
809                                                 pdu->command_id);
810
811                 nvme_tcp_end_request(rq, NVME_SC_SUCCESS);
812                 queue->nr_cqe++;
813         }
814
815         nvme_tcp_init_recv_ctx(queue);
816         return 0;
817 }
818
819 static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
820                              unsigned int offset, size_t len)
821 {
822         struct nvme_tcp_queue *queue = desc->arg.data;
823         size_t consumed = len;
824         int result;
825
826         while (len) {
827                 switch (nvme_tcp_recv_state(queue)) {
828                 case NVME_TCP_RECV_PDU:
829                         result = nvme_tcp_recv_pdu(queue, skb, &offset, &len);
830                         break;
831                 case NVME_TCP_RECV_DATA:
832                         result = nvme_tcp_recv_data(queue, skb, &offset, &len);
833                         break;
834                 case NVME_TCP_RECV_DDGST:
835                         result = nvme_tcp_recv_ddgst(queue, skb, &offset, &len);
836                         break;
837                 default:
838                         result = -EFAULT;
839                 }
840                 if (result) {
841                         dev_err(queue->ctrl->ctrl.device,
842                                 "receive failed: %d\n", result);
843                         queue->rd_enabled = false;
844                         nvme_tcp_error_recovery(&queue->ctrl->ctrl);
845                         return result;
846                 }
847         }
848
849         return consumed;
850 }
851
852 static void nvme_tcp_data_ready(struct sock *sk)
853 {
854         struct nvme_tcp_queue *queue;
855
856         read_lock_bh(&sk->sk_callback_lock);
857         queue = sk->sk_user_data;
858         if (likely(queue && queue->rd_enabled) &&
859             !test_bit(NVME_TCP_Q_POLLING, &queue->flags))
860                 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
861         read_unlock_bh(&sk->sk_callback_lock);
862 }
863
864 static void nvme_tcp_write_space(struct sock *sk)
865 {
866         struct nvme_tcp_queue *queue;
867
868         read_lock_bh(&sk->sk_callback_lock);
869         queue = sk->sk_user_data;
870         if (likely(queue && sk_stream_is_writeable(sk))) {
871                 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
872                 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
873         }
874         read_unlock_bh(&sk->sk_callback_lock);
875 }
876
877 static void nvme_tcp_state_change(struct sock *sk)
878 {
879         struct nvme_tcp_queue *queue;
880
881         read_lock_bh(&sk->sk_callback_lock);
882         queue = sk->sk_user_data;
883         if (!queue)
884                 goto done;
885
886         switch (sk->sk_state) {
887         case TCP_CLOSE:
888         case TCP_CLOSE_WAIT:
889         case TCP_LAST_ACK:
890         case TCP_FIN_WAIT1:
891         case TCP_FIN_WAIT2:
892                 nvme_tcp_error_recovery(&queue->ctrl->ctrl);
893                 break;
894         default:
895                 dev_info(queue->ctrl->ctrl.device,
896                         "queue %d socket state %d\n",
897                         nvme_tcp_queue_id(queue), sk->sk_state);
898         }
899
900         queue->state_change(sk);
901 done:
902         read_unlock_bh(&sk->sk_callback_lock);
903 }
904
905 static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue)
906 {
907         return !list_empty(&queue->send_list) ||
908                 !llist_empty(&queue->req_list) || queue->more_requests;
909 }
910
911 static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue)
912 {
913         queue->request = NULL;
914 }
915
916 static void nvme_tcp_fail_request(struct nvme_tcp_request *req)
917 {
918         nvme_tcp_end_request(blk_mq_rq_from_pdu(req), NVME_SC_HOST_PATH_ERROR);
919 }
920
921 static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
922 {
923         struct nvme_tcp_queue *queue = req->queue;
924
925         while (true) {
926                 struct page *page = nvme_tcp_req_cur_page(req);
927                 size_t offset = nvme_tcp_req_cur_offset(req);
928                 size_t len = nvme_tcp_req_cur_length(req);
929                 bool last = nvme_tcp_pdu_last_send(req, len);
930                 int ret, flags = MSG_DONTWAIT;
931
932                 if (last && !queue->data_digest && !nvme_tcp_queue_more(queue))
933                         flags |= MSG_EOR;
934                 else
935                         flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
936
937                 if (sendpage_ok(page)) {
938                         ret = kernel_sendpage(queue->sock, page, offset, len,
939                                         flags);
940                 } else {
941                         ret = sock_no_sendpage(queue->sock, page, offset, len,
942                                         flags);
943                 }
944                 if (ret <= 0)
945                         return ret;
946
947                 if (queue->data_digest)
948                         nvme_tcp_ddgst_update(queue->snd_hash, page,
949                                         offset, ret);
950
951                 /* fully successful last write */
952                 if (last && ret == len) {
953                         if (queue->data_digest) {
954                                 nvme_tcp_ddgst_final(queue->snd_hash,
955                                         &req->ddgst);
956                                 req->state = NVME_TCP_SEND_DDGST;
957                                 req->offset = 0;
958                         } else {
959                                 nvme_tcp_done_send_req(queue);
960                         }
961                         return 1;
962                 }
963                 nvme_tcp_advance_req(req, ret);
964         }
965         return -EAGAIN;
966 }
967
968 static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
969 {
970         struct nvme_tcp_queue *queue = req->queue;
971         struct nvme_tcp_cmd_pdu *pdu = req->pdu;
972         bool inline_data = nvme_tcp_has_inline_data(req);
973         u8 hdgst = nvme_tcp_hdgst_len(queue);
974         int len = sizeof(*pdu) + hdgst - req->offset;
975         int flags = MSG_DONTWAIT;
976         int ret;
977
978         if (inline_data || nvme_tcp_queue_more(queue))
979                 flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
980         else
981                 flags |= MSG_EOR;
982
983         if (queue->hdr_digest && !req->offset)
984                 nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
985
986         ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
987                         offset_in_page(pdu) + req->offset, len,  flags);
988         if (unlikely(ret <= 0))
989                 return ret;
990
991         len -= ret;
992         if (!len) {
993                 if (inline_data) {
994                         req->state = NVME_TCP_SEND_DATA;
995                         if (queue->data_digest)
996                                 crypto_ahash_init(queue->snd_hash);
997                 } else {
998                         nvme_tcp_done_send_req(queue);
999                 }
1000                 return 1;
1001         }
1002         req->offset += ret;
1003
1004         return -EAGAIN;
1005 }
1006
1007 static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
1008 {
1009         struct nvme_tcp_queue *queue = req->queue;
1010         struct nvme_tcp_data_pdu *pdu = req->pdu;
1011         u8 hdgst = nvme_tcp_hdgst_len(queue);
1012         int len = sizeof(*pdu) - req->offset + hdgst;
1013         int ret;
1014
1015         if (queue->hdr_digest && !req->offset)
1016                 nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
1017
1018         ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
1019                         offset_in_page(pdu) + req->offset, len,
1020                         MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST);
1021         if (unlikely(ret <= 0))
1022                 return ret;
1023
1024         len -= ret;
1025         if (!len) {
1026                 req->state = NVME_TCP_SEND_DATA;
1027                 if (queue->data_digest)
1028                         crypto_ahash_init(queue->snd_hash);
1029                 return 1;
1030         }
1031         req->offset += ret;
1032
1033         return -EAGAIN;
1034 }
1035
1036 static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req)
1037 {
1038         struct nvme_tcp_queue *queue = req->queue;
1039         int ret;
1040         struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
1041         struct kvec iov = {
1042                 .iov_base = &req->ddgst + req->offset,
1043                 .iov_len = NVME_TCP_DIGEST_LENGTH - req->offset
1044         };
1045
1046         if (nvme_tcp_queue_more(queue))
1047                 msg.msg_flags |= MSG_MORE;
1048         else
1049                 msg.msg_flags |= MSG_EOR;
1050
1051         ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
1052         if (unlikely(ret <= 0))
1053                 return ret;
1054
1055         if (req->offset + ret == NVME_TCP_DIGEST_LENGTH) {
1056                 nvme_tcp_done_send_req(queue);
1057                 return 1;
1058         }
1059
1060         req->offset += ret;
1061         return -EAGAIN;
1062 }
1063
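/*
 * Drive the current request through the send state machine (command PDU,
 * optional H2C data PDU, data, data digest). Returns >0 when progress was
 * made, 0 when there is nothing to send or the socket is full, and a
 * negative error on a hard failure.
 */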
1064 static int nvme_tcp_try_send(struct nvme_tcp_queue *queue)
1065 {
1066         struct nvme_tcp_request *req;
1067         int ret = 1;
1068
1069         if (!queue->request) {
1070                 queue->request = nvme_tcp_fetch_request(queue);
1071                 if (!queue->request)
1072                         return 0;
1073         }
1074         req = queue->request;
1075
1076         if (req->state == NVME_TCP_SEND_CMD_PDU) {
1077                 ret = nvme_tcp_try_send_cmd_pdu(req);
1078                 if (ret <= 0)
1079                         goto done;
1080                 if (!nvme_tcp_has_inline_data(req))
1081                         return ret;
1082         }
1083
1084         if (req->state == NVME_TCP_SEND_H2C_PDU) {
1085                 ret = nvme_tcp_try_send_data_pdu(req);
1086                 if (ret <= 0)
1087                         goto done;
1088         }
1089
1090         if (req->state == NVME_TCP_SEND_DATA) {
1091                 ret = nvme_tcp_try_send_data(req);
1092                 if (ret <= 0)
1093                         goto done;
1094         }
1095
1096         if (req->state == NVME_TCP_SEND_DDGST)
1097                 ret = nvme_tcp_try_send_ddgst(req);
1098 done:
1099         if (ret == -EAGAIN) {
1100                 ret = 0;
1101         } else if (ret < 0) {
1102                 dev_err(queue->ctrl->ctrl.device,
1103                         "failed to send request %d\n", ret);
1104                 if (ret != -EPIPE && ret != -ECONNRESET)
1105                         nvme_tcp_fail_request(queue->request);
1106                 nvme_tcp_done_send_req(queue);
1107         }
1108         return ret;
1109 }
1110
1111 static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
1112 {
1113         struct socket *sock = queue->sock;
1114         struct sock *sk = sock->sk;
1115         read_descriptor_t rd_desc;
1116         int consumed;
1117
1118         rd_desc.arg.data = queue;
1119         rd_desc.count = 1;
1120         lock_sock(sk);
1121         queue->nr_cqe = 0;
1122         consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
1123         release_sock(sk);
1124         return consumed;
1125 }
1126
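/*
 * Per-queue worker: alternate between sending and receiving for roughly
 * one millisecond, then requeue ourselves if there is still work pending.
 */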
1127 static void nvme_tcp_io_work(struct work_struct *w)
1128 {
1129         struct nvme_tcp_queue *queue =
1130                 container_of(w, struct nvme_tcp_queue, io_work);
1131         unsigned long deadline = jiffies + msecs_to_jiffies(1);
1132
1133         do {
1134                 bool pending = false;
1135                 int result;
1136
1137                 if (mutex_trylock(&queue->send_mutex)) {
1138                         result = nvme_tcp_try_send(queue);
1139                         mutex_unlock(&queue->send_mutex);
1140                         if (result > 0)
1141                                 pending = true;
1142                         else if (unlikely(result < 0))
1143                                 break;
1144                 } else
1145                         pending = !llist_empty(&queue->req_list);
1146
1147                 result = nvme_tcp_try_recv(queue);
1148                 if (result > 0)
1149                         pending = true;
1150                 else if (unlikely(result < 0))
1151                         return;
1152
1153                 if (!pending)
1154                         return;
1155
1156         } while (!time_after(jiffies, deadline)); /* quota is exhausted */
1157
1158         queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
1159 }
1160
1161 static void nvme_tcp_free_crypto(struct nvme_tcp_queue *queue)
1162 {
1163         struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);
1164
1165         ahash_request_free(queue->rcv_hash);
1166         ahash_request_free(queue->snd_hash);
1167         crypto_free_ahash(tfm);
1168 }
1169
1170 static int nvme_tcp_alloc_crypto(struct nvme_tcp_queue *queue)
1171 {
1172         struct crypto_ahash *tfm;
1173
1174         tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
1175         if (IS_ERR(tfm))
1176                 return PTR_ERR(tfm);
1177
1178         queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
1179         if (!queue->snd_hash)
1180                 goto free_tfm;
1181         ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);
1182
1183         queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
1184         if (!queue->rcv_hash)
1185                 goto free_snd_hash;
1186         ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);
1187
1188         return 0;
1189 free_snd_hash:
1190         ahash_request_free(queue->snd_hash);
1191 free_tfm:
1192         crypto_free_ahash(tfm);
1193         return -ENOMEM;
1194 }
1195
1196 static void nvme_tcp_free_async_req(struct nvme_tcp_ctrl *ctrl)
1197 {
1198         struct nvme_tcp_request *async = &ctrl->async_req;
1199
1200         page_frag_free(async->pdu);
1201 }
1202
1203 static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
1204 {
1205         struct nvme_tcp_queue *queue = &ctrl->queues[0];
1206         struct nvme_tcp_request *async = &ctrl->async_req;
1207         u8 hdgst = nvme_tcp_hdgst_len(queue);
1208
1209         async->pdu = page_frag_alloc(&queue->pf_cache,
1210                 sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
1211                 GFP_KERNEL | __GFP_ZERO);
1212         if (!async->pdu)
1213                 return -ENOMEM;
1214
1215         async->queue = &ctrl->queues[0];
1216         return 0;
1217 }
1218
1219 static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
1220 {
1221         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1222         struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1223
1224         if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
1225                 return;
1226
1227         if (queue->hdr_digest || queue->data_digest)
1228                 nvme_tcp_free_crypto(queue);
1229
1230         sock_release(queue->sock);
1231         kfree(queue->pdu);
1232         mutex_destroy(&queue->queue_lock);
1233 }
1234
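/*
 * Perform the NVMe/TCP initialization handshake: send an ICReq, receive
 * the ICResp and verify that the PDU format version, digest settings and
 * CPDA match what the host asked for.
 */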
1235 static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
1236 {
1237         struct nvme_tcp_icreq_pdu *icreq;
1238         struct nvme_tcp_icresp_pdu *icresp;
1239         struct msghdr msg = {};
1240         struct kvec iov;
1241         bool ctrl_hdgst, ctrl_ddgst;
1242         int ret;
1243
1244         icreq = kzalloc(sizeof(*icreq), GFP_KERNEL);
1245         if (!icreq)
1246                 return -ENOMEM;
1247
1248         icresp = kzalloc(sizeof(*icresp), GFP_KERNEL);
1249         if (!icresp) {
1250                 ret = -ENOMEM;
1251                 goto free_icreq;
1252         }
1253
1254         icreq->hdr.type = nvme_tcp_icreq;
1255         icreq->hdr.hlen = sizeof(*icreq);
1256         icreq->hdr.pdo = 0;
1257         icreq->hdr.plen = cpu_to_le32(icreq->hdr.hlen);
1258         icreq->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
1259         icreq->maxr2t = 0; /* single inflight r2t supported */
1260         icreq->hpda = 0; /* no alignment constraint */
1261         if (queue->hdr_digest)
1262                 icreq->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
1263         if (queue->data_digest)
1264                 icreq->digest |= NVME_TCP_DATA_DIGEST_ENABLE;
1265
1266         iov.iov_base = icreq;
1267         iov.iov_len = sizeof(*icreq);
1268         ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
1269         if (ret < 0)
1270                 goto free_icresp;
1271
1272         memset(&msg, 0, sizeof(msg));
1273         iov.iov_base = icresp;
1274         iov.iov_len = sizeof(*icresp);
1275         ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
1276                         iov.iov_len, msg.msg_flags);
1277         if (ret < 0)
1278                 goto free_icresp;
1279
1280         ret = -EINVAL;
1281         if (icresp->hdr.type != nvme_tcp_icresp) {
1282                 pr_err("queue %d: bad type returned %d\n",
1283                         nvme_tcp_queue_id(queue), icresp->hdr.type);
1284                 goto free_icresp;
1285         }
1286
1287         if (le32_to_cpu(icresp->hdr.plen) != sizeof(*icresp)) {
1288                 pr_err("queue %d: bad pdu length returned %d\n",
1289                         nvme_tcp_queue_id(queue), icresp->hdr.plen);
1290                 goto free_icresp;
1291         }
1292
1293         if (icresp->pfv != NVME_TCP_PFV_1_0) {
1294                 pr_err("queue %d: bad pfv returned %d\n",
1295                         nvme_tcp_queue_id(queue), icresp->pfv);
1296                 goto free_icresp;
1297         }
1298
1299         ctrl_ddgst = !!(icresp->digest & NVME_TCP_DATA_DIGEST_ENABLE);
1300         if ((queue->data_digest && !ctrl_ddgst) ||
1301             (!queue->data_digest && ctrl_ddgst)) {
1302                 pr_err("queue %d: data digest mismatch host: %s ctrl: %s\n",
1303                         nvme_tcp_queue_id(queue),
1304                         queue->data_digest ? "enabled" : "disabled",
1305                         ctrl_ddgst ? "enabled" : "disabled");
1306                 goto free_icresp;
1307         }
1308
1309         ctrl_hdgst = !!(icresp->digest & NVME_TCP_HDR_DIGEST_ENABLE);
1310         if ((queue->hdr_digest && !ctrl_hdgst) ||
1311             (!queue->hdr_digest && ctrl_hdgst)) {
1312                 pr_err("queue %d: header digest mismatch host: %s ctrl: %s\n",
1313                         nvme_tcp_queue_id(queue),
1314                         queue->hdr_digest ? "enabled" : "disabled",
1315                         ctrl_hdgst ? "enabled" : "disabled");
1316                 goto free_icresp;
1317         }
1318
1319         if (icresp->cpda != 0) {
1320                 pr_err("queue %d: unsupported cpda returned %d\n",
1321                         nvme_tcp_queue_id(queue), icresp->cpda);
1322                 goto free_icresp;
1323         }
1324
1325         ret = 0;
1326 free_icresp:
1327         kfree(icresp);
1328 free_icreq:
1329         kfree(icreq);
1330         return ret;
1331 }
1332
1333 static bool nvme_tcp_admin_queue(struct nvme_tcp_queue *queue)
1334 {
1335         return nvme_tcp_queue_id(queue) == 0;
1336 }
1337
1338 static bool nvme_tcp_default_queue(struct nvme_tcp_queue *queue)
1339 {
1340         struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1341         int qid = nvme_tcp_queue_id(queue);
1342
1343         return !nvme_tcp_admin_queue(queue) &&
1344                 qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT];
1345 }
1346
1347 static bool nvme_tcp_read_queue(struct nvme_tcp_queue *queue)
1348 {
1349         struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1350         int qid = nvme_tcp_queue_id(queue);
1351
1352         return !nvme_tcp_admin_queue(queue) &&
1353                 !nvme_tcp_default_queue(queue) &&
1354                 qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] +
1355                           ctrl->io_queues[HCTX_TYPE_READ];
1356 }
1357
1358 static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue)
1359 {
1360         struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1361         int qid = nvme_tcp_queue_id(queue);
1362
1363         return !nvme_tcp_admin_queue(queue) &&
1364                 !nvme_tcp_default_queue(queue) &&
1365                 !nvme_tcp_read_queue(queue) &&
1366                 qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] +
1367                           ctrl->io_queues[HCTX_TYPE_READ] +
1368                           ctrl->io_queues[HCTX_TYPE_POLL];
1369 }
1370
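/*
 * Choose the CPU this queue's io_work runs on: queues of each type
 * (default, read, poll) are spread round-robin across the online CPUs.
 */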
1371 static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
1372 {
1373         struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1374         int qid = nvme_tcp_queue_id(queue);
1375         int n = 0;
1376
1377         if (nvme_tcp_default_queue(queue))
1378                 n = qid - 1;
1379         else if (nvme_tcp_read_queue(queue))
1380                 n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - 1;
1381         else if (nvme_tcp_poll_queue(queue))
1382                 n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
1383                                 ctrl->io_queues[HCTX_TYPE_READ] - 1;
1384         queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
1385 }
1386
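/*
 * Allocate one queue: create and connect the TCP socket, apply the socket
 * options (syn count, nodelay, linger, priority, TOS), set up digest and
 * receive-PDU buffers, run the initialization handshake and install our
 * socket callbacks.
 */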
1387 static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
1388                 int qid, size_t queue_size)
1389 {
1390         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1391         struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1392         int ret, rcv_pdu_size;
1393
1394         mutex_init(&queue->queue_lock);
1395         queue->ctrl = ctrl;
1396         init_llist_head(&queue->req_list);
1397         INIT_LIST_HEAD(&queue->send_list);
1398         mutex_init(&queue->send_mutex);
1399         INIT_WORK(&queue->io_work, nvme_tcp_io_work);
1400         queue->queue_size = queue_size;
1401
1402         if (qid > 0)
1403                 queue->cmnd_capsule_len = nctrl->ioccsz * 16;
1404         else
1405                 queue->cmnd_capsule_len = sizeof(struct nvme_command) +
1406                                                 NVME_TCP_ADMIN_CCSZ;
1407
1408         ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM,
1409                         IPPROTO_TCP, &queue->sock);
1410         if (ret) {
1411                 dev_err(nctrl->device,
1412                         "failed to create socket: %d\n", ret);
1413                 goto err_destroy_mutex;
1414         }
1415
1416         /* Single syn retry */
1417         tcp_sock_set_syncnt(queue->sock->sk, 1);
1418
1419         /* Set TCP no delay */
1420         tcp_sock_set_nodelay(queue->sock->sk);
1421
1422         /*
1423          * Cleanup whatever is sitting in the TCP transmit queue on socket
1424          * close. This is done to prevent stale data from being sent should
1425          * the network connection be restored before TCP times out.
1426          */
1427         sock_no_linger(queue->sock->sk);
1428
1429         if (so_priority > 0)
1430                 sock_set_priority(queue->sock->sk, so_priority);
1431
1432         /* Set socket type of service */
1433         if (nctrl->opts->tos >= 0)
1434                 ip_sock_set_tos(queue->sock->sk, nctrl->opts->tos);
1435
1436         /* Set 10 seconds timeout for icresp recvmsg */
1437         queue->sock->sk->sk_rcvtimeo = 10 * HZ;
1438
1439         queue->sock->sk->sk_allocation = GFP_ATOMIC;
1440         nvme_tcp_set_queue_io_cpu(queue);
1441         queue->request = NULL;
1442         queue->data_remaining = 0;
1443         queue->ddgst_remaining = 0;
1444         queue->pdu_remaining = 0;
1445         queue->pdu_offset = 0;
1446         sk_set_memalloc(queue->sock->sk);
1447
1448         if (nctrl->opts->mask & NVMF_OPT_HOST_TRADDR) {
1449                 ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr,
1450                         sizeof(ctrl->src_addr));
1451                 if (ret) {
1452                         dev_err(nctrl->device,
1453                                 "failed to bind queue %d socket %d\n",
1454                                 qid, ret);
1455                         goto err_sock;
1456                 }
1457         }
1458
1459         if (nctrl->opts->mask & NVMF_OPT_HOST_IFACE) {
1460                 char *iface = nctrl->opts->host_iface;
1461                 sockptr_t optval = KERNEL_SOCKPTR(iface);
1462
1463                 ret = sock_setsockopt(queue->sock, SOL_SOCKET, SO_BINDTODEVICE,
1464                                       optval, strlen(iface));
1465                 if (ret) {
1466                         dev_err(nctrl->device,
1467                           "failed to bind to interface %s queue %d err %d\n",
1468                           iface, qid, ret);
1469                         goto err_sock;
1470                 }
1471         }
1472
1473         queue->hdr_digest = nctrl->opts->hdr_digest;
1474         queue->data_digest = nctrl->opts->data_digest;
1475         if (queue->hdr_digest || queue->data_digest) {
1476                 ret = nvme_tcp_alloc_crypto(queue);
1477                 if (ret) {
1478                         dev_err(nctrl->device,
1479                                 "failed to allocate queue %d crypto\n", qid);
1480                         goto err_sock;
1481                 }
1482         }
1483
1484         rcv_pdu_size = sizeof(struct nvme_tcp_rsp_pdu) +
1485                         nvme_tcp_hdgst_len(queue);
1486         queue->pdu = kmalloc(rcv_pdu_size, GFP_KERNEL);
1487         if (!queue->pdu) {
1488                 ret = -ENOMEM;
1489                 goto err_crypto;
1490         }
1491
1492         dev_dbg(nctrl->device, "connecting queue %d\n",
1493                         nvme_tcp_queue_id(queue));
1494
1495         ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr,
1496                 sizeof(ctrl->addr), 0);
1497         if (ret) {
1498                 dev_err(nctrl->device,
1499                         "failed to connect socket: %d\n", ret);
1500                 goto err_rcv_pdu;
1501         }
1502
1503         ret = nvme_tcp_init_connection(queue);
1504         if (ret)
1505                 goto err_init_connect;
1506
1507         queue->rd_enabled = true;
1508         set_bit(NVME_TCP_Q_ALLOCATED, &queue->flags);
1509         nvme_tcp_init_recv_ctx(queue);
1510
1511         write_lock_bh(&queue->sock->sk->sk_callback_lock);
1512         queue->sock->sk->sk_user_data = queue;
1513         queue->state_change = queue->sock->sk->sk_state_change;
1514         queue->data_ready = queue->sock->sk->sk_data_ready;
1515         queue->write_space = queue->sock->sk->sk_write_space;
1516         queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
1517         queue->sock->sk->sk_state_change = nvme_tcp_state_change;
1518         queue->sock->sk->sk_write_space = nvme_tcp_write_space;
1519 #ifdef CONFIG_NET_RX_BUSY_POLL
1520         queue->sock->sk->sk_ll_usec = 1;
1521 #endif
1522         write_unlock_bh(&queue->sock->sk->sk_callback_lock);
1523
1524         return 0;
1525
1526 err_init_connect:
1527         kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1528 err_rcv_pdu:
1529         kfree(queue->pdu);
1530 err_crypto:
1531         if (queue->hdr_digest || queue->data_digest)
1532                 nvme_tcp_free_crypto(queue);
1533 err_sock:
1534         sock_release(queue->sock);
1535         queue->sock = NULL;
1536 err_destroy_mutex:
1537         mutex_destroy(&queue->queue_lock);
1538         return ret;
1539 }
1540
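     /*
      * Detach the queue from its socket: restore the original socket
      * callbacks saved when the queue was set up and clear sk_user_data
      * so no further socket wakeups are routed into this queue.
      */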
1541 static void nvme_tcp_restore_sock_calls(struct nvme_tcp_queue *queue)
1542 {
1543         struct socket *sock = queue->sock;
1544
1545         write_lock_bh(&sock->sk->sk_callback_lock);
1546         sock->sk->sk_user_data  = NULL;
1547         sock->sk->sk_data_ready = queue->data_ready;
1548         sock->sk->sk_state_change = queue->state_change;
1549         sock->sk->sk_write_space  = queue->write_space;
1550         write_unlock_bh(&sock->sk->sk_callback_lock);
1551 }
1552
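     /*
      * Quiesce a queue: shut down the socket, restore its callbacks and
      * cancel any pending io_work.
      */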
1553 static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
1554 {
1555         kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1556         nvme_tcp_restore_sock_calls(queue);
1557         cancel_work_sync(&queue->io_work);
1558 }
1559
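     /*
      * Stop a queue only if it is still marked live; queue_lock
      * serializes this against concurrent teardown paths.
      */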
1560 static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
1561 {
1562         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1563         struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1564
1565         mutex_lock(&queue->queue_lock);
1566         if (test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags))
1567                 __nvme_tcp_stop_queue(queue);
1568         mutex_unlock(&queue->queue_lock);
1569 }
1570
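     /*
      * Send the fabrics connect for the admin (idx == 0) or an I/O queue
      * and mark the queue live on success.  On failure, tear the socket
      * back down if the queue was allocated.
      */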
1571 static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
1572 {
1573         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1574         int ret;
1575
1576         if (idx)
1577                 ret = nvmf_connect_io_queue(nctrl, idx);
1578         else
1579                 ret = nvmf_connect_admin_queue(nctrl);
1580
1581         if (!ret) {
1582                 set_bit(NVME_TCP_Q_LIVE, &ctrl->queues[idx].flags);
1583         } else {
1584                 if (test_bit(NVME_TCP_Q_ALLOCATED, &ctrl->queues[idx].flags))
1585                         __nvme_tcp_stop_queue(&ctrl->queues[idx]);
1586                 dev_err(nctrl->device,
1587                         "failed to connect queue: %d ret=%d\n", idx, ret);
1588         }
1589         return ret;
1590 }
1591
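     /*
      * Allocate the blk-mq tag set for either the admin queue or the I/O
      * queues.  Both sets use BLK_MQ_F_BLOCKING because queueing a
      * request may transmit directly on the socket, which can sleep.
      */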
1592 static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl,
1593                 bool admin)
1594 {
1595         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1596         struct blk_mq_tag_set *set;
1597         int ret;
1598
1599         if (admin) {
1600                 set = &ctrl->admin_tag_set;
1601                 memset(set, 0, sizeof(*set));
1602                 set->ops = &nvme_tcp_admin_mq_ops;
1603                 set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
1604                 set->reserved_tags = NVMF_RESERVED_TAGS;
1605                 set->numa_node = nctrl->numa_node;
1606                 set->flags = BLK_MQ_F_BLOCKING;
1607                 set->cmd_size = sizeof(struct nvme_tcp_request);
1608                 set->driver_data = ctrl;
1609                 set->nr_hw_queues = 1;
1610                 set->timeout = NVME_ADMIN_TIMEOUT;
1611         } else {
1612                 set = &ctrl->tag_set;
1613                 memset(set, 0, sizeof(*set));
1614                 set->ops = &nvme_tcp_mq_ops;
1615                 set->queue_depth = nctrl->sqsize + 1;
1616                 set->reserved_tags = NVMF_RESERVED_TAGS;
1617                 set->numa_node = nctrl->numa_node;
1618                 set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
1619                 set->cmd_size = sizeof(struct nvme_tcp_request);
1620                 set->driver_data = ctrl;
1621                 set->nr_hw_queues = nctrl->queue_count - 1;
1622                 set->timeout = NVME_IO_TIMEOUT;
1623                 set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
1624         }
1625
1626         ret = blk_mq_alloc_tag_set(set);
1627         if (ret)
1628                 return ERR_PTR(ret);
1629
1630         return set;
1631 }
1632
1633 static void nvme_tcp_free_admin_queue(struct nvme_ctrl *ctrl)
1634 {
1635         if (to_tcp_ctrl(ctrl)->async_req.pdu) {
1636                 cancel_work_sync(&ctrl->async_event_work);
1637                 nvme_tcp_free_async_req(to_tcp_ctrl(ctrl));
1638                 to_tcp_ctrl(ctrl)->async_req.pdu = NULL;
1639         }
1640
1641         nvme_tcp_free_queue(ctrl, 0);
1642 }
1643
1644 static void nvme_tcp_free_io_queues(struct nvme_ctrl *ctrl)
1645 {
1646         int i;
1647
1648         for (i = 1; i < ctrl->queue_count; i++)
1649                 nvme_tcp_free_queue(ctrl, i);
1650 }
1651
1652 static void nvme_tcp_stop_io_queues(struct nvme_ctrl *ctrl)
1653 {
1654         int i;
1655
1656         for (i = 1; i < ctrl->queue_count; i++)
1657                 nvme_tcp_stop_queue(ctrl, i);
1658 }
1659
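     /*
      * Connect all I/O queues; if any fails, stop the ones that were
      * already started and return the error.
      */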
1660 static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl)
1661 {
1662         int i, ret = 0;
1663
1664         for (i = 1; i < ctrl->queue_count; i++) {
1665                 ret = nvme_tcp_start_queue(ctrl, i);
1666                 if (ret)
1667                         goto out_stop_queues;
1668         }
1669
1670         return 0;
1671
1672 out_stop_queues:
1673         for (i--; i >= 1; i--)
1674                 nvme_tcp_stop_queue(ctrl, i);
1675         return ret;
1676 }
1677
1678 static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl)
1679 {
1680         int ret;
1681
1682         ret = nvme_tcp_alloc_queue(ctrl, 0, NVME_AQ_DEPTH);
1683         if (ret)
1684                 return ret;
1685
1686         ret = nvme_tcp_alloc_async_req(to_tcp_ctrl(ctrl));
1687         if (ret)
1688                 goto out_free_queue;
1689
1690         return 0;
1691
1692 out_free_queue:
1693         nvme_tcp_free_queue(ctrl, 0);
1694         return ret;
1695 }
1696
1697 static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
1698 {
1699         int i, ret;
1700
1701         for (i = 1; i < ctrl->queue_count; i++) {
1702                 ret = nvme_tcp_alloc_queue(ctrl, i,
1703                                 ctrl->sqsize + 1);
1704                 if (ret)
1705                         goto out_free_queues;
1706         }
1707
1708         return 0;
1709
1710 out_free_queues:
1711         for (i--; i >= 1; i--)
1712                 nvme_tcp_free_queue(ctrl, i);
1713
1714         return ret;
1715 }
1716
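     /*
      * Number of I/O queues to request from the controller: the sum of
      * the requested I/O, write and poll queue counts, each capped at
      * the number of online CPUs.
      */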
1717 static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl)
1718 {
1719         unsigned int nr_io_queues;
1720
1721         nr_io_queues = min(ctrl->opts->nr_io_queues, num_online_cpus());
1722         nr_io_queues += min(ctrl->opts->nr_write_queues, num_online_cpus());
1723         nr_io_queues += min(ctrl->opts->nr_poll_queues, num_online_cpus());
1724
1725         return nr_io_queues;
1726 }
1727
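     /*
      * Distribute the granted nr_io_queues across the HCTX types.  When
      * dedicated write queues were requested, read queues are satisfied
      * first, then default (write) queues, then poll queues.  For
      * example, with hypothetical options nr_io_queues=4,
      * nr_write_queues=2, nr_poll_queues=1 and all 7 queues granted,
      * this yields 4 read, 2 default and 1 poll queue.
      */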
1728 static void nvme_tcp_set_io_queues(struct nvme_ctrl *nctrl,
1729                 unsigned int nr_io_queues)
1730 {
1731         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1732         struct nvmf_ctrl_options *opts = nctrl->opts;
1733
1734         if (opts->nr_write_queues && opts->nr_io_queues < nr_io_queues) {
1735                 /*
1736                  * separate read/write queues
1737                  * hand out dedicated default queues only after we have
1738                  * sufficient read queues.
1739                  */
1740                 ctrl->io_queues[HCTX_TYPE_READ] = opts->nr_io_queues;
1741                 nr_io_queues -= ctrl->io_queues[HCTX_TYPE_READ];
1742                 ctrl->io_queues[HCTX_TYPE_DEFAULT] =
1743                         min(opts->nr_write_queues, nr_io_queues);
1744                 nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
1745         } else {
1746                 /*
1747                  * shared read/write queues
1748                  * either no write queues were requested, or we don't have
1749                  * sufficient queue count to have dedicated default queues.
1750                  */
1751                 ctrl->io_queues[HCTX_TYPE_DEFAULT] =
1752                         min(opts->nr_io_queues, nr_io_queues);
1753                 nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
1754         }
1755
1756         if (opts->nr_poll_queues && nr_io_queues) {
1757                 /* map dedicated poll queues only if we have queues left */
1758                 ctrl->io_queues[HCTX_TYPE_POLL] =
1759                         min(opts->nr_poll_queues, nr_io_queues);
1760         }
1761 }
1762
1763 static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
1764 {
1765         unsigned int nr_io_queues;
1766         int ret;
1767
1768         nr_io_queues = nvme_tcp_nr_io_queues(ctrl);
1769         ret = nvme_set_queue_count(ctrl, &nr_io_queues);
1770         if (ret)
1771                 return ret;
1772
1773         ctrl->queue_count = nr_io_queues + 1;
1774         if (ctrl->queue_count < 2) {
1775                 dev_err(ctrl->device,
1776                         "unable to set any I/O queues\n");
1777                 return -ENOMEM;
1778         }
1779
1780         dev_info(ctrl->device,
1781                 "creating %d I/O queues.\n", nr_io_queues);
1782
1783         nvme_tcp_set_io_queues(ctrl, nr_io_queues);
1784
1785         return __nvme_tcp_alloc_io_queues(ctrl);
1786 }
1787
1788 static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove)
1789 {
1790         nvme_tcp_stop_io_queues(ctrl);
1791         if (remove) {
1792                 blk_cleanup_queue(ctrl->connect_q);
1793                 blk_mq_free_tag_set(ctrl->tagset);
1794         }
1795         nvme_tcp_free_io_queues(ctrl);
1796 }
1797
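     /*
      * Allocate the I/O queues and, for a new controller, the tag set
      * and connect_q, then connect the queues.  On a reset/reconnect
      * (!new), restart the queues, wait for the freeze started during
      * teardown and update nr_hw_queues in case the queue count changed.
      */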
1798 static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
1799 {
1800         int ret;
1801
1802         ret = nvme_tcp_alloc_io_queues(ctrl);
1803         if (ret)
1804                 return ret;
1805
1806         if (new) {
1807                 ctrl->tagset = nvme_tcp_alloc_tagset(ctrl, false);
1808                 if (IS_ERR(ctrl->tagset)) {
1809                         ret = PTR_ERR(ctrl->tagset);
1810                         goto out_free_io_queues;
1811                 }
1812
1813                 ctrl->connect_q = blk_mq_init_queue(ctrl->tagset);
1814                 if (IS_ERR(ctrl->connect_q)) {
1815                         ret = PTR_ERR(ctrl->connect_q);
1816                         goto out_free_tag_set;
1817                 }
1818         }
1819
1820         ret = nvme_tcp_start_io_queues(ctrl);
1821         if (ret)
1822                 goto out_cleanup_connect_q;
1823
1824         if (!new) {
1825                 nvme_start_queues(ctrl);
1826                 if (!nvme_wait_freeze_timeout(ctrl, NVME_IO_TIMEOUT)) {
1827                         /*
1828                          * If we timed out waiting for freeze we are likely to
1829                          * be stuck.  Fail the controller initialization just
1830                          * to be safe.
1831                          */
1832                         ret = -ENODEV;
1833                         goto out_wait_freeze_timed_out;
1834                 }
1835                 blk_mq_update_nr_hw_queues(ctrl->tagset,
1836                         ctrl->queue_count - 1);
1837                 nvme_unfreeze(ctrl);
1838         }
1839
1840         return 0;
1841
1842 out_wait_freeze_timed_out:
1843         nvme_stop_queues(ctrl);
1844         nvme_sync_io_queues(ctrl);
1845         nvme_tcp_stop_io_queues(ctrl);
1846 out_cleanup_connect_q:
1847         nvme_cancel_tagset(ctrl);
1848         if (new)
1849                 blk_cleanup_queue(ctrl->connect_q);
1850 out_free_tag_set:
1851         if (new)
1852                 blk_mq_free_tag_set(ctrl->tagset);
1853 out_free_io_queues:
1854         nvme_tcp_free_io_queues(ctrl);
1855         return ret;
1856 }
1857
1858 static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove)
1859 {
1860         nvme_tcp_stop_queue(ctrl, 0);
1861         if (remove) {
1862                 blk_cleanup_queue(ctrl->admin_q);
1863                 blk_cleanup_queue(ctrl->fabrics_q);
1864                 blk_mq_free_tag_set(ctrl->admin_tagset);
1865         }
1866         nvme_tcp_free_admin_queue(ctrl);
1867 }
1868
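     /*
      * Bring up the admin queue: allocate it, create the admin tag set
      * and request queues for a new controller, connect queue 0, enable
      * the controller and finish generic controller initialization.
      */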
1869 static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
1870 {
1871         int error;
1872
1873         error = nvme_tcp_alloc_admin_queue(ctrl);
1874         if (error)
1875                 return error;
1876
1877         if (new) {
1878                 ctrl->admin_tagset = nvme_tcp_alloc_tagset(ctrl, true);
1879                 if (IS_ERR(ctrl->admin_tagset)) {
1880                         error = PTR_ERR(ctrl->admin_tagset);
1881                         goto out_free_queue;
1882                 }
1883
1884                 ctrl->fabrics_q = blk_mq_init_queue(ctrl->admin_tagset);
1885                 if (IS_ERR(ctrl->fabrics_q)) {
1886                         error = PTR_ERR(ctrl->fabrics_q);
1887                         goto out_free_tagset;
1888                 }
1889
1890                 ctrl->admin_q = blk_mq_init_queue(ctrl->admin_tagset);
1891                 if (IS_ERR(ctrl->admin_q)) {
1892                         error = PTR_ERR(ctrl->admin_q);
1893                         goto out_cleanup_fabrics_q;
1894                 }
1895         }
1896
1897         error = nvme_tcp_start_queue(ctrl, 0);
1898         if (error)
1899                 goto out_cleanup_queue;
1900
1901         error = nvme_enable_ctrl(ctrl);
1902         if (error)
1903                 goto out_stop_queue;
1904
1905         blk_mq_unquiesce_queue(ctrl->admin_q);
1906
1907         error = nvme_init_ctrl_finish(ctrl);
1908         if (error)
1909                 goto out_quiesce_queue;
1910
1911         return 0;
1912
1913 out_quiesce_queue:
1914         blk_mq_quiesce_queue(ctrl->admin_q);
1915         blk_sync_queue(ctrl->admin_q);
1916 out_stop_queue:
1917         nvme_tcp_stop_queue(ctrl, 0);
1918         nvme_cancel_admin_tagset(ctrl);
1919 out_cleanup_queue:
1920         if (new)
1921                 blk_cleanup_queue(ctrl->admin_q);
1922 out_cleanup_fabrics_q:
1923         if (new)
1924                 blk_cleanup_queue(ctrl->fabrics_q);
1925 out_free_tagset:
1926         if (new)
1927                 blk_mq_free_tag_set(ctrl->admin_tagset);
1928 out_free_queue:
1929         nvme_tcp_free_admin_queue(ctrl);
1930         return error;
1931 }
1932
1933 static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
1934                 bool remove)
1935 {
1936         blk_mq_quiesce_queue(ctrl->admin_q);
1937         blk_sync_queue(ctrl->admin_q);
1938         nvme_tcp_stop_queue(ctrl, 0);
1939         nvme_cancel_admin_tagset(ctrl);
1940         if (remove)
1941                 blk_mq_unquiesce_queue(ctrl->admin_q);
1942         nvme_tcp_destroy_admin_queue(ctrl, remove);
1943 }
1944
1945 static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
1946                 bool remove)
1947 {
1948         if (ctrl->queue_count <= 1)
1949                 return;
1950         blk_mq_quiesce_queue(ctrl->admin_q);
1951         nvme_start_freeze(ctrl);
1952         nvme_stop_queues(ctrl);
1953         nvme_sync_io_queues(ctrl);
1954         nvme_tcp_stop_io_queues(ctrl);
1955         nvme_cancel_tagset(ctrl);
1956         if (remove)
1957                 nvme_start_queues(ctrl);
1958         nvme_tcp_destroy_io_queues(ctrl, remove);
1959 }
1960
1961 static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
1962 {
1963         /* If we are resetting/deleting then do nothing */
1964         if (ctrl->state != NVME_CTRL_CONNECTING) {
1965                 WARN_ON_ONCE(ctrl->state == NVME_CTRL_NEW ||
1966                         ctrl->state == NVME_CTRL_LIVE);
1967                 return;
1968         }
1969
1970         if (nvmf_should_reconnect(ctrl)) {
1971                 dev_info(ctrl->device, "Reconnecting in %d seconds...\n",
1972                         ctrl->opts->reconnect_delay);
1973                 queue_delayed_work(nvme_wq, &to_tcp_ctrl(ctrl)->connect_work,
1974                                 ctrl->opts->reconnect_delay * HZ);
1975         } else {
1976                 dev_info(ctrl->device, "Removing controller...\n");
1977                 nvme_delete_ctrl(ctrl);
1978         }
1979 }
1980
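     /*
      * Common setup path for controller creation, resets and reconnects:
      * bring up the admin queue, validate transport requirements (zero
      * ICDOFF, SGL support), clamp the queue sizes, bring up the I/O
      * queues and move the controller to LIVE.
      */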
1981 static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
1982 {
1983         struct nvmf_ctrl_options *opts = ctrl->opts;
1984         int ret;
1985
1986         ret = nvme_tcp_configure_admin_queue(ctrl, new);
1987         if (ret)
1988                 return ret;
1989
1990         if (ctrl->icdoff) {
1991                 ret = -EOPNOTSUPP;
1992                 dev_err(ctrl->device, "icdoff is not supported!\n");
1993                 goto destroy_admin;
1994         }
1995
1996         if (!nvme_ctrl_sgl_supported(ctrl)) {
1997                 ret = -EOPNOTSUPP;
1998                 dev_err(ctrl->device, "Mandatory sgls are not supported!\n");
1999                 goto destroy_admin;
2000         }
2001
2002         if (opts->queue_size > ctrl->sqsize + 1)
2003                 dev_warn(ctrl->device,
2004                         "queue_size %zu > ctrl sqsize %u, clamping down\n",
2005                         opts->queue_size, ctrl->sqsize + 1);
2006
2007         if (ctrl->sqsize + 1 > ctrl->maxcmd) {
2008                 dev_warn(ctrl->device,
2009                         "sqsize %u > ctrl maxcmd %u, clamping down\n",
2010                         ctrl->sqsize + 1, ctrl->maxcmd);
2011                 ctrl->sqsize = ctrl->maxcmd - 1;
2012         }
2013
2014         if (ctrl->queue_count > 1) {
2015                 ret = nvme_tcp_configure_io_queues(ctrl, new);
2016                 if (ret)
2017                         goto destroy_admin;
2018         }
2019
2020         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) {
2021                 /*
2022                  * A state change failure is ok if we started ctrl delete;
2023                  * the exception is while we are creating a new controller,
2024                  * where it would race with the teardown flow.
2025                  */
2026                 WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
2027                              ctrl->state != NVME_CTRL_DELETING_NOIO);
2028                 WARN_ON_ONCE(new);
2029                 ret = -EINVAL;
2030                 goto destroy_io;
2031         }
2032
2033         nvme_start_ctrl(ctrl);
2034         return 0;
2035
2036 destroy_io:
2037         if (ctrl->queue_count > 1) {
2038                 nvme_stop_queues(ctrl);
2039                 nvme_sync_io_queues(ctrl);
2040                 nvme_tcp_stop_io_queues(ctrl);
2041                 nvme_cancel_tagset(ctrl);
2042                 nvme_tcp_destroy_io_queues(ctrl, new);
2043         }
2044 destroy_admin:
2045         blk_mq_quiesce_queue(ctrl->admin_q);
2046         blk_sync_queue(ctrl->admin_q);
2047         nvme_tcp_stop_queue(ctrl, 0);
2048         nvme_cancel_admin_tagset(ctrl);
2049         nvme_tcp_destroy_admin_queue(ctrl, new);
2050         return ret;
2051 }
2052
2053 static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
2054 {
2055         struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
2056                         struct nvme_tcp_ctrl, connect_work);
2057         struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
2058
2059         ++ctrl->nr_reconnects;
2060
2061         if (nvme_tcp_setup_ctrl(ctrl, false))
2062                 goto requeue;
2063
2064         dev_info(ctrl->device, "Successfully reconnected (attempt %d)\n",
2065                         ctrl->nr_reconnects);
2066
2067         ctrl->nr_reconnects = 0;
2068
2069         return;
2070
2071 requeue:
2072         dev_info(ctrl->device, "Failed reconnect attempt %d\n",
2073                         ctrl->nr_reconnects);
2074         nvme_tcp_reconnect_or_remove(ctrl);
2075 }
2076
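     /*
      * Error recovery: tear down all queues while fast-failing pending
      * requests, then move to CONNECTING and either schedule a reconnect
      * or remove the controller.
      */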
2077 static void nvme_tcp_error_recovery_work(struct work_struct *work)
2078 {
2079         struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
2080                                 struct nvme_tcp_ctrl, err_work);
2081         struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
2082
2083         nvme_stop_keep_alive(ctrl);
2084         nvme_tcp_teardown_io_queues(ctrl, false);
2085         /* unquiesce to fast-fail pending requests */
2086         nvme_start_queues(ctrl);
2087         nvme_tcp_teardown_admin_queue(ctrl, false);
2088         blk_mq_unquiesce_queue(ctrl->admin_q);
2089
2090         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
2091                 /* state change failure is ok if we started ctrl delete */
2092                 WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
2093                              ctrl->state != NVME_CTRL_DELETING_NOIO);
2094                 return;
2095         }
2096
2097         nvme_tcp_reconnect_or_remove(ctrl);
2098 }
2099
2100 static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
2101 {
2102         cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work);
2103         cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
2104
2105         nvme_tcp_teardown_io_queues(ctrl, shutdown);
2106         blk_mq_quiesce_queue(ctrl->admin_q);
2107         if (shutdown)
2108                 nvme_shutdown_ctrl(ctrl);
2109         else
2110                 nvme_disable_ctrl(ctrl);
2111         nvme_tcp_teardown_admin_queue(ctrl, shutdown);
2112 }
2113
2114 static void nvme_tcp_delete_ctrl(struct nvme_ctrl *ctrl)
2115 {
2116         nvme_tcp_teardown_ctrl(ctrl, true);
2117 }
2118
2119 static void nvme_reset_ctrl_work(struct work_struct *work)
2120 {
2121         struct nvme_ctrl *ctrl =
2122                 container_of(work, struct nvme_ctrl, reset_work);
2123
2124         nvme_stop_ctrl(ctrl);
2125         nvme_tcp_teardown_ctrl(ctrl, false);
2126
2127         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
2128                 /* state change failure is ok if we started ctrl delete */
2129                 WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
2130                              ctrl->state != NVME_CTRL_DELETING_NOIO);
2131                 return;
2132         }
2133
2134         if (nvme_tcp_setup_ctrl(ctrl, false))
2135                 goto out_fail;
2136
2137         return;
2138
2139 out_fail:
2140         ++ctrl->nr_reconnects;
2141         nvme_tcp_reconnect_or_remove(ctrl);
2142 }
2143
2144 static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl)
2145 {
2146         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
2147
2148         if (list_empty(&ctrl->list))
2149                 goto free_ctrl;
2150
2151         mutex_lock(&nvme_tcp_ctrl_mutex);
2152         list_del(&ctrl->list);
2153         mutex_unlock(&nvme_tcp_ctrl_mutex);
2154
2155         nvmf_free_options(nctrl->opts);
2156 free_ctrl:
2157         kfree(ctrl->queues);
2158         kfree(ctrl);
2159 }
2160
2161 static void nvme_tcp_set_sg_null(struct nvme_command *c)
2162 {
2163         struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
2164
2165         sg->addr = 0;
2166         sg->length = 0;
2167         sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
2168                         NVME_SGL_FMT_TRANSPORT_A;
2169 }
2170
2171 static void nvme_tcp_set_sg_inline(struct nvme_tcp_queue *queue,
2172                 struct nvme_command *c, u32 data_len)
2173 {
2174         struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
2175
2176         sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
2177         sg->length = cpu_to_le32(data_len);
2178         sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
2179 }
2180
2181 static void nvme_tcp_set_sg_host_data(struct nvme_command *c,
2182                 u32 data_len)
2183 {
2184         struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
2185
2186         sg->addr = 0;
2187         sg->length = cpu_to_le32(data_len);
2188         sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
2189                         NVME_SGL_FMT_TRANSPORT_A;
2190 }
2191
2192 static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
2193 {
2194         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(arg);
2195         struct nvme_tcp_queue *queue = &ctrl->queues[0];
2196         struct nvme_tcp_cmd_pdu *pdu = ctrl->async_req.pdu;
2197         struct nvme_command *cmd = &pdu->cmd;
2198         u8 hdgst = nvme_tcp_hdgst_len(queue);
2199
2200         memset(pdu, 0, sizeof(*pdu));
2201         pdu->hdr.type = nvme_tcp_cmd;
2202         if (queue->hdr_digest)
2203                 pdu->hdr.flags |= NVME_TCP_F_HDGST;
2204         pdu->hdr.hlen = sizeof(*pdu);
2205         pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
2206
2207         cmd->common.opcode = nvme_admin_async_event;
2208         cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH;
2209         cmd->common.flags |= NVME_CMD_SGL_METABUF;
2210         nvme_tcp_set_sg_null(cmd);
2211
2212         ctrl->async_req.state = NVME_TCP_SEND_CMD_PDU;
2213         ctrl->async_req.offset = 0;
2214         ctrl->async_req.curr_bio = NULL;
2215         ctrl->async_req.data_len = 0;
2216
2217         nvme_tcp_queue_request(&ctrl->async_req, true, true);
2218 }
2219
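     /*
      * Complete a timed-out request after stopping its queue, so the
      * completion cannot race with a late arrival on the socket.
      */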
2220 static void nvme_tcp_complete_timed_out(struct request *rq)
2221 {
2222         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2223         struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;
2224
2225         nvme_tcp_stop_queue(ctrl, nvme_tcp_queue_id(req->queue));
2226         if (blk_mq_request_started(rq) && !blk_mq_request_completed(rq)) {
2227                 nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD;
2228                 blk_mq_complete_request(rq);
2229         }
2230 }
2231
2232 static enum blk_eh_timer_return
2233 nvme_tcp_timeout(struct request *rq, bool reserved)
2234 {
2235         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2236         struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;
2237         struct nvme_tcp_cmd_pdu *pdu = req->pdu;
2238
2239         dev_warn(ctrl->device,
2240                 "queue %d: timeout request %#x type %d\n",
2241                 nvme_tcp_queue_id(req->queue), rq->tag, pdu->hdr.type);
2242
2243         if (ctrl->state != NVME_CTRL_LIVE) {
2244                 /*
2245                  * If we are resetting, connecting or deleting we should
2246                  * complete the request immediately because it may be
2247                  * blocking the controller teardown or setup sequence:
2248                  * - ctrl disable/shutdown fabrics requests
2249                  * - connect requests
2250                  * - initialization admin requests
2251                  * - I/O requests that entered after unquiescing and
2252                  *   the controller stopped responding
2253                  *
2254                  * All other requests should be cancelled by the error
2255                  * recovery work, so it's fine to fail it here.
2256                  */
2257                 nvme_tcp_complete_timed_out(rq);
2258                 return BLK_EH_DONE;
2259         }
2260
2261         /*
2262          * LIVE state should trigger the normal error recovery which will
2263          * handle completing this request.
2264          */
2265         nvme_tcp_error_recovery(ctrl);
2266         return BLK_EH_RESET_TIMER;
2267 }
2268
2269 static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue,
2270                         struct request *rq)
2271 {
2272         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2273         struct nvme_tcp_cmd_pdu *pdu = req->pdu;
2274         struct nvme_command *c = &pdu->cmd;
2275
2276         c->common.flags |= NVME_CMD_SGL_METABUF;
2277
2278         if (!blk_rq_nr_phys_segments(rq))
2279                 nvme_tcp_set_sg_null(c);
2280         else if (rq_data_dir(rq) == WRITE &&
2281             req->data_len <= nvme_tcp_inline_data_size(queue))
2282                 nvme_tcp_set_sg_inline(queue, c, req->data_len);
2283         else
2284                 nvme_tcp_set_sg_host_data(c, req->data_len);
2285
2286         return 0;
2287 }
2288
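     /*
      * Build the command PDU for a request: initialize the send state
      * and data iterator, decide whether the payload is sent inline
      * (writes that fit within the in-capsule data size) and fill in the
      * PDU header, including digest flags and lengths.
      */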
2289 static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
2290                 struct request *rq)
2291 {
2292         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2293         struct nvme_tcp_cmd_pdu *pdu = req->pdu;
2294         struct nvme_tcp_queue *queue = req->queue;
2295         u8 hdgst = nvme_tcp_hdgst_len(queue), ddgst = 0;
2296         blk_status_t ret;
2297
2298         ret = nvme_setup_cmd(ns, rq);
2299         if (ret)
2300                 return ret;
2301
2302         req->state = NVME_TCP_SEND_CMD_PDU;
2303         req->offset = 0;
2304         req->data_sent = 0;
2305         req->pdu_len = 0;
2306         req->pdu_sent = 0;
2307         req->data_len = blk_rq_nr_phys_segments(rq) ?
2308                                 blk_rq_payload_bytes(rq) : 0;
2309         req->curr_bio = rq->bio;
2310         if (req->curr_bio && req->data_len)
2311                 nvme_tcp_init_iter(req, rq_data_dir(rq));
2312
2313         if (rq_data_dir(rq) == WRITE &&
2314             req->data_len <= nvme_tcp_inline_data_size(queue))
2315                 req->pdu_len = req->data_len;
2316
2317         pdu->hdr.type = nvme_tcp_cmd;
2318         pdu->hdr.flags = 0;
2319         if (queue->hdr_digest)
2320                 pdu->hdr.flags |= NVME_TCP_F_HDGST;
2321         if (queue->data_digest && req->pdu_len) {
2322                 pdu->hdr.flags |= NVME_TCP_F_DDGST;
2323                 ddgst = nvme_tcp_ddgst_len(queue);
2324         }
2325         pdu->hdr.hlen = sizeof(*pdu);
2326         pdu->hdr.pdo = req->pdu_len ? pdu->hdr.hlen + hdgst : 0;
2327         pdu->hdr.plen =
2328                 cpu_to_le32(pdu->hdr.hlen + hdgst + req->pdu_len + ddgst);
2329
2330         ret = nvme_tcp_map_data(queue, rq);
2331         if (unlikely(ret)) {
2332                 nvme_cleanup_cmd(rq);
2333                 dev_err(queue->ctrl->ctrl.device,
2334                         "Failed to map data (%d)\n", ret);
2335                 return ret;
2336         }
2337
2338         return 0;
2339 }
2340
2341 static void nvme_tcp_commit_rqs(struct blk_mq_hw_ctx *hctx)
2342 {
2343         struct nvme_tcp_queue *queue = hctx->driver_data;
2344
2345         if (!llist_empty(&queue->req_list))
2346                 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
2347 }
2348
2349 static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
2350                 const struct blk_mq_queue_data *bd)
2351 {
2352         struct nvme_ns *ns = hctx->queue->queuedata;
2353         struct nvme_tcp_queue *queue = hctx->driver_data;
2354         struct request *rq = bd->rq;
2355         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2356         bool queue_ready = test_bit(NVME_TCP_Q_LIVE, &queue->flags);
2357         blk_status_t ret;
2358
2359         if (!nvme_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
2360                 return nvme_fail_nonready_command(&queue->ctrl->ctrl, rq);
2361
2362         ret = nvme_tcp_setup_cmd_pdu(ns, rq);
2363         if (unlikely(ret))
2364                 return ret;
2365
2366         blk_mq_start_request(rq);
2367
2368         nvme_tcp_queue_request(req, true, bd->last);
2369
2370         return BLK_STS_OK;
2371 }
2372
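     /*
      * Map blk-mq hardware contexts onto the queues laid out by
      * nvme_tcp_set_io_queues(): default queues first, then read queues
      * (sharing the default queues when no separate read queues were set
      * up), and poll queues at the end.
      */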
2373 static int nvme_tcp_map_queues(struct blk_mq_tag_set *set)
2374 {
2375         struct nvme_tcp_ctrl *ctrl = set->driver_data;
2376         struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
2377
2378         if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) {
2379                 /* separate read/write queues */
2380                 set->map[HCTX_TYPE_DEFAULT].nr_queues =
2381                         ctrl->io_queues[HCTX_TYPE_DEFAULT];
2382                 set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
2383                 set->map[HCTX_TYPE_READ].nr_queues =
2384                         ctrl->io_queues[HCTX_TYPE_READ];
2385                 set->map[HCTX_TYPE_READ].queue_offset =
2386                         ctrl->io_queues[HCTX_TYPE_DEFAULT];
2387         } else {
2388                 /* shared read/write queues */
2389                 set->map[HCTX_TYPE_DEFAULT].nr_queues =
2390                         ctrl->io_queues[HCTX_TYPE_DEFAULT];
2391                 set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
2392                 set->map[HCTX_TYPE_READ].nr_queues =
2393                         ctrl->io_queues[HCTX_TYPE_DEFAULT];
2394                 set->map[HCTX_TYPE_READ].queue_offset = 0;
2395         }
2396         blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
2397         blk_mq_map_queues(&set->map[HCTX_TYPE_READ]);
2398
2399         if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) {
2400                 /* map dedicated poll queues only if we have queues left */
2401                 set->map[HCTX_TYPE_POLL].nr_queues =
2402                                 ctrl->io_queues[HCTX_TYPE_POLL];
2403                 set->map[HCTX_TYPE_POLL].queue_offset =
2404                         ctrl->io_queues[HCTX_TYPE_DEFAULT] +
2405                         ctrl->io_queues[HCTX_TYPE_READ];
2406                 blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
2407         }
2408
2409         dev_info(ctrl->ctrl.device,
2410                 "mapped %d/%d/%d default/read/poll queues.\n",
2411                 ctrl->io_queues[HCTX_TYPE_DEFAULT],
2412                 ctrl->io_queues[HCTX_TYPE_READ],
2413                 ctrl->io_queues[HCTX_TYPE_POLL]);
2414
2415         return 0;
2416 }
2417
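     /*
      * blk-mq poll callback: busy-poll the socket when its receive queue
      * is empty, then reap received completions in the caller's context
      * and return the number of completed CQEs.
      */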
2418 static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx)
2419 {
2420         struct nvme_tcp_queue *queue = hctx->driver_data;
2421         struct sock *sk = queue->sock->sk;
2422
2423         if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags))
2424                 return 0;
2425
2426         set_bit(NVME_TCP_Q_POLLING, &queue->flags);
2427         if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue))
2428                 sk_busy_loop(sk, true);
2429         nvme_tcp_try_recv(queue);
2430         clear_bit(NVME_TCP_Q_POLLING, &queue->flags);
2431         return queue->nr_cqe;
2432 }
2433
2434 static const struct blk_mq_ops nvme_tcp_mq_ops = {
2435         .queue_rq       = nvme_tcp_queue_rq,
2436         .commit_rqs     = nvme_tcp_commit_rqs,
2437         .complete       = nvme_complete_rq,
2438         .init_request   = nvme_tcp_init_request,
2439         .exit_request   = nvme_tcp_exit_request,
2440         .init_hctx      = nvme_tcp_init_hctx,
2441         .timeout        = nvme_tcp_timeout,
2442         .map_queues     = nvme_tcp_map_queues,
2443         .poll           = nvme_tcp_poll,
2444 };
2445
2446 static const struct blk_mq_ops nvme_tcp_admin_mq_ops = {
2447         .queue_rq       = nvme_tcp_queue_rq,
2448         .complete       = nvme_complete_rq,
2449         .init_request   = nvme_tcp_init_request,
2450         .exit_request   = nvme_tcp_exit_request,
2451         .init_hctx      = nvme_tcp_init_admin_hctx,
2452         .timeout        = nvme_tcp_timeout,
2453 };
2454
2455 static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
2456         .name                   = "tcp",
2457         .module                 = THIS_MODULE,
2458         .flags                  = NVME_F_FABRICS,
2459         .reg_read32             = nvmf_reg_read32,
2460         .reg_read64             = nvmf_reg_read64,
2461         .reg_write32            = nvmf_reg_write32,
2462         .free_ctrl              = nvme_tcp_free_ctrl,
2463         .submit_async_event     = nvme_tcp_submit_async_event,
2464         .delete_ctrl            = nvme_tcp_delete_ctrl,
2465         .get_address            = nvmf_get_address,
2466 };
2467
2468 static bool
2469 nvme_tcp_existing_controller(struct nvmf_ctrl_options *opts)
2470 {
2471         struct nvme_tcp_ctrl *ctrl;
2472         bool found = false;
2473
2474         mutex_lock(&nvme_tcp_ctrl_mutex);
2475         list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list) {
2476                 found = nvmf_ip_options_match(&ctrl->ctrl, opts);
2477                 if (found)
2478                         break;
2479         }
2480         mutex_unlock(&nvme_tcp_ctrl_mutex);
2481
2482         return found;
2483 }
2484
2485 static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
2486                 struct nvmf_ctrl_options *opts)
2487 {
2488         struct nvme_tcp_ctrl *ctrl;
2489         int ret;
2490
2491         ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
2492         if (!ctrl)
2493                 return ERR_PTR(-ENOMEM);
2494
2495         INIT_LIST_HEAD(&ctrl->list);
2496         ctrl->ctrl.opts = opts;
2497         ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues +
2498                                 opts->nr_poll_queues + 1;
2499         ctrl->ctrl.sqsize = opts->queue_size - 1;
2500         ctrl->ctrl.kato = opts->kato;
2501
2502         INIT_DELAYED_WORK(&ctrl->connect_work,
2503                         nvme_tcp_reconnect_ctrl_work);
2504         INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
2505         INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work);
2506
2507         if (!(opts->mask & NVMF_OPT_TRSVCID)) {
2508                 opts->trsvcid =
2509                         kstrdup(__stringify(NVME_TCP_DISC_PORT), GFP_KERNEL);
2510                 if (!opts->trsvcid) {
2511                         ret = -ENOMEM;
2512                         goto out_free_ctrl;
2513                 }
2514                 opts->mask |= NVMF_OPT_TRSVCID;
2515         }
2516
2517         ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
2518                         opts->traddr, opts->trsvcid, &ctrl->addr);
2519         if (ret) {
2520                 pr_err("malformed address passed: %s:%s\n",
2521                         opts->traddr, opts->trsvcid);
2522                 goto out_free_ctrl;
2523         }
2524
2525         if (opts->mask & NVMF_OPT_HOST_TRADDR) {
2526                 ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
2527                         opts->host_traddr, NULL, &ctrl->src_addr);
2528                 if (ret) {
2529                         pr_err("malformed src address passed: %s\n",
2530                                opts->host_traddr);
2531                         goto out_free_ctrl;
2532                 }
2533         }
2534
2535         if (opts->mask & NVMF_OPT_HOST_IFACE) {
2536                 ctrl->ndev = dev_get_by_name(&init_net, opts->host_iface);
2537                 if (!ctrl->ndev) {
2538                         pr_err("invalid interface passed: %s\n",
2539                                opts->host_iface);
2540                         ret = -ENODEV;
2541                         goto out_free_ctrl;
2542                 }
2543         }
2544
2545         if (!opts->duplicate_connect && nvme_tcp_existing_controller(opts)) {
2546                 ret = -EALREADY;
2547                 goto out_free_ctrl;
2548         }
2549
2550         ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues),
2551                                 GFP_KERNEL);
2552         if (!ctrl->queues) {
2553                 ret = -ENOMEM;
2554                 goto out_free_ctrl;
2555         }
2556
2557         ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_tcp_ctrl_ops, 0);
2558         if (ret)
2559                 goto out_kfree_queues;
2560
2561         if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
2562                 WARN_ON_ONCE(1);
2563                 ret = -EINTR;
2564                 goto out_uninit_ctrl;
2565         }
2566
2567         ret = nvme_tcp_setup_ctrl(&ctrl->ctrl, true);
2568         if (ret)
2569                 goto out_uninit_ctrl;
2570
2571         dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n",
2572                 ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
2573
2574         mutex_lock(&nvme_tcp_ctrl_mutex);
2575         list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list);
2576         mutex_unlock(&nvme_tcp_ctrl_mutex);
2577
2578         return &ctrl->ctrl;
2579
2580 out_uninit_ctrl:
2581         nvme_uninit_ctrl(&ctrl->ctrl);
2582         nvme_put_ctrl(&ctrl->ctrl);
2583         if (ret > 0)
2584                 ret = -EIO;
2585         return ERR_PTR(ret);
2586 out_kfree_queues:
2587         kfree(ctrl->queues);
2588 out_free_ctrl:
2589         kfree(ctrl);
2590         return ERR_PTR(ret);
2591 }
2592
2593 static struct nvmf_transport_ops nvme_tcp_transport = {
2594         .name           = "tcp",
2595         .module         = THIS_MODULE,
2596         .required_opts  = NVMF_OPT_TRADDR,
2597         .allowed_opts   = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
2598                           NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
2599                           NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
2600                           NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
2601                           NVMF_OPT_TOS | NVMF_OPT_HOST_IFACE,
2602         .create_ctrl    = nvme_tcp_create_ctrl,
2603 };
2604
2605 static int __init nvme_tcp_init_module(void)
2606 {
2607         nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq",
2608                         WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
2609         if (!nvme_tcp_wq)
2610                 return -ENOMEM;
2611
2612         nvmf_register_transport(&nvme_tcp_transport);
2613         return 0;
2614 }
2615
2616 static void __exit nvme_tcp_cleanup_module(void)
2617 {
2618         struct nvme_tcp_ctrl *ctrl;
2619
2620         nvmf_unregister_transport(&nvme_tcp_transport);
2621
2622         mutex_lock(&nvme_tcp_ctrl_mutex);
2623         list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list)
2624                 nvme_delete_ctrl(&ctrl->ctrl);
2625         mutex_unlock(&nvme_tcp_ctrl_mutex);
2626         flush_workqueue(nvme_delete_wq);
2627
2628         destroy_workqueue(nvme_tcp_wq);
2629 }
2630
2631 module_init(nvme_tcp_init_module);
2632 module_exit(nvme_tcp_cleanup_module);
2633
2634 MODULE_LICENSE("GPL v2");