468fdd0d8713ccce5f9f561d50a2c4fed309a303
[platform/kernel/linux-starfive.git] / drivers / infiniband / ulp / rtrs / rtrs-clt.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * RDMA Transport Layer
4  *
5  * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved.
6  * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved.
7  * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved.
8  */
9
/* Prefix every pr_*() message with the module name and source line. */
#undef pr_fmt
#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt
12
13 #include <linux/module.h>
14 #include <linux/rculist.h>
15 #include <linux/blkdev.h> /* for BLK_MAX_SEGMENT_SIZE */
16
17 #include "rtrs-clt.h"
18 #include "rtrs-log.h"
19
20 #define RTRS_CONNECT_TIMEOUT_MS 30000
21 /*
22  * Wait a bit before trying to reconnect after a failure
23  * in order to give server time to finish clean up which
24  * leads to "false positives" failed reconnect attempts
25  */
26 #define RTRS_RECONNECT_BACKOFF 1000
27
28 MODULE_DESCRIPTION("RDMA Transport Client");
29 MODULE_LICENSE("GPL");
30
31 static const struct rtrs_rdma_dev_pd_ops dev_pd_ops;
32 static struct rtrs_rdma_dev_pd dev_pd = {
33         .ops = &dev_pd_ops
34 };
35
36 static struct workqueue_struct *rtrs_wq;
37 static struct class *rtrs_clt_dev_class;
38
39 static inline bool rtrs_clt_is_connected(const struct rtrs_clt *clt)
40 {
41         struct rtrs_clt_sess *sess;
42         bool connected = false;
43
44         rcu_read_lock();
45         list_for_each_entry_rcu(sess, &clt->paths_list, s.entry)
46                 connected |= READ_ONCE(sess->state) == RTRS_CLT_CONNECTED;
47         rcu_read_unlock();
48
49         return connected;
50 }
51
52 static struct rtrs_permit *
53 __rtrs_get_permit(struct rtrs_clt *clt, enum rtrs_clt_con_type con_type)
54 {
55         size_t max_depth = clt->queue_depth;
56         struct rtrs_permit *permit;
57         int bit;
58
59         /*
60          * Adapted from null_blk get_tag(). Callers from different cpus may
61          * grab the same bit, since find_first_zero_bit is not atomic.
62          * But then the test_and_set_bit_lock will fail for all the
63          * callers but one, so that they will loop again.
64          * This way an explicit spinlock is not required.
65          */
66         do {
67                 bit = find_first_zero_bit(clt->permits_map, max_depth);
68                 if (unlikely(bit >= max_depth))
69                         return NULL;
70         } while (unlikely(test_and_set_bit_lock(bit, clt->permits_map)));
71
72         permit = get_permit(clt, bit);
73         WARN_ON(permit->mem_id != bit);
74         permit->cpu_id = raw_smp_processor_id();
75         permit->con_type = con_type;
76
77         return permit;
78 }
79
80 static inline void __rtrs_put_permit(struct rtrs_clt *clt,
81                                       struct rtrs_permit *permit)
82 {
83         clear_bit_unlock(permit->mem_id, clt->permits_map);
84 }
85
86 /**
87  * rtrs_clt_get_permit() - allocates permit for future RDMA operation
88  * @clt:        Current session
89  * @con_type:   Type of connection to use with the permit
90  * @can_wait:   Wait type
91  *
92  * Description:
93  *    Allocates permit for the following RDMA operation.  Permit is used
94  *    to preallocate all resources and to propagate memory pressure
95  *    up earlier.
96  *
97  * Context:
98  *    Can sleep if @wait == RTRS_TAG_WAIT
99  */
100 struct rtrs_permit *rtrs_clt_get_permit(struct rtrs_clt *clt,
101                                           enum rtrs_clt_con_type con_type,
102                                           int can_wait)
103 {
104         struct rtrs_permit *permit;
105         DEFINE_WAIT(wait);
106
107         permit = __rtrs_get_permit(clt, con_type);
108         if (likely(permit) || !can_wait)
109                 return permit;
110
111         do {
112                 prepare_to_wait(&clt->permits_wait, &wait,
113                                 TASK_UNINTERRUPTIBLE);
114                 permit = __rtrs_get_permit(clt, con_type);
115                 if (likely(permit))
116                         break;
117
118                 io_schedule();
119         } while (1);
120
121         finish_wait(&clt->permits_wait, &wait);
122
123         return permit;
124 }
125 EXPORT_SYMBOL(rtrs_clt_get_permit);
126
127 /**
128  * rtrs_clt_put_permit() - puts allocated permit
129  * @clt:        Current session
130  * @permit:     Permit to be freed
131  *
132  * Context:
133  *    Does not matter
134  */
135 void rtrs_clt_put_permit(struct rtrs_clt *clt, struct rtrs_permit *permit)
136 {
137         if (WARN_ON(!test_bit(permit->mem_id, clt->permits_map)))
138                 return;
139
140         __rtrs_put_permit(clt, permit);
141
142         /*
143          * rtrs_clt_get_permit() adds itself to the &clt->permits_wait list
144          * before calling schedule(). So if rtrs_clt_get_permit() is sleeping
145          * it must have added itself to &clt->permits_wait before
146          * __rtrs_put_permit() finished.
147          * Hence it is safe to guard wake_up() with a waitqueue_active() test.
148          */
149         if (waitqueue_active(&clt->permits_wait))
150                 wake_up(&clt->permits_wait);
151 }
152 EXPORT_SYMBOL(rtrs_clt_put_permit);
153
154 void *rtrs_permit_to_pdu(struct rtrs_permit *permit)
155 {
156         return permit + 1;
157 }
158 EXPORT_SYMBOL(rtrs_permit_to_pdu);
159
160 /**
161  * rtrs_permit_to_clt_con() - returns RDMA connection pointer by the permit
162  * @sess: client session pointer
163  * @permit: permit for the allocation of the RDMA buffer
164  * Note:
165  *     IO connection starts from 1.
166  *     0 connection is for user messages.
167  */
168 static
169 struct rtrs_clt_con *rtrs_permit_to_clt_con(struct rtrs_clt_sess *sess,
170                                             struct rtrs_permit *permit)
171 {
172         int id = 0;
173
174         if (likely(permit->con_type == RTRS_IO_CON))
175                 id = (permit->cpu_id % (sess->s.con_num - 1)) + 1;
176
177         return to_clt_con(sess->s.con[id]);
178 }
179
180 /**
181  * __rtrs_clt_change_state() - change the session state through session state
182  * machine.
183  *
184  * @sess: client session to change the state of.
185  * @new_state: state to change to.
186  *
187  * returns true if successful, false if the requested state can not be set.
188  *
189  * Locks:
190  * state_wq lock must be hold.
191  */
192 static bool __rtrs_clt_change_state(struct rtrs_clt_sess *sess,
193                                      enum rtrs_clt_state new_state)
194 {
195         enum rtrs_clt_state old_state;
196         bool changed = false;
197
198         lockdep_assert_held(&sess->state_wq.lock);
199
200         old_state = sess->state;
201         switch (new_state) {
202         case RTRS_CLT_CONNECTING:
203                 switch (old_state) {
204                 case RTRS_CLT_RECONNECTING:
205                         changed = true;
206                         fallthrough;
207                 default:
208                         break;
209                 }
210                 break;
211         case RTRS_CLT_RECONNECTING:
212                 switch (old_state) {
213                 case RTRS_CLT_CONNECTED:
214                 case RTRS_CLT_CONNECTING_ERR:
215                 case RTRS_CLT_CLOSED:
216                         changed = true;
217                         fallthrough;
218                 default:
219                         break;
220                 }
221                 break;
222         case RTRS_CLT_CONNECTED:
223                 switch (old_state) {
224                 case RTRS_CLT_CONNECTING:
225                         changed = true;
226                         fallthrough;
227                 default:
228                         break;
229                 }
230                 break;
231         case RTRS_CLT_CONNECTING_ERR:
232                 switch (old_state) {
233                 case RTRS_CLT_CONNECTING:
234                         changed = true;
235                         fallthrough;
236                 default:
237                         break;
238                 }
239                 break;
240         case RTRS_CLT_CLOSING:
241                 switch (old_state) {
242                 case RTRS_CLT_CONNECTING:
243                 case RTRS_CLT_CONNECTING_ERR:
244                 case RTRS_CLT_RECONNECTING:
245                 case RTRS_CLT_CONNECTED:
246                         changed = true;
247                         fallthrough;
248                 default:
249                         break;
250                 }
251                 break;
252         case RTRS_CLT_CLOSED:
253                 switch (old_state) {
254                 case RTRS_CLT_CLOSING:
255                         changed = true;
256                         fallthrough;
257                 default:
258                         break;
259                 }
260                 break;
261         case RTRS_CLT_DEAD:
262                 switch (old_state) {
263                 case RTRS_CLT_CLOSED:
264                         changed = true;
265                         fallthrough;
266                 default:
267                         break;
268                 }
269                 break;
270         default:
271                 break;
272         }
273         if (changed) {
274                 sess->state = new_state;
275                 wake_up_locked(&sess->state_wq);
276         }
277
278         return changed;
279 }
280
281 static bool rtrs_clt_change_state_from_to(struct rtrs_clt_sess *sess,
282                                            enum rtrs_clt_state old_state,
283                                            enum rtrs_clt_state new_state)
284 {
285         bool changed = false;
286
287         spin_lock_irq(&sess->state_wq.lock);
288         if (sess->state == old_state)
289                 changed = __rtrs_clt_change_state(sess, new_state);
290         spin_unlock_irq(&sess->state_wq.lock);
291
292         return changed;
293 }
294
295 static void rtrs_rdma_error_recovery(struct rtrs_clt_con *con)
296 {
297         struct rtrs_clt_sess *sess = to_clt_sess(con->c.sess);
298
299         if (rtrs_clt_change_state_from_to(sess,
300                                            RTRS_CLT_CONNECTED,
301                                            RTRS_CLT_RECONNECTING)) {
302                 struct rtrs_clt *clt = sess->clt;
303                 unsigned int delay_ms;
304
305                 /*
306                  * Normal scenario, reconnect if we were successfully connected
307                  */
308                 delay_ms = clt->reconnect_delay_sec * 1000;
309                 queue_delayed_work(rtrs_wq, &sess->reconnect_dwork,
310                                    msecs_to_jiffies(delay_ms));
311         } else {
312                 /*
313                  * Error can happen just on establishing new connection,
314                  * so notify waiter with error state, waiter is responsible
315                  * for cleaning the rest and reconnect if needed.
316                  */
317                 rtrs_clt_change_state_from_to(sess,
318                                                RTRS_CLT_CONNECTING,
319                                                RTRS_CLT_CONNECTING_ERR);
320         }
321 }
322
323 static void rtrs_clt_fast_reg_done(struct ib_cq *cq, struct ib_wc *wc)
324 {
325         struct rtrs_clt_con *con = cq->cq_context;
326
327         if (unlikely(wc->status != IB_WC_SUCCESS)) {
328                 rtrs_err(con->c.sess, "Failed IB_WR_REG_MR: %s\n",
329                           ib_wc_status_msg(wc->status));
330                 rtrs_rdma_error_recovery(con);
331         }
332 }
333
334 static struct ib_cqe fast_reg_cqe = {
335         .done = rtrs_clt_fast_reg_done
336 };
337
338 static void complete_rdma_req(struct rtrs_clt_io_req *req, int errno,
339                               bool notify, bool can_wait);
340
341 static void rtrs_clt_inv_rkey_done(struct ib_cq *cq, struct ib_wc *wc)
342 {
343         struct rtrs_clt_io_req *req =
344                 container_of(wc->wr_cqe, typeof(*req), inv_cqe);
345         struct rtrs_clt_con *con = cq->cq_context;
346
347         if (unlikely(wc->status != IB_WC_SUCCESS)) {
348                 rtrs_err(con->c.sess, "Failed IB_WR_LOCAL_INV: %s\n",
349                           ib_wc_status_msg(wc->status));
350                 rtrs_rdma_error_recovery(con);
351         }
352         req->need_inv = false;
353         if (likely(req->need_inv_comp))
354                 complete(&req->inv_comp);
355         else
356                 /* Complete request from INV callback */
357                 complete_rdma_req(req, req->inv_errno, true, false);
358 }
359
360 static int rtrs_inv_rkey(struct rtrs_clt_io_req *req)
361 {
362         struct rtrs_clt_con *con = req->con;
363         struct ib_send_wr wr = {
364                 .opcode             = IB_WR_LOCAL_INV,
365                 .wr_cqe             = &req->inv_cqe,
366                 .send_flags         = IB_SEND_SIGNALED,
367                 .ex.invalidate_rkey = req->mr->rkey,
368         };
369         req->inv_cqe.done = rtrs_clt_inv_rkey_done;
370
371         return ib_post_send(con->c.qp, &wr, NULL);
372 }
373
374 static void complete_rdma_req(struct rtrs_clt_io_req *req, int errno,
375                               bool notify, bool can_wait)
376 {
377         struct rtrs_clt_con *con = req->con;
378         struct rtrs_clt_sess *sess;
379         int err;
380
381         if (WARN_ON(!req->in_use))
382                 return;
383         if (WARN_ON(!req->con))
384                 return;
385         sess = to_clt_sess(con->c.sess);
386
387         if (req->sg_cnt) {
388                 if (unlikely(req->dir == DMA_FROM_DEVICE && req->need_inv)) {
389                         /*
390                          * We are here to invalidate read requests
391                          * ourselves.  In normal scenario server should
392                          * send INV for all read requests, but
393                          * we are here, thus two things could happen:
394                          *
395                          *    1.  this is failover, when errno != 0
396                          *        and can_wait == 1,
397                          *
398                          *    2.  something totally bad happened and
399                          *        server forgot to send INV, so we
400                          *        should do that ourselves.
401                          */
402
403                         if (likely(can_wait)) {
404                                 req->need_inv_comp = true;
405                         } else {
406                                 /* This should be IO path, so always notify */
407                                 WARN_ON(!notify);
408                                 /* Save errno for INV callback */
409                                 req->inv_errno = errno;
410                         }
411
412                         err = rtrs_inv_rkey(req);
413                         if (unlikely(err)) {
414                                 rtrs_err(con->c.sess, "Send INV WR key=%#x: %d\n",
415                                           req->mr->rkey, err);
416                         } else if (likely(can_wait)) {
417                                 wait_for_completion(&req->inv_comp);
418                         } else {
419                                 /*
420                                  * Something went wrong, so request will be
421                                  * completed from INV callback.
422                                  */
423                                 WARN_ON_ONCE(1);
424
425                                 return;
426                         }
427                 }
428                 ib_dma_unmap_sg(sess->s.dev->ib_dev, req->sglist,
429                                 req->sg_cnt, req->dir);
430         }
431         if (sess->clt->mp_policy == MP_POLICY_MIN_INFLIGHT)
432                 atomic_dec(&sess->stats->inflight);
433
434         req->in_use = false;
435         req->con = NULL;
436
437         if (notify)
438                 req->conf(req->priv, errno);
439 }
440
441 static int rtrs_post_send_rdma(struct rtrs_clt_con *con,
442                                 struct rtrs_clt_io_req *req,
443                                 struct rtrs_rbuf *rbuf, u32 off,
444                                 u32 imm, struct ib_send_wr *wr)
445 {
446         struct rtrs_clt_sess *sess = to_clt_sess(con->c.sess);
447         enum ib_send_flags flags;
448         struct ib_sge sge;
449
450         if (unlikely(!req->sg_size)) {
451                 rtrs_wrn(con->c.sess,
452                          "Doing RDMA Write failed, no data supplied\n");
453                 return -EINVAL;
454         }
455
456         /* user data and user message in the first list element */
457         sge.addr   = req->iu->dma_addr;
458         sge.length = req->sg_size;
459         sge.lkey   = sess->s.dev->ib_pd->local_dma_lkey;
460
461         /*
462          * From time to time we have to post signalled sends,
463          * or send queue will fill up and only QP reset can help.
464          */
465         flags = atomic_inc_return(&con->io_cnt) % sess->queue_depth ?
466                         0 : IB_SEND_SIGNALED;
467
468         ib_dma_sync_single_for_device(sess->s.dev->ib_dev, req->iu->dma_addr,
469                                       req->sg_size, DMA_TO_DEVICE);
470
471         return rtrs_iu_post_rdma_write_imm(&con->c, req->iu, &sge, 1,
472                                             rbuf->rkey, rbuf->addr + off,
473                                             imm, flags, wr);
474 }
475
476 static void process_io_rsp(struct rtrs_clt_sess *sess, u32 msg_id,
477                            s16 errno, bool w_inval)
478 {
479         struct rtrs_clt_io_req *req;
480
481         if (WARN_ON(msg_id >= sess->queue_depth))
482                 return;
483
484         req = &sess->reqs[msg_id];
485         /* Drop need_inv if server responded with send with invalidation */
486         req->need_inv &= !w_inval;
487         complete_rdma_req(req, errno, true, false);
488 }
489
490 static void rtrs_clt_recv_done(struct rtrs_clt_con *con, struct ib_wc *wc)
491 {
492         struct rtrs_iu *iu;
493         int err;
494         struct rtrs_clt_sess *sess = to_clt_sess(con->c.sess);
495
496         WARN_ON(sess->flags != RTRS_MSG_NEW_RKEY_F);
497         iu = container_of(wc->wr_cqe, struct rtrs_iu,
498                           cqe);
499         err = rtrs_iu_post_recv(&con->c, iu);
500         if (unlikely(err)) {
501                 rtrs_err(con->c.sess, "post iu failed %d\n", err);
502                 rtrs_rdma_error_recovery(con);
503         }
504 }
505
506 static void rtrs_clt_rkey_rsp_done(struct rtrs_clt_con *con, struct ib_wc *wc)
507 {
508         struct rtrs_clt_sess *sess = to_clt_sess(con->c.sess);
509         struct rtrs_msg_rkey_rsp *msg;
510         u32 imm_type, imm_payload;
511         bool w_inval = false;
512         struct rtrs_iu *iu;
513         u32 buf_id;
514         int err;
515
516         WARN_ON(sess->flags != RTRS_MSG_NEW_RKEY_F);
517
518         iu = container_of(wc->wr_cqe, struct rtrs_iu, cqe);
519
520         if (unlikely(wc->byte_len < sizeof(*msg))) {
521                 rtrs_err(con->c.sess, "rkey response is malformed: size %d\n",
522                           wc->byte_len);
523                 goto out;
524         }
525         ib_dma_sync_single_for_cpu(sess->s.dev->ib_dev, iu->dma_addr,
526                                    iu->size, DMA_FROM_DEVICE);
527         msg = iu->buf;
528         if (unlikely(le16_to_cpu(msg->type) != RTRS_MSG_RKEY_RSP)) {
529                 rtrs_err(sess->clt, "rkey response is malformed: type %d\n",
530                           le16_to_cpu(msg->type));
531                 goto out;
532         }
533         buf_id = le16_to_cpu(msg->buf_id);
534         if (WARN_ON(buf_id >= sess->queue_depth))
535                 goto out;
536
537         rtrs_from_imm(be32_to_cpu(wc->ex.imm_data), &imm_type, &imm_payload);
538         if (likely(imm_type == RTRS_IO_RSP_IMM ||
539                    imm_type == RTRS_IO_RSP_W_INV_IMM)) {
540                 u32 msg_id;
541
542                 w_inval = (imm_type == RTRS_IO_RSP_W_INV_IMM);
543                 rtrs_from_io_rsp_imm(imm_payload, &msg_id, &err);
544
545                 if (WARN_ON(buf_id != msg_id))
546                         goto out;
547                 sess->rbufs[buf_id].rkey = le32_to_cpu(msg->rkey);
548                 process_io_rsp(sess, msg_id, err, w_inval);
549         }
550         ib_dma_sync_single_for_device(sess->s.dev->ib_dev, iu->dma_addr,
551                                       iu->size, DMA_FROM_DEVICE);
552         return rtrs_clt_recv_done(con, wc);
553 out:
554         rtrs_rdma_error_recovery(con);
555 }
556
557 static void rtrs_clt_rdma_done(struct ib_cq *cq, struct ib_wc *wc);
558
559 static struct ib_cqe io_comp_cqe = {
560         .done = rtrs_clt_rdma_done
561 };
562
563 /*
564  * Post x2 empty WRs: first is for this RDMA with IMM,
565  * second is for RECV with INV, which happened earlier.
566  */
567 static int rtrs_post_recv_empty_x2(struct rtrs_con *con, struct ib_cqe *cqe)
568 {
569         struct ib_recv_wr wr_arr[2], *wr;
570         int i;
571
572         memset(wr_arr, 0, sizeof(wr_arr));
573         for (i = 0; i < ARRAY_SIZE(wr_arr); i++) {
574                 wr = &wr_arr[i];
575                 wr->wr_cqe  = cqe;
576                 if (i)
577                         /* Chain backwards */
578                         wr->next = &wr_arr[i - 1];
579         }
580
581         return ib_post_recv(con->qp, wr, NULL);
582 }
583
584 static void rtrs_clt_rdma_done(struct ib_cq *cq, struct ib_wc *wc)
585 {
586         struct rtrs_clt_con *con = cq->cq_context;
587         struct rtrs_clt_sess *sess = to_clt_sess(con->c.sess);
588         u32 imm_type, imm_payload;
589         bool w_inval = false;
590         int err;
591
592         if (unlikely(wc->status != IB_WC_SUCCESS)) {
593                 if (wc->status != IB_WC_WR_FLUSH_ERR) {
594                         rtrs_err(sess->clt, "RDMA failed: %s\n",
595                                   ib_wc_status_msg(wc->status));
596                         rtrs_rdma_error_recovery(con);
597                 }
598                 return;
599         }
600         rtrs_clt_update_wc_stats(con);
601
602         switch (wc->opcode) {
603         case IB_WC_RECV_RDMA_WITH_IMM:
604                 /*
605                  * post_recv() RDMA write completions of IO reqs (read/write)
606                  * and hb
607                  */
608                 if (WARN_ON(wc->wr_cqe->done != rtrs_clt_rdma_done))
609                         return;
610                 rtrs_from_imm(be32_to_cpu(wc->ex.imm_data),
611                                &imm_type, &imm_payload);
612                 if (likely(imm_type == RTRS_IO_RSP_IMM ||
613                            imm_type == RTRS_IO_RSP_W_INV_IMM)) {
614                         u32 msg_id;
615
616                         w_inval = (imm_type == RTRS_IO_RSP_W_INV_IMM);
617                         rtrs_from_io_rsp_imm(imm_payload, &msg_id, &err);
618
619                         process_io_rsp(sess, msg_id, err, w_inval);
620                 } else if (imm_type == RTRS_HB_MSG_IMM) {
621                         WARN_ON(con->c.cid);
622                         rtrs_send_hb_ack(&sess->s);
623                         if (sess->flags == RTRS_MSG_NEW_RKEY_F)
624                                 return  rtrs_clt_recv_done(con, wc);
625                 } else if (imm_type == RTRS_HB_ACK_IMM) {
626                         WARN_ON(con->c.cid);
627                         sess->s.hb_missed_cnt = 0;
628                         if (sess->flags == RTRS_MSG_NEW_RKEY_F)
629                                 return  rtrs_clt_recv_done(con, wc);
630                 } else {
631                         rtrs_wrn(con->c.sess, "Unknown IMM type %u\n",
632                                   imm_type);
633                 }
634                 if (w_inval)
635                         /*
636                          * Post x2 empty WRs: first is for this RDMA with IMM,
637                          * second is for RECV with INV, which happened earlier.
638                          */
639                         err = rtrs_post_recv_empty_x2(&con->c, &io_comp_cqe);
640                 else
641                         err = rtrs_post_recv_empty(&con->c, &io_comp_cqe);
642                 if (unlikely(err)) {
643                         rtrs_err(con->c.sess, "rtrs_post_recv_empty(): %d\n",
644                                   err);
645                         rtrs_rdma_error_recovery(con);
646                         break;
647                 }
648                 break;
649         case IB_WC_RECV:
650                 /*
651                  * Key invalidations from server side
652                  */
653                 WARN_ON(!(wc->wc_flags & IB_WC_WITH_INVALIDATE ||
654                           wc->wc_flags & IB_WC_WITH_IMM));
655                 WARN_ON(wc->wr_cqe->done != rtrs_clt_rdma_done);
656                 if (sess->flags == RTRS_MSG_NEW_RKEY_F) {
657                         if (wc->wc_flags & IB_WC_WITH_INVALIDATE)
658                                 return  rtrs_clt_recv_done(con, wc);
659
660                         return  rtrs_clt_rkey_rsp_done(con, wc);
661                 }
662                 break;
663         case IB_WC_RDMA_WRITE:
664                 /*
665                  * post_send() RDMA write completions of IO reqs (read/write)
666                  * and hb
667                  */
668                 break;
669
670         default:
671                 rtrs_wrn(sess->clt, "Unexpected WC type: %d\n", wc->opcode);
672                 return;
673         }
674 }
675
676 static int post_recv_io(struct rtrs_clt_con *con, size_t q_size)
677 {
678         int err, i;
679         struct rtrs_clt_sess *sess = to_clt_sess(con->c.sess);
680
681         for (i = 0; i < q_size; i++) {
682                 if (sess->flags == RTRS_MSG_NEW_RKEY_F) {
683                         struct rtrs_iu *iu = &con->rsp_ius[i];
684
685                         err = rtrs_iu_post_recv(&con->c, iu);
686                 } else {
687                         err = rtrs_post_recv_empty(&con->c, &io_comp_cqe);
688                 }
689                 if (unlikely(err))
690                         return err;
691         }
692
693         return 0;
694 }
695
696 static int post_recv_sess(struct rtrs_clt_sess *sess)
697 {
698         size_t q_size = 0;
699         int err, cid;
700
701         for (cid = 0; cid < sess->s.con_num; cid++) {
702                 if (cid == 0)
703                         q_size = SERVICE_CON_QUEUE_DEPTH;
704                 else
705                         q_size = sess->queue_depth;
706
707                 /*
708                  * x2 for RDMA read responses + FR key invalidations,
709                  * RDMA writes do not require any FR registrations.
710                  */
711                 q_size *= 2;
712
713                 err = post_recv_io(to_clt_con(sess->s.con[cid]), q_size);
714                 if (unlikely(err)) {
715                         rtrs_err(sess->clt, "post_recv_io(), err: %d\n", err);
716                         return err;
717                 }
718         }
719
720         return 0;
721 }
722
723 struct path_it {
724         int i;
725         struct list_head skip_list;
726         struct rtrs_clt *clt;
727         struct rtrs_clt_sess *(*next_path)(struct path_it *it);
728 };
729
/*
 * do_each_path() / while_each_path() - iterate over the paths of a client.
 *
 * The two macros must be used as a pair: do_each_path() opens a scope,
 * initializes @it, takes rcu_read_lock() and starts a for loop whose body
 * follows the macro; while_each_path() deinitializes @it, drops the RCU
 * read lock and closes the scope.  The loop ends when the policy callback
 * returns NULL or after clt->paths_num paths were handed out.
 */
#define do_each_path(path, clt, it) {					\
	path_it_init(it, clt);						\
	rcu_read_lock();						\
	for ((it)->i = 0; ((path) = ((it)->next_path)(it)) &&		\
			  (it)->i < (it)->clt->paths_num;		\
	     (it)->i++)

#define while_each_path(it)						\
	path_it_deinit(it);						\
	rcu_read_unlock();						\
	}
741
/**
 * list_next_or_null_rr_rcu - get next list element in round-robin fashion.
 * @head:	the head for the list.
 * @ptr:	the list head to take the next element from.
 * @type:	the type of the struct this is embedded in.
 * @memb:	the name of the list_head within the struct.
 *
 * Returns the element after @ptr.  If that lookup yields NULL, the lookup
 * is retried starting from @ptr->next, so that @head is skipped and the
 * walk wraps around.  NULL is returned if the second lookup yields NULL
 * as well, i.e. the list is observed as empty.
 *
 * This primitive may safely run concurrently with the _rcu list-mutation
 * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
 */
#define list_next_or_null_rr_rcu(head, ptr, type, memb) \
({ \
	list_next_or_null_rcu(head, ptr, type, memb) ?: \
		list_next_or_null_rcu(head, READ_ONCE((ptr)->next), \
				      type, memb); \
})
761
762 /**
763  * get_next_path_rr() - Returns path in round-robin fashion.
764  * @it: the path pointer
765  *
766  * Related to @MP_POLICY_RR
767  *
768  * Locks:
769  *    rcu_read_lock() must be hold.
770  */
771 static struct rtrs_clt_sess *get_next_path_rr(struct path_it *it)
772 {
773         struct rtrs_clt_sess __rcu **ppcpu_path;
774         struct rtrs_clt_sess *path;
775         struct rtrs_clt *clt;
776
777         clt = it->clt;
778
779         /*
780          * Here we use two RCU objects: @paths_list and @pcpu_path
781          * pointer.  See rtrs_clt_remove_path_from_arr() for details
782          * how that is handled.
783          */
784
785         ppcpu_path = this_cpu_ptr(clt->pcpu_path);
786         path = rcu_dereference(*ppcpu_path);
787         if (unlikely(!path))
788                 path = list_first_or_null_rcu(&clt->paths_list,
789                                               typeof(*path), s.entry);
790         else
791                 path = list_next_or_null_rr_rcu(&clt->paths_list,
792                                                 &path->s.entry,
793                                                 typeof(*path),
794                                                 s.entry);
795         rcu_assign_pointer(*ppcpu_path, path);
796
797         return path;
798 }
799
800 /**
801  * get_next_path_min_inflight() - Returns path with minimal inflight count.
802  * @it: the path pointer
803  *
804  * Related to @MP_POLICY_MIN_INFLIGHT
805  *
806  * Locks:
807  *    rcu_read_lock() must be hold.
808  */
809 static struct rtrs_clt_sess *get_next_path_min_inflight(struct path_it *it)
810 {
811         struct rtrs_clt_sess *min_path = NULL;
812         struct rtrs_clt *clt = it->clt;
813         struct rtrs_clt_sess *sess;
814         int min_inflight = INT_MAX;
815         int inflight;
816
817         list_for_each_entry_rcu(sess, &clt->paths_list, s.entry) {
818                 if (unlikely(!list_empty(raw_cpu_ptr(sess->mp_skip_entry))))
819                         continue;
820
821                 inflight = atomic_read(&sess->stats->inflight);
822
823                 if (inflight < min_inflight) {
824                         min_inflight = inflight;
825                         min_path = sess;
826                 }
827         }
828
829         /*
830          * add the path to the skip list, so that next time we can get
831          * a different one
832          */
833         if (min_path)
834                 list_add(raw_cpu_ptr(min_path->mp_skip_entry), &it->skip_list);
835
836         return min_path;
837 }
838
839 static inline void path_it_init(struct path_it *it, struct rtrs_clt *clt)
840 {
841         INIT_LIST_HEAD(&it->skip_list);
842         it->clt = clt;
843         it->i = 0;
844
845         if (clt->mp_policy == MP_POLICY_RR)
846                 it->next_path = get_next_path_rr;
847         else
848                 it->next_path = get_next_path_min_inflight;
849 }
850
851 static inline void path_it_deinit(struct path_it *it)
852 {
853         struct list_head *skip, *tmp;
854         /*
855          * The skip_list is used only for the MIN_INFLIGHT policy.
856          * We need to remove paths from it, so that next IO can insert
857          * paths (->mp_skip_entry) into a skip_list again.
858          */
859         list_for_each_safe(skip, tmp, &it->skip_list)
860                 list_del_init(skip);
861 }
862
863 /**
864  * rtrs_clt_init_req() Initialize an rtrs_clt_io_req holding information
865  * about an inflight IO.
866  * The user buffer holding user control message (not data) is copied into
867  * the corresponding buffer of rtrs_iu (req->iu->buf), which later on will
868  * also hold the control message of rtrs.
869  * @req: an io request holding information about IO.
870  * @sess: client session
871  * @conf: conformation callback function to notify upper layer.
872  * @permit: permit for allocation of RDMA remote buffer
873  * @priv: private pointer
874  * @vec: kernel vector containing control message
875  * @usr_len: length of the user message
876  * @sg: scater list for IO data
877  * @sg_cnt: number of scater list entries
878  * @data_len: length of the IO data
879  * @dir: direction of the IO.
880  */
881 static void rtrs_clt_init_req(struct rtrs_clt_io_req *req,
882                               struct rtrs_clt_sess *sess,
883                               void (*conf)(void *priv, int errno),
884                               struct rtrs_permit *permit, void *priv,
885                               const struct kvec *vec, size_t usr_len,
886                               struct scatterlist *sg, size_t sg_cnt,
887                               size_t data_len, int dir)
888 {
889         struct iov_iter iter;
890         size_t len;
891
892         req->permit = permit;
893         req->in_use = true;
894         req->usr_len = usr_len;
895         req->data_len = data_len;
896         req->sglist = sg;
897         req->sg_cnt = sg_cnt;
898         req->priv = priv;
899         req->dir = dir;
900         req->con = rtrs_permit_to_clt_con(sess, permit);
901         req->conf = conf;
902         req->need_inv = false;
903         req->need_inv_comp = false;
904         req->inv_errno = 0;
905
906         iov_iter_kvec(&iter, READ, vec, 1, usr_len);
907         len = _copy_from_iter(req->iu->buf, usr_len, &iter);
908         WARN_ON(len != usr_len);
909
910         reinit_completion(&req->inv_comp);
911 }
912
913 static struct rtrs_clt_io_req *
914 rtrs_clt_get_req(struct rtrs_clt_sess *sess,
915                  void (*conf)(void *priv, int errno),
916                  struct rtrs_permit *permit, void *priv,
917                  const struct kvec *vec, size_t usr_len,
918                  struct scatterlist *sg, size_t sg_cnt,
919                  size_t data_len, int dir)
920 {
921         struct rtrs_clt_io_req *req;
922
923         req = &sess->reqs[permit->mem_id];
924         rtrs_clt_init_req(req, sess, conf, permit, priv, vec, usr_len,
925                            sg, sg_cnt, data_len, dir);
926         return req;
927 }
928
929 static struct rtrs_clt_io_req *
930 rtrs_clt_get_copy_req(struct rtrs_clt_sess *alive_sess,
931                        struct rtrs_clt_io_req *fail_req)
932 {
933         struct rtrs_clt_io_req *req;
934         struct kvec vec = {
935                 .iov_base = fail_req->iu->buf,
936                 .iov_len  = fail_req->usr_len
937         };
938
939         req = &alive_sess->reqs[fail_req->permit->mem_id];
940         rtrs_clt_init_req(req, alive_sess, fail_req->conf, fail_req->permit,
941                            fail_req->priv, &vec, fail_req->usr_len,
942                            fail_req->sglist, fail_req->sg_cnt,
943                            fail_req->data_len, fail_req->dir);
944         return req;
945 }
946
947 static int rtrs_post_rdma_write_sg(struct rtrs_clt_con *con,
948                                     struct rtrs_clt_io_req *req,
949                                     struct rtrs_rbuf *rbuf,
950                                     u32 size, u32 imm)
951 {
952         struct rtrs_clt_sess *sess = to_clt_sess(con->c.sess);
953         struct ib_sge *sge = req->sge;
954         enum ib_send_flags flags;
955         struct scatterlist *sg;
956         size_t num_sge;
957         int i;
958
959         for_each_sg(req->sglist, sg, req->sg_cnt, i) {
960                 sge[i].addr   = sg_dma_address(sg);
961                 sge[i].length = sg_dma_len(sg);
962                 sge[i].lkey   = sess->s.dev->ib_pd->local_dma_lkey;
963         }
964         sge[i].addr   = req->iu->dma_addr;
965         sge[i].length = size;
966         sge[i].lkey   = sess->s.dev->ib_pd->local_dma_lkey;
967
968         num_sge = 1 + req->sg_cnt;
969
970         /*
971          * From time to time we have to post signalled sends,
972          * or send queue will fill up and only QP reset can help.
973          */
974         flags = atomic_inc_return(&con->io_cnt) % sess->queue_depth ?
975                         0 : IB_SEND_SIGNALED;
976
977         ib_dma_sync_single_for_device(sess->s.dev->ib_dev, req->iu->dma_addr,
978                                       size, DMA_TO_DEVICE);
979
980         return rtrs_iu_post_rdma_write_imm(&con->c, req->iu, sge, num_sge,
981                                             rbuf->rkey, rbuf->addr, imm,
982                                             flags, NULL);
983 }
984
985 static int rtrs_clt_write_req(struct rtrs_clt_io_req *req)
986 {
987         struct rtrs_clt_con *con = req->con;
988         struct rtrs_sess *s = con->c.sess;
989         struct rtrs_clt_sess *sess = to_clt_sess(s);
990         struct rtrs_msg_rdma_write *msg;
991
992         struct rtrs_rbuf *rbuf;
993         int ret, count = 0;
994         u32 imm, buf_id;
995
996         const size_t tsize = sizeof(*msg) + req->data_len + req->usr_len;
997
998         if (unlikely(tsize > sess->chunk_size)) {
999                 rtrs_wrn(s, "Write request failed, size too big %zu > %d\n",
1000                           tsize, sess->chunk_size);
1001                 return -EMSGSIZE;
1002         }
1003         if (req->sg_cnt) {
1004                 count = ib_dma_map_sg(sess->s.dev->ib_dev, req->sglist,
1005                                       req->sg_cnt, req->dir);
1006                 if (unlikely(!count)) {
1007                         rtrs_wrn(s, "Write request failed, map failed\n");
1008                         return -EINVAL;
1009                 }
1010         }
1011         /* put rtrs msg after sg and user message */
1012         msg = req->iu->buf + req->usr_len;
1013         msg->type = cpu_to_le16(RTRS_MSG_WRITE);
1014         msg->usr_len = cpu_to_le16(req->usr_len);
1015
1016         /* rtrs message on server side will be after user data and message */
1017         imm = req->permit->mem_off + req->data_len + req->usr_len;
1018         imm = rtrs_to_io_req_imm(imm);
1019         buf_id = req->permit->mem_id;
1020         req->sg_size = tsize;
1021         rbuf = &sess->rbufs[buf_id];
1022
1023         /*
1024          * Update stats now, after request is successfully sent it is not
1025          * safe anymore to touch it.
1026          */
1027         rtrs_clt_update_all_stats(req, WRITE);
1028
1029         ret = rtrs_post_rdma_write_sg(req->con, req, rbuf,
1030                                        req->usr_len + sizeof(*msg),
1031                                        imm);
1032         if (unlikely(ret)) {
1033                 rtrs_err(s, "Write request failed: %d\n", ret);
1034                 if (sess->clt->mp_policy == MP_POLICY_MIN_INFLIGHT)
1035                         atomic_dec(&sess->stats->inflight);
1036                 if (req->sg_cnt)
1037                         ib_dma_unmap_sg(sess->s.dev->ib_dev, req->sglist,
1038                                         req->sg_cnt, req->dir);
1039         }
1040
1041         return ret;
1042 }
1043
1044 static int rtrs_map_sg_fr(struct rtrs_clt_io_req *req, size_t count)
1045 {
1046         int nr;
1047
1048         /* Align the MR to a 4K page size to match the block virt boundary */
1049         nr = ib_map_mr_sg(req->mr, req->sglist, count, NULL, SZ_4K);
1050         if (unlikely(nr < req->sg_cnt)) {
1051                 if (nr < 0)
1052                         return nr;
1053                 return -EINVAL;
1054         }
1055         ib_update_fast_reg_key(req->mr, ib_inc_rkey(req->mr->rkey));
1056
1057         return nr;
1058 }
1059
1060 static int rtrs_clt_read_req(struct rtrs_clt_io_req *req)
1061 {
1062         struct rtrs_clt_con *con = req->con;
1063         struct rtrs_sess *s = con->c.sess;
1064         struct rtrs_clt_sess *sess = to_clt_sess(s);
1065         struct rtrs_msg_rdma_read *msg;
1066         struct rtrs_ib_dev *dev;
1067
1068         struct ib_reg_wr rwr;
1069         struct ib_send_wr *wr = NULL;
1070
1071         int ret, count = 0;
1072         u32 imm, buf_id;
1073
1074         const size_t tsize = sizeof(*msg) + req->data_len + req->usr_len;
1075
1076         s = &sess->s;
1077         dev = sess->s.dev;
1078
1079         if (unlikely(tsize > sess->chunk_size)) {
1080                 rtrs_wrn(s,
1081                           "Read request failed, message size is %zu, bigger than CHUNK_SIZE %d\n",
1082                           tsize, sess->chunk_size);
1083                 return -EMSGSIZE;
1084         }
1085
1086         if (req->sg_cnt) {
1087                 count = ib_dma_map_sg(dev->ib_dev, req->sglist, req->sg_cnt,
1088                                       req->dir);
1089                 if (unlikely(!count)) {
1090                         rtrs_wrn(s,
1091                                   "Read request failed, dma map failed\n");
1092                         return -EINVAL;
1093                 }
1094         }
1095         /* put our message into req->buf after user message*/
1096         msg = req->iu->buf + req->usr_len;
1097         msg->type = cpu_to_le16(RTRS_MSG_READ);
1098         msg->usr_len = cpu_to_le16(req->usr_len);
1099
1100         if (count) {
1101                 ret = rtrs_map_sg_fr(req, count);
1102                 if (ret < 0) {
1103                         rtrs_err_rl(s,
1104                                      "Read request failed, failed to map  fast reg. data, err: %d\n",
1105                                      ret);
1106                         ib_dma_unmap_sg(dev->ib_dev, req->sglist, req->sg_cnt,
1107                                         req->dir);
1108                         return ret;
1109                 }
1110                 rwr = (struct ib_reg_wr) {
1111                         .wr.opcode = IB_WR_REG_MR,
1112                         .wr.wr_cqe = &fast_reg_cqe,
1113                         .mr = req->mr,
1114                         .key = req->mr->rkey,
1115                         .access = (IB_ACCESS_LOCAL_WRITE |
1116                                    IB_ACCESS_REMOTE_WRITE),
1117                 };
1118                 wr = &rwr.wr;
1119
1120                 msg->sg_cnt = cpu_to_le16(1);
1121                 msg->flags = cpu_to_le16(RTRS_MSG_NEED_INVAL_F);
1122
1123                 msg->desc[0].addr = cpu_to_le64(req->mr->iova);
1124                 msg->desc[0].key = cpu_to_le32(req->mr->rkey);
1125                 msg->desc[0].len = cpu_to_le32(req->mr->length);
1126
1127                 /* Further invalidation is required */
1128                 req->need_inv = !!RTRS_MSG_NEED_INVAL_F;
1129
1130         } else {
1131                 msg->sg_cnt = 0;
1132                 msg->flags = 0;
1133         }
1134         /*
1135          * rtrs message will be after the space reserved for disk data and
1136          * user message
1137          */
1138         imm = req->permit->mem_off + req->data_len + req->usr_len;
1139         imm = rtrs_to_io_req_imm(imm);
1140         buf_id = req->permit->mem_id;
1141
1142         req->sg_size  = sizeof(*msg);
1143         req->sg_size += le16_to_cpu(msg->sg_cnt) * sizeof(struct rtrs_sg_desc);
1144         req->sg_size += req->usr_len;
1145
1146         /*
1147          * Update stats now, after request is successfully sent it is not
1148          * safe anymore to touch it.
1149          */
1150         rtrs_clt_update_all_stats(req, READ);
1151
1152         ret = rtrs_post_send_rdma(req->con, req, &sess->rbufs[buf_id],
1153                                    req->data_len, imm, wr);
1154         if (unlikely(ret)) {
1155                 rtrs_err(s, "Read request failed: %d\n", ret);
1156                 if (sess->clt->mp_policy == MP_POLICY_MIN_INFLIGHT)
1157                         atomic_dec(&sess->stats->inflight);
1158                 req->need_inv = false;
1159                 if (req->sg_cnt)
1160                         ib_dma_unmap_sg(dev->ib_dev, req->sglist,
1161                                         req->sg_cnt, req->dir);
1162         }
1163
1164         return ret;
1165 }
1166
1167 /**
1168  * rtrs_clt_failover_req() Try to find an active path for a failed request
1169  * @clt: clt context
1170  * @fail_req: a failed io request.
1171  */
1172 static int rtrs_clt_failover_req(struct rtrs_clt *clt,
1173                                  struct rtrs_clt_io_req *fail_req)
1174 {
1175         struct rtrs_clt_sess *alive_sess;
1176         struct rtrs_clt_io_req *req;
1177         int err = -ECONNABORTED;
1178         struct path_it it;
1179
1180         do_each_path(alive_sess, clt, &it) {
1181                 if (unlikely(READ_ONCE(alive_sess->state) !=
1182                              RTRS_CLT_CONNECTED))
1183                         continue;
1184                 req = rtrs_clt_get_copy_req(alive_sess, fail_req);
1185                 if (req->dir == DMA_TO_DEVICE)
1186                         err = rtrs_clt_write_req(req);
1187                 else
1188                         err = rtrs_clt_read_req(req);
1189                 if (unlikely(err)) {
1190                         req->in_use = false;
1191                         continue;
1192                 }
1193                 /* Success path */
1194                 rtrs_clt_inc_failover_cnt(alive_sess->stats);
1195                 break;
1196         } while_each_path(&it);
1197
1198         return err;
1199 }
1200
1201 static void fail_all_outstanding_reqs(struct rtrs_clt_sess *sess)
1202 {
1203         struct rtrs_clt *clt = sess->clt;
1204         struct rtrs_clt_io_req *req;
1205         int i, err;
1206
1207         if (!sess->reqs)
1208                 return;
1209         for (i = 0; i < sess->queue_depth; ++i) {
1210                 req = &sess->reqs[i];
1211                 if (!req->in_use)
1212                         continue;
1213
1214                 /*
1215                  * Safely (without notification) complete failed request.
1216                  * After completion this request is still useble and can
1217                  * be failovered to another path.
1218                  */
1219                 complete_rdma_req(req, -ECONNABORTED, false, true);
1220
1221                 err = rtrs_clt_failover_req(clt, req);
1222                 if (unlikely(err))
1223                         /* Failover failed, notify anyway */
1224                         req->conf(req->priv, err);
1225         }
1226 }
1227
1228 static void free_sess_reqs(struct rtrs_clt_sess *sess)
1229 {
1230         struct rtrs_clt_io_req *req;
1231         int i;
1232
1233         if (!sess->reqs)
1234                 return;
1235         for (i = 0; i < sess->queue_depth; ++i) {
1236                 req = &sess->reqs[i];
1237                 if (req->mr)
1238                         ib_dereg_mr(req->mr);
1239                 kfree(req->sge);
1240                 rtrs_iu_free(req->iu, DMA_TO_DEVICE,
1241                               sess->s.dev->ib_dev, 1);
1242         }
1243         kfree(sess->reqs);
1244         sess->reqs = NULL;
1245 }
1246
1247 static int alloc_sess_reqs(struct rtrs_clt_sess *sess)
1248 {
1249         struct rtrs_clt_io_req *req;
1250         struct rtrs_clt *clt = sess->clt;
1251         int i, err = -ENOMEM;
1252
1253         sess->reqs = kcalloc(sess->queue_depth, sizeof(*sess->reqs),
1254                              GFP_KERNEL);
1255         if (!sess->reqs)
1256                 return -ENOMEM;
1257
1258         for (i = 0; i < sess->queue_depth; ++i) {
1259                 req = &sess->reqs[i];
1260                 req->iu = rtrs_iu_alloc(1, sess->max_hdr_size, GFP_KERNEL,
1261                                          sess->s.dev->ib_dev,
1262                                          DMA_TO_DEVICE,
1263                                          rtrs_clt_rdma_done);
1264                 if (!req->iu)
1265                         goto out;
1266
1267                 req->sge = kmalloc_array(clt->max_segments + 1,
1268                                          sizeof(*req->sge), GFP_KERNEL);
1269                 if (!req->sge)
1270                         goto out;
1271
1272                 req->mr = ib_alloc_mr(sess->s.dev->ib_pd, IB_MR_TYPE_MEM_REG,
1273                                       sess->max_pages_per_mr);
1274                 if (IS_ERR(req->mr)) {
1275                         err = PTR_ERR(req->mr);
1276                         req->mr = NULL;
1277                         pr_err("Failed to alloc sess->max_pages_per_mr %d\n",
1278                                sess->max_pages_per_mr);
1279                         goto out;
1280                 }
1281
1282                 init_completion(&req->inv_comp);
1283         }
1284
1285         return 0;
1286
1287 out:
1288         free_sess_reqs(sess);
1289
1290         return err;
1291 }
1292
1293 static int alloc_permits(struct rtrs_clt *clt)
1294 {
1295         unsigned int chunk_bits;
1296         int err, i;
1297
1298         clt->permits_map = kcalloc(BITS_TO_LONGS(clt->queue_depth),
1299                                    sizeof(long), GFP_KERNEL);
1300         if (!clt->permits_map) {
1301                 err = -ENOMEM;
1302                 goto out_err;
1303         }
1304         clt->permits = kcalloc(clt->queue_depth, permit_size(clt), GFP_KERNEL);
1305         if (!clt->permits) {
1306                 err = -ENOMEM;
1307                 goto err_map;
1308         }
1309         chunk_bits = ilog2(clt->queue_depth - 1) + 1;
1310         for (i = 0; i < clt->queue_depth; i++) {
1311                 struct rtrs_permit *permit;
1312
1313                 permit = get_permit(clt, i);
1314                 permit->mem_id = i;
1315                 permit->mem_off = i << (MAX_IMM_PAYL_BITS - chunk_bits);
1316         }
1317
1318         return 0;
1319
1320 err_map:
1321         kfree(clt->permits_map);
1322         clt->permits_map = NULL;
1323 out_err:
1324         return err;
1325 }
1326
1327 static void free_permits(struct rtrs_clt *clt)
1328 {
1329         kfree(clt->permits_map);
1330         clt->permits_map = NULL;
1331         kfree(clt->permits);
1332         clt->permits = NULL;
1333 }
1334
1335 static void query_fast_reg_mode(struct rtrs_clt_sess *sess)
1336 {
1337         struct ib_device *ib_dev;
1338         u64 max_pages_per_mr;
1339         int mr_page_shift;
1340
1341         ib_dev = sess->s.dev->ib_dev;
1342
1343         /*
1344          * Use the smallest page size supported by the HCA, down to a
1345          * minimum of 4096 bytes. We're unlikely to build large sglists
1346          * out of smaller entries.
1347          */
1348         mr_page_shift      = max(12, ffs(ib_dev->attrs.page_size_cap) - 1);
1349         max_pages_per_mr   = ib_dev->attrs.max_mr_size;
1350         do_div(max_pages_per_mr, (1ull << mr_page_shift));
1351         sess->max_pages_per_mr =
1352                 min3(sess->max_pages_per_mr, (u32)max_pages_per_mr,
1353                      ib_dev->attrs.max_fast_reg_page_list_len);
1354         sess->max_send_sge = ib_dev->attrs.max_send_sge;
1355 }
1356
1357 static bool rtrs_clt_change_state_get_old(struct rtrs_clt_sess *sess,
1358                                            enum rtrs_clt_state new_state,
1359                                            enum rtrs_clt_state *old_state)
1360 {
1361         bool changed;
1362
1363         spin_lock_irq(&sess->state_wq.lock);
1364         *old_state = sess->state;
1365         changed = __rtrs_clt_change_state(sess, new_state);
1366         spin_unlock_irq(&sess->state_wq.lock);
1367
1368         return changed;
1369 }
1370
1371 static bool rtrs_clt_change_state(struct rtrs_clt_sess *sess,
1372                                    enum rtrs_clt_state new_state)
1373 {
1374         enum rtrs_clt_state old_state;
1375
1376         return rtrs_clt_change_state_get_old(sess, new_state, &old_state);
1377 }
1378
1379 static void rtrs_clt_hb_err_handler(struct rtrs_con *c)
1380 {
1381         struct rtrs_clt_con *con = container_of(c, typeof(*con), c);
1382
1383         rtrs_rdma_error_recovery(con);
1384 }
1385
1386 static void rtrs_clt_init_hb(struct rtrs_clt_sess *sess)
1387 {
1388         rtrs_init_hb(&sess->s, &io_comp_cqe,
1389                       RTRS_HB_INTERVAL_MS,
1390                       RTRS_HB_MISSED_MAX,
1391                       rtrs_clt_hb_err_handler,
1392                       rtrs_wq);
1393 }
1394
1395 static void rtrs_clt_start_hb(struct rtrs_clt_sess *sess)
1396 {
1397         rtrs_start_hb(&sess->s);
1398 }
1399
1400 static void rtrs_clt_stop_hb(struct rtrs_clt_sess *sess)
1401 {
1402         rtrs_stop_hb(&sess->s);
1403 }
1404
1405 static void rtrs_clt_reconnect_work(struct work_struct *work);
1406 static void rtrs_clt_close_work(struct work_struct *work);
1407
1408 static struct rtrs_clt_sess *alloc_sess(struct rtrs_clt *clt,
1409                                          const struct rtrs_addr *path,
1410                                          size_t con_num, u16 max_segments)
1411 {
1412         struct rtrs_clt_sess *sess;
1413         int err = -ENOMEM;
1414         int cpu;
1415
1416         sess = kzalloc(sizeof(*sess), GFP_KERNEL);
1417         if (!sess)
1418                 goto err;
1419
1420         /* Extra connection for user messages */
1421         con_num += 1;
1422
1423         sess->s.con = kcalloc(con_num, sizeof(*sess->s.con), GFP_KERNEL);
1424         if (!sess->s.con)
1425                 goto err_free_sess;
1426
1427         sess->stats = kzalloc(sizeof(*sess->stats), GFP_KERNEL);
1428         if (!sess->stats)
1429                 goto err_free_con;
1430
1431         mutex_init(&sess->init_mutex);
1432         uuid_gen(&sess->s.uuid);
1433         memcpy(&sess->s.dst_addr, path->dst,
1434                rdma_addr_size((struct sockaddr *)path->dst));
1435
1436         /*
1437          * rdma_resolve_addr() passes src_addr to cma_bind_addr, which
1438          * checks the sa_family to be non-zero. If user passed src_addr=NULL
1439          * the sess->src_addr will contain only zeros, which is then fine.
1440          */
1441         if (path->src)
1442                 memcpy(&sess->s.src_addr, path->src,
1443                        rdma_addr_size((struct sockaddr *)path->src));
1444         strlcpy(sess->s.sessname, clt->sessname, sizeof(sess->s.sessname));
1445         sess->s.con_num = con_num;
1446         sess->clt = clt;
1447         sess->max_pages_per_mr = max_segments * BLK_MAX_SEGMENT_SIZE >> 12;
1448         init_waitqueue_head(&sess->state_wq);
1449         sess->state = RTRS_CLT_CONNECTING;
1450         atomic_set(&sess->connected_cnt, 0);
1451         INIT_WORK(&sess->close_work, rtrs_clt_close_work);
1452         INIT_DELAYED_WORK(&sess->reconnect_dwork, rtrs_clt_reconnect_work);
1453         rtrs_clt_init_hb(sess);
1454
1455         sess->mp_skip_entry = alloc_percpu(typeof(*sess->mp_skip_entry));
1456         if (!sess->mp_skip_entry)
1457                 goto err_free_stats;
1458
1459         for_each_possible_cpu(cpu)
1460                 INIT_LIST_HEAD(per_cpu_ptr(sess->mp_skip_entry, cpu));
1461
1462         err = rtrs_clt_init_stats(sess->stats);
1463         if (err)
1464                 goto err_free_percpu;
1465
1466         return sess;
1467
1468 err_free_percpu:
1469         free_percpu(sess->mp_skip_entry);
1470 err_free_stats:
1471         kfree(sess->stats);
1472 err_free_con:
1473         kfree(sess->s.con);
1474 err_free_sess:
1475         kfree(sess);
1476 err:
1477         return ERR_PTR(err);
1478 }
1479
1480 void free_sess(struct rtrs_clt_sess *sess)
1481 {
1482         free_percpu(sess->mp_skip_entry);
1483         mutex_destroy(&sess->init_mutex);
1484         kfree(sess->s.con);
1485         kfree(sess->rbufs);
1486         kfree(sess);
1487 }
1488
1489 static int create_con(struct rtrs_clt_sess *sess, unsigned int cid)
1490 {
1491         struct rtrs_clt_con *con;
1492
1493         con = kzalloc(sizeof(*con), GFP_KERNEL);
1494         if (!con)
1495                 return -ENOMEM;
1496
1497         /* Map first two connections to the first CPU */
1498         con->cpu  = (cid ? cid - 1 : 0) % nr_cpu_ids;
1499         con->c.cid = cid;
1500         con->c.sess = &sess->s;
1501         atomic_set(&con->io_cnt, 0);
1502
1503         sess->s.con[cid] = &con->c;
1504
1505         return 0;
1506 }
1507
1508 static void destroy_con(struct rtrs_clt_con *con)
1509 {
1510         struct rtrs_clt_sess *sess = to_clt_sess(con->c.sess);
1511
1512         sess->s.con[con->c.cid] = NULL;
1513         kfree(con);
1514 }
1515
1516 static int create_con_cq_qp(struct rtrs_clt_con *con)
1517 {
1518         struct rtrs_clt_sess *sess = to_clt_sess(con->c.sess);
1519         u16 wr_queue_size;
1520         int err, cq_vector;
1521         struct rtrs_msg_rkey_rsp *rsp;
1522
1523         /*
1524          * This function can fail, but still destroy_con_cq_qp() should
1525          * be called, this is because create_con_cq_qp() is called on cm
1526          * event path, thus caller/waiter never knows: have we failed before
1527          * create_con_cq_qp() or after.  To solve this dilemma without
1528          * creating any additional flags just allow destroy_con_cq_qp() be
1529          * called many times.
1530          */
1531
1532         if (con->c.cid == 0) {
1533                 /*
1534                  * One completion for each receive and two for each send
1535                  * (send request + registration)
1536                  * + 2 for drain and heartbeat
1537                  * in case qp gets into error state
1538                  */
1539                 wr_queue_size = SERVICE_CON_QUEUE_DEPTH * 3 + 2;
1540                 /* We must be the first here */
1541                 if (WARN_ON(sess->s.dev))
1542                         return -EINVAL;
1543
1544                 /*
1545                  * The whole session uses device from user connection.
1546                  * Be careful not to close user connection before ib dev
1547                  * is gracefully put.
1548                  */
1549                 sess->s.dev = rtrs_ib_dev_find_or_add(con->c.cm_id->device,
1550                                                        &dev_pd);
1551                 if (!sess->s.dev) {
1552                         rtrs_wrn(sess->clt,
1553                                   "rtrs_ib_dev_find_get_or_add(): no memory\n");
1554                         return -ENOMEM;
1555                 }
1556                 sess->s.dev_ref = 1;
1557                 query_fast_reg_mode(sess);
1558         } else {
1559                 /*
1560                  * Here we assume that session members are correctly set.
1561                  * This is always true if user connection (cid == 0) is
1562                  * established first.
1563                  */
1564                 if (WARN_ON(!sess->s.dev))
1565                         return -EINVAL;
1566                 if (WARN_ON(!sess->queue_depth))
1567                         return -EINVAL;
1568
1569                 /* Shared between connections */
1570                 sess->s.dev_ref++;
1571                 wr_queue_size =
1572                         min_t(int, sess->s.dev->ib_dev->attrs.max_qp_wr,
1573                               /* QD * (REQ + RSP + FR REGS or INVS) + drain */
1574                               sess->queue_depth * 3 + 1);
1575         }
1576         /* alloc iu to recv new rkey reply when server reports flags set */
1577         if (sess->flags == RTRS_MSG_NEW_RKEY_F || con->c.cid == 0) {
1578                 con->rsp_ius = rtrs_iu_alloc(wr_queue_size, sizeof(*rsp),
1579                                               GFP_KERNEL, sess->s.dev->ib_dev,
1580                                               DMA_FROM_DEVICE,
1581                                               rtrs_clt_rdma_done);
1582                 if (!con->rsp_ius)
1583                         return -ENOMEM;
1584                 con->queue_size = wr_queue_size;
1585         }
1586         cq_vector = con->cpu % sess->s.dev->ib_dev->num_comp_vectors;
1587         err = rtrs_cq_qp_create(&sess->s, &con->c, sess->max_send_sge,
1588                                  cq_vector, wr_queue_size, wr_queue_size,
1589                                  IB_POLL_SOFTIRQ);
1590         /*
1591          * In case of error we do not bother to clean previous allocations,
1592          * since destroy_con_cq_qp() must be called.
1593          */
1594
1595         if (err)
1596                 return err;
1597         return err;
1598 }
1599
1600 static void destroy_con_cq_qp(struct rtrs_clt_con *con)
1601 {
1602         struct rtrs_clt_sess *sess = to_clt_sess(con->c.sess);
1603
1604         /*
1605          * Be careful here: destroy_con_cq_qp() can be called even
1606          * create_con_cq_qp() failed, see comments there.
1607          */
1608
1609         rtrs_cq_qp_destroy(&con->c);
1610         if (con->rsp_ius) {
1611                 rtrs_iu_free(con->rsp_ius, DMA_FROM_DEVICE,
1612                               sess->s.dev->ib_dev, con->queue_size);
1613                 con->rsp_ius = NULL;
1614                 con->queue_size = 0;
1615         }
1616         if (sess->s.dev_ref && !--sess->s.dev_ref) {
1617                 rtrs_ib_dev_put(sess->s.dev);
1618                 sess->s.dev = NULL;
1619         }
1620 }
1621
1622 static void stop_cm(struct rtrs_clt_con *con)
1623 {
1624         rdma_disconnect(con->c.cm_id);
1625         if (con->c.qp)
1626                 ib_drain_qp(con->c.qp);
1627 }
1628
1629 static void destroy_cm(struct rtrs_clt_con *con)
1630 {
1631         rdma_destroy_id(con->c.cm_id);
1632         con->c.cm_id = NULL;
1633 }
1634
1635 static int rtrs_rdma_addr_resolved(struct rtrs_clt_con *con)
1636 {
1637         struct rtrs_sess *s = con->c.sess;
1638         int err;
1639
1640         err = create_con_cq_qp(con);
1641         if (err) {
1642                 rtrs_err(s, "create_con_cq_qp(), err: %d\n", err);
1643                 return err;
1644         }
1645         err = rdma_resolve_route(con->c.cm_id, RTRS_CONNECT_TIMEOUT_MS);
1646         if (err) {
1647                 rtrs_err(s, "Resolving route failed, err: %d\n", err);
1648                 destroy_con_cq_qp(con);
1649         }
1650
1651         return err;
1652 }
1653
1654 static int rtrs_rdma_route_resolved(struct rtrs_clt_con *con)
1655 {
1656         struct rtrs_clt_sess *sess = to_clt_sess(con->c.sess);
1657         struct rtrs_clt *clt = sess->clt;
1658         struct rtrs_msg_conn_req msg;
1659         struct rdma_conn_param param;
1660
1661         int err;
1662
1663         param = (struct rdma_conn_param) {
1664                 .retry_count = 7,
1665                 .rnr_retry_count = 7,
1666                 .private_data = &msg,
1667                 .private_data_len = sizeof(msg),
1668         };
1669
1670         msg = (struct rtrs_msg_conn_req) {
1671                 .magic = cpu_to_le16(RTRS_MAGIC),
1672                 .version = cpu_to_le16(RTRS_PROTO_VER),
1673                 .cid = cpu_to_le16(con->c.cid),
1674                 .cid_num = cpu_to_le16(sess->s.con_num),
1675                 .recon_cnt = cpu_to_le16(sess->s.recon_cnt),
1676         };
1677         uuid_copy(&msg.sess_uuid, &sess->s.uuid);
1678         uuid_copy(&msg.paths_uuid, &clt->paths_uuid);
1679
1680         err = rdma_connect(con->c.cm_id, &param);
1681         if (err)
1682                 rtrs_err(clt, "rdma_connect(): %d\n", err);
1683
1684         return err;
1685 }
1686
1687 static int rtrs_rdma_conn_established(struct rtrs_clt_con *con,
1688                                        struct rdma_cm_event *ev)
1689 {
1690         struct rtrs_clt_sess *sess = to_clt_sess(con->c.sess);
1691         struct rtrs_clt *clt = sess->clt;
1692         const struct rtrs_msg_conn_rsp *msg;
1693         u16 version, queue_depth;
1694         int errno;
1695         u8 len;
1696
1697         msg = ev->param.conn.private_data;
1698         len = ev->param.conn.private_data_len;
1699         if (len < sizeof(*msg)) {
1700                 rtrs_err(clt, "Invalid RTRS connection response\n");
1701                 return -ECONNRESET;
1702         }
1703         if (le16_to_cpu(msg->magic) != RTRS_MAGIC) {
1704                 rtrs_err(clt, "Invalid RTRS magic\n");
1705                 return -ECONNRESET;
1706         }
1707         version = le16_to_cpu(msg->version);
1708         if (version >> 8 != RTRS_PROTO_VER_MAJOR) {
1709                 rtrs_err(clt, "Unsupported major RTRS version: %d, expected %d\n",
1710                           version >> 8, RTRS_PROTO_VER_MAJOR);
1711                 return -ECONNRESET;
1712         }
1713         errno = le16_to_cpu(msg->errno);
1714         if (errno) {
1715                 rtrs_err(clt, "Invalid RTRS message: errno %d\n",
1716                           errno);
1717                 return -ECONNRESET;
1718         }
1719         if (con->c.cid == 0) {
1720                 queue_depth = le16_to_cpu(msg->queue_depth);
1721
1722                 if (queue_depth > MAX_SESS_QUEUE_DEPTH) {
1723                         rtrs_err(clt, "Invalid RTRS message: queue=%d\n",
1724                                   queue_depth);
1725                         return -ECONNRESET;
1726                 }
1727                 if (!sess->rbufs || sess->queue_depth < queue_depth) {
1728                         kfree(sess->rbufs);
1729                         sess->rbufs = kcalloc(queue_depth, sizeof(*sess->rbufs),
1730                                               GFP_KERNEL);
1731                         if (!sess->rbufs)
1732                                 return -ENOMEM;
1733                 }
1734                 sess->queue_depth = queue_depth;
1735                 sess->max_hdr_size = le32_to_cpu(msg->max_hdr_size);
1736                 sess->max_io_size = le32_to_cpu(msg->max_io_size);
1737                 sess->flags = le32_to_cpu(msg->flags);
1738                 sess->chunk_size = sess->max_io_size + sess->max_hdr_size;
1739
1740                 /*
1741                  * Global queue depth and IO size is always a minimum.
1742                  * If while a reconnection server sends us a value a bit
1743                  * higher - client does not care and uses cached minimum.
1744                  *
1745                  * Since we can have several sessions (paths) restablishing
1746                  * connections in parallel, use lock.
1747                  */
1748                 mutex_lock(&clt->paths_mutex);
1749                 clt->queue_depth = min_not_zero(sess->queue_depth,
1750                                                 clt->queue_depth);
1751                 clt->max_io_size = min_not_zero(sess->max_io_size,
1752                                                 clt->max_io_size);
1753                 mutex_unlock(&clt->paths_mutex);
1754
1755                 /*
1756                  * Cache the hca_port and hca_name for sysfs
1757                  */
1758                 sess->hca_port = con->c.cm_id->port_num;
1759                 scnprintf(sess->hca_name, sizeof(sess->hca_name),
1760                           sess->s.dev->ib_dev->name);
1761                 sess->s.src_addr = con->c.cm_id->route.addr.src_addr;
1762         }
1763
1764         return 0;
1765 }
1766
1767 static inline void flag_success_on_conn(struct rtrs_clt_con *con)
1768 {
1769         struct rtrs_clt_sess *sess = to_clt_sess(con->c.sess);
1770
1771         atomic_inc(&sess->connected_cnt);
1772         con->cm_err = 1;
1773 }
1774
1775 static int rtrs_rdma_conn_rejected(struct rtrs_clt_con *con,
1776                                     struct rdma_cm_event *ev)
1777 {
1778         struct rtrs_sess *s = con->c.sess;
1779         const struct rtrs_msg_conn_rsp *msg;
1780         const char *rej_msg;
1781         int status, errno;
1782         u8 data_len;
1783
1784         status = ev->status;
1785         rej_msg = rdma_reject_msg(con->c.cm_id, status);
1786         msg = rdma_consumer_reject_data(con->c.cm_id, ev, &data_len);
1787
1788         if (msg && data_len >= sizeof(*msg)) {
1789                 errno = (int16_t)le16_to_cpu(msg->errno);
1790                 if (errno == -EBUSY)
1791                         rtrs_err(s,
1792                                   "Previous session is still exists on the server, please reconnect later\n");
1793                 else
1794                         rtrs_err(s,
1795                                   "Connect rejected: status %d (%s), rtrs errno %d\n",
1796                                   status, rej_msg, errno);
1797         } else {
1798                 rtrs_err(s,
1799                           "Connect rejected but with malformed message: status %d (%s)\n",
1800                           status, rej_msg);
1801         }
1802
1803         return -ECONNRESET;
1804 }
1805
1806 static void rtrs_clt_close_conns(struct rtrs_clt_sess *sess, bool wait)
1807 {
1808         if (rtrs_clt_change_state(sess, RTRS_CLT_CLOSING))
1809                 queue_work(rtrs_wq, &sess->close_work);
1810         if (wait)
1811                 flush_work(&sess->close_work);
1812 }
1813
1814 static inline void flag_error_on_conn(struct rtrs_clt_con *con, int cm_err)
1815 {
1816         if (con->cm_err == 1) {
1817                 struct rtrs_clt_sess *sess;
1818
1819                 sess = to_clt_sess(con->c.sess);
1820                 if (atomic_dec_and_test(&sess->connected_cnt))
1821
1822                         wake_up(&sess->state_wq);
1823         }
1824         con->cm_err = cm_err;
1825 }
1826
1827 static int rtrs_clt_rdma_cm_handler(struct rdma_cm_id *cm_id,
1828                                      struct rdma_cm_event *ev)
1829 {
1830         struct rtrs_clt_con *con = cm_id->context;
1831         struct rtrs_sess *s = con->c.sess;
1832         struct rtrs_clt_sess *sess = to_clt_sess(s);
1833         int cm_err = 0;
1834
1835         switch (ev->event) {
1836         case RDMA_CM_EVENT_ADDR_RESOLVED:
1837                 cm_err = rtrs_rdma_addr_resolved(con);
1838                 break;
1839         case RDMA_CM_EVENT_ROUTE_RESOLVED:
1840                 cm_err = rtrs_rdma_route_resolved(con);
1841                 break;
1842         case RDMA_CM_EVENT_ESTABLISHED:
1843                 con->cm_err = rtrs_rdma_conn_established(con, ev);
1844                 if (likely(!con->cm_err)) {
1845                         /*
1846                          * Report success and wake up. Here we abuse state_wq,
1847                          * i.e. wake up without state change, but we set cm_err.
1848                          */
1849                         flag_success_on_conn(con);
1850                         wake_up(&sess->state_wq);
1851                         return 0;
1852                 }
1853                 break;
1854         case RDMA_CM_EVENT_REJECTED:
1855                 cm_err = rtrs_rdma_conn_rejected(con, ev);
1856                 break;
1857         case RDMA_CM_EVENT_CONNECT_ERROR:
1858         case RDMA_CM_EVENT_UNREACHABLE:
1859                 rtrs_wrn(s, "CM error event %d\n", ev->event);
1860                 cm_err = -ECONNRESET;
1861                 break;
1862         case RDMA_CM_EVENT_ADDR_ERROR:
1863         case RDMA_CM_EVENT_ROUTE_ERROR:
1864                 cm_err = -EHOSTUNREACH;
1865                 break;
1866         case RDMA_CM_EVENT_DISCONNECTED:
1867         case RDMA_CM_EVENT_ADDR_CHANGE:
1868         case RDMA_CM_EVENT_TIMEWAIT_EXIT:
1869                 cm_err = -ECONNRESET;
1870                 break;
1871         case RDMA_CM_EVENT_DEVICE_REMOVAL:
1872                 /*
1873                  * Device removal is a special case.  Queue close and return 0.
1874                  */
1875                 rtrs_clt_close_conns(sess, false);
1876                 return 0;
1877         default:
1878                 rtrs_err(s, "Unexpected RDMA CM event (%d)\n", ev->event);
1879                 cm_err = -ECONNRESET;
1880                 break;
1881         }
1882
1883         if (cm_err) {
1884                 /*
1885                  * cm error makes sense only on connection establishing,
1886                  * in other cases we rely on normal procedure of reconnecting.
1887                  */
1888                 flag_error_on_conn(con, cm_err);
1889                 rtrs_rdma_error_recovery(con);
1890         }
1891
1892         return 0;
1893 }
1894
/*
 * create_cm() - create a CM id for @con and wait for the connection result
 * @con: connection to establish.
 *
 * Creates the CM id (RDMA_PS_IB for AF_IB destinations, RDMA_PS_TCP
 * otherwise), enables address reuse and starts address resolution.  Then
 * sleeps on the session's state_wq until con->cm_err becomes non-zero, the
 * session leaves the CONNECTING state, the timeout expires or the sleep is
 * interrupted.
 *
 * Returns 0 on success.  On failure the CM id is destroyed; failures after
 * address resolution was started additionally stop the CM and destroy the
 * connection's CQ/QP.  The error is the one of the failing call, -ETIMEDOUT,
 * -ERESTARTSYS, the negative con->cm_err, or -ECONNABORTED if the session
 * state changed.
 */
static int create_cm(struct rtrs_clt_con *con)
{
        struct rtrs_sess *s = con->c.sess;
        struct rtrs_clt_sess *sess = to_clt_sess(s);
        struct rdma_cm_id *cm_id;
        int err;

        cm_id = rdma_create_id(&init_net, rtrs_clt_rdma_cm_handler, con,
                               sess->s.dst_addr.ss_family == AF_IB ?
                               RDMA_PS_IB : RDMA_PS_TCP, IB_QPT_RC);
        if (IS_ERR(cm_id)) {
                err = PTR_ERR(cm_id);
                rtrs_err(s, "Failed to create CM ID, err: %d\n", err);

                return err;
        }
        con->c.cm_id = cm_id;
        /* Reset the result the wait below is looking at */
        con->cm_err = 0;
        /* allow the port to be reused */
        err = rdma_set_reuseaddr(cm_id, 1);
        if (err != 0) {
                rtrs_err(s, "Set address reuse failed, err: %d\n", err);
                goto destroy_cm;
        }
        err = rdma_resolve_addr(cm_id, (struct sockaddr *)&sess->s.src_addr,
                                (struct sockaddr *)&sess->s.dst_addr,
                                RTRS_CONNECT_TIMEOUT_MS);
        if (err) {
                rtrs_err(s, "Failed to resolve address, err: %d\n", err);
                goto destroy_cm;
        }
        /*
         * Combine connection status and session events. This is needed
         * for waiting two possible cases: cm_err has something meaningful
         * or session state was really changed to error by device removal.
         */
        err = wait_event_interruptible_timeout(
                        sess->state_wq,
                        con->cm_err || sess->state != RTRS_CLT_CONNECTING,
                        msecs_to_jiffies(RTRS_CONNECT_TIMEOUT_MS));
        if (err == 0 || err == -ERESTARTSYS) {
                if (err == 0)
                        err = -ETIMEDOUT;
                /* Timedout or interrupted */
                goto errr;
        }
        /* A negative cm_err is an error; the CM handler sets 1 on success */
        if (con->cm_err < 0) {
                err = con->cm_err;
                goto errr;
        }
        if (READ_ONCE(sess->state) != RTRS_CLT_CONNECTING) {
                /* Device removal */
                err = -ECONNABORTED;
                goto errr;
        }

        return 0;

errr:
        stop_cm(con);
        /* Is safe to call destroy if cq_qp is not inited */
        destroy_con_cq_qp(con);
destroy_cm:
        destroy_cm(con);

        return err;
}
1962
1963 static void rtrs_clt_sess_up(struct rtrs_clt_sess *sess)
1964 {
1965         struct rtrs_clt *clt = sess->clt;
1966         int up;
1967
1968         /*
1969          * We can fire RECONNECTED event only when all paths were
1970          * connected on rtrs_clt_open(), then each was disconnected
1971          * and the first one connected again.  That's why this nasty
1972          * game with counter value.
1973          */
1974
1975         mutex_lock(&clt->paths_ev_mutex);
1976         up = ++clt->paths_up;
1977         /*
1978          * Here it is safe to access paths num directly since up counter
1979          * is greater than MAX_PATHS_NUM only while rtrs_clt_open() is
1980          * in progress, thus paths removals are impossible.
1981          */
1982         if (up > MAX_PATHS_NUM && up == MAX_PATHS_NUM + clt->paths_num)
1983                 clt->paths_up = clt->paths_num;
1984         else if (up == 1)
1985                 clt->link_ev(clt->priv, RTRS_CLT_LINK_EV_RECONNECTED);
1986         mutex_unlock(&clt->paths_ev_mutex);
1987
1988         /* Mark session as established */
1989         sess->established = true;
1990         sess->reconnect_attempts = 0;
1991         sess->stats->reconnects.successful_cnt++;
1992 }
1993
1994 static void rtrs_clt_sess_down(struct rtrs_clt_sess *sess)
1995 {
1996         struct rtrs_clt *clt = sess->clt;
1997
1998         if (!sess->established)
1999                 return;
2000
2001         sess->established = false;
2002         mutex_lock(&clt->paths_ev_mutex);
2003         WARN_ON(!clt->paths_up);
2004         if (--clt->paths_up == 0)
2005                 clt->link_ev(clt->priv, RTRS_CLT_LINK_EV_DISCONNECTED);
2006         mutex_unlock(&clt->paths_ev_mutex);
2007 }
2008
/*
 * rtrs_clt_stop_and_destroy_conns() - tear down all connections of @sess
 * @sess: client session, expected not to be in CONNECTED state.
 *
 * Stops the heartbeat, stops the CM of every allocated connection, fails
 * and frees the session requests, reports the session as down, waits
 * (bounded by RTRS_CONNECT_TIMEOUT_MS) for connected_cnt to drop to zero
 * and finally destroys CQ/QP, CM id and the connection objects.
 */
static void rtrs_clt_stop_and_destroy_conns(struct rtrs_clt_sess *sess)
{
        struct rtrs_clt_con *con;
        unsigned int cid;

        WARN_ON(READ_ONCE(sess->state) == RTRS_CLT_CONNECTED);

        /*
         * Possible race with rtrs_clt_open(), when DEVICE_REMOVAL comes
         * exactly in between.  Start destroying after it finishes.
         */
        mutex_lock(&sess->init_mutex);
        mutex_unlock(&sess->init_mutex);

        /*
         * All IO paths must observe !CONNECTED state before we
         * free everything.
         */
        synchronize_rcu();

        rtrs_clt_stop_hb(sess);

        /*
         * The order is utterly crucial: firstly disconnect and complete all
         * rdma requests with error (thus set in_use=false for requests),
         * then fail outstanding requests checking in_use for each, and
         * eventually notify upper layer about session disconnection.
         */

        for (cid = 0; cid < sess->s.con_num; cid++) {
                /* The first empty slot ends the list of connections */
                if (!sess->s.con[cid])
                        break;
                con = to_clt_con(sess->s.con[cid]);
                stop_cm(con);
        }
        fail_all_outstanding_reqs(sess);
        free_sess_reqs(sess);
        rtrs_clt_sess_down(sess);

        /*
         * Wait for graceful shutdown, namely when peer side invokes
         * rdma_disconnect(). 'connected_cnt' is decremented only on
         * CM events, thus if other side had crashed and hb has detected
         * something is wrong, here we will be stuck for exactly timeout ms,
         * since CM does not fire anything.  That is fine, we are not in
         * a hurry.
         */
        wait_event_timeout(sess->state_wq, !atomic_read(&sess->connected_cnt),
                           msecs_to_jiffies(RTRS_CONNECT_TIMEOUT_MS));

        for (cid = 0; cid < sess->s.con_num; cid++) {
                if (!sess->s.con[cid])
                        break;
                con = to_clt_con(sess->s.con[cid]);
                destroy_con_cq_qp(con);
                destroy_cm(con);
                destroy_con(con);
        }
}
2068
2069 static inline bool xchg_sessions(struct rtrs_clt_sess __rcu **rcu_ppcpu_path,
2070                                  struct rtrs_clt_sess *sess,
2071                                  struct rtrs_clt_sess *next)
2072 {
2073         struct rtrs_clt_sess **ppcpu_path;
2074
2075         /* Call cmpxchg() without sparse warnings */
2076         ppcpu_path = (typeof(ppcpu_path))rcu_ppcpu_path;
2077         return sess == cmpxchg(ppcpu_path, sess, next);
2078 }
2079
/*
 * rtrs_clt_remove_path_from_arr() - unlink @sess from the client's paths
 * @sess: session (path) to remove.
 *
 * Under paths_mutex: removes @sess from the RCU paths list, waits for a
 * grace period, decrements paths_num and replaces every per-CPU path
 * pointer that still refers to @sess with the next path (or NULL).  If at
 * least one pointer was replaced, waits for another grace period.
 */
static void rtrs_clt_remove_path_from_arr(struct rtrs_clt_sess *sess)
{
        struct rtrs_clt *clt = sess->clt;
        struct rtrs_clt_sess *next;
        bool wait_for_grace = false;
        int cpu;

        mutex_lock(&clt->paths_mutex);
        list_del_rcu(&sess->s.entry);

        /* Make sure everybody observes path removal. */
        synchronize_rcu();

        /*
         * At this point nobody sees @sess in the list, but still we have
         * dangling pointer @pcpu_path which _can_ point to @sess.  Since
         * nobody can observe @sess in the list, we guarantee that IO path
         * will not assign @sess to @pcpu_path, i.e. @pcpu_path can be equal
         * to @sess, but can never again become @sess.
         */

        /*
         * Decrement paths number only after grace period, because
         * caller of do_each_path() must firstly observe list without
         * path and only then decremented paths number.
         *
         * Otherwise there can be the following situation:
         *    o Two paths exist and IO is coming.
         *    o One path is removed:
         *      CPU#0                          CPU#1
         *      do_each_path():                rtrs_clt_remove_path_from_arr():
         *          path = get_next_path()
         *          ^^^                            list_del_rcu(path)
         *          [!CONNECTED path]              clt->paths_num--
         *                                              ^^^^^^^^^
         *          load clt->paths_num                 from 2 to 1
         *                    ^^^^^^^^^
         *                    sees 1
         *
         *      path is observed as !CONNECTED, but do_each_path() loop
         *      ends, because expression i < clt->paths_num is false.
         */
        clt->paths_num--;

        /*
         * Get @next connection from current @sess which is going to be
         * removed.  If @sess is the last element, then @next is NULL.
         */
        rcu_read_lock();
        next = list_next_or_null_rr_rcu(&clt->paths_list, &sess->s.entry,
                                        typeof(*next), s.entry);
        rcu_read_unlock();

        /*
         * @pcpu paths can still point to the path which is going to be
         * removed, so change the pointer manually.
         */
        for_each_possible_cpu(cpu) {
                struct rtrs_clt_sess __rcu **ppcpu_path;

                ppcpu_path = per_cpu_ptr(clt->pcpu_path, cpu);
                if (rcu_dereference_protected(*ppcpu_path,
                        lockdep_is_held(&clt->paths_mutex)) != sess)
                        /*
                         * synchronize_rcu() was called just after deleting
                         * entry from the list, thus IO code path cannot
                         * change pointer back to the pointer which is going
                         * to be removed, we are safe here.
                         */
                        continue;

                /*
                 * We race with IO code path, which also changes pointer,
                 * thus we have to be careful not to overwrite it.
                 */
                if (xchg_sessions(ppcpu_path, sess, next))
                        /*
                         * @ppcpu_path was successfully replaced with @next,
                         * that means that someone could also pick up the
                         * @sess and dereferencing it right now, so wait for
                         * a grace period is required.
                         */
                        wait_for_grace = true;
        }
        if (wait_for_grace)
                synchronize_rcu();

        mutex_unlock(&clt->paths_mutex);
}
2169
2170 static void rtrs_clt_add_path_to_arr(struct rtrs_clt_sess *sess,
2171                                       struct rtrs_addr *addr)
2172 {
2173         struct rtrs_clt *clt = sess->clt;
2174
2175         mutex_lock(&clt->paths_mutex);
2176         clt->paths_num++;
2177
2178         list_add_tail_rcu(&sess->s.entry, &clt->paths_list);
2179         mutex_unlock(&clt->paths_mutex);
2180 }
2181
2182 static void rtrs_clt_close_work(struct work_struct *work)
2183 {
2184         struct rtrs_clt_sess *sess;
2185
2186         sess = container_of(work, struct rtrs_clt_sess, close_work);
2187
2188         cancel_delayed_work_sync(&sess->reconnect_dwork);
2189         rtrs_clt_stop_and_destroy_conns(sess);
2190         rtrs_clt_change_state(sess, RTRS_CLT_CLOSED);
2191 }
2192
/*
 * init_conns() - create and connect all RDMA connections of @sess
 * @sess: client session.
 *
 * Increments the reconnect counter, then for every cid creates the
 * connection object and establishes it through create_cm().  Afterwards
 * allocates the session requests and starts the heartbeat.
 *
 * Returns 0 on success.  On failure all connections created so far are
 * stopped and destroyed, the session is switched to CONNECTING_ERR and
 * the error of the failing step is returned.
 */
static int init_conns(struct rtrs_clt_sess *sess)
{
        unsigned int cid;
        int err;

        /*
         * On every new session connections increase reconnect counter
         * to avoid clashes with previous sessions not yet closed
         * sessions on a server side.
         */
        sess->s.recon_cnt++;

        /* Establish all RDMA connections  */
        for (cid = 0; cid < sess->s.con_num; cid++) {
                err = create_con(sess, cid);
                if (err)
                        goto destroy;

                err = create_cm(to_clt_con(sess->s.con[cid]));
                if (err) {
                        /*
                         * Only the connection object of this cid is
                         * destroyed here; the unwind loop below covers
                         * the cids before it.
                         */
                        destroy_con(to_clt_con(sess->s.con[cid]));
                        goto destroy;
                }
        }
        err = alloc_sess_reqs(sess);
        if (err)
                goto destroy;

        rtrs_clt_start_hb(sess);

        return 0;

destroy:
        /* Unwind in reverse order, starting below the failing cid */
        while (cid--) {
                struct rtrs_clt_con *con = to_clt_con(sess->s.con[cid]);

                stop_cm(con);
                destroy_con_cq_qp(con);
                destroy_cm(con);
                destroy_con(con);
        }
        /*
         * If we've never taken async path and got an error, say,
         * doing rdma_resolve_addr(), switch to CONNECTION_ERR state
         * manually to keep reconnecting.
         */
        rtrs_clt_change_state(sess, RTRS_CLT_CONNECTING_ERR);

        return err;
}
2243
2244 static void rtrs_clt_info_req_done(struct ib_cq *cq, struct ib_wc *wc)
2245 {
2246         struct rtrs_clt_con *con = cq->cq_context;
2247         struct rtrs_clt_sess *sess = to_clt_sess(con->c.sess);
2248         struct rtrs_iu *iu;
2249
2250         iu = container_of(wc->wr_cqe, struct rtrs_iu, cqe);
2251         rtrs_iu_free(iu, DMA_TO_DEVICE, sess->s.dev->ib_dev, 1);
2252
2253         if (unlikely(wc->status != IB_WC_SUCCESS)) {
2254                 rtrs_err(sess->clt, "Sess info request send failed: %s\n",
2255                           ib_wc_status_msg(wc->status));
2256                 rtrs_clt_change_state(sess, RTRS_CLT_CONNECTING_ERR);
2257                 return;
2258         }
2259
2260         rtrs_clt_update_wc_stats(con);
2261 }
2262
/*
 * process_info_rsp() - validate the info response and fill sess->rbufs
 * @sess: client session whose rbufs array is filled.
 * @msg:  info response holding sg_cnt buffer descriptors.
 *
 * Every descriptor describes a memory region that is split into chunks of
 * sess->chunk_size bytes; each chunk becomes one rbufs entry (address and
 * rkey).  All descriptors together must map exactly sess->queue_depth
 * chunks.
 *
 * Returns 0 on success, -EINVAL if the response is inconsistent.
 */
static int process_info_rsp(struct rtrs_clt_sess *sess,
                            const struct rtrs_msg_info_rsp *msg)
{
        unsigned int sg_cnt, total_len;
        int i, sgi;

        sg_cnt = le16_to_cpu(msg->sg_cnt);
        if (unlikely(!sg_cnt))
                return -EINVAL;
        /*
         * Check if IB immediate data size is enough to hold the mem_id and
         * the offset inside the memory chunk.
         */
        if (unlikely((ilog2(sg_cnt - 1) + 1) +
                     (ilog2(sess->chunk_size - 1) + 1) >
                     MAX_IMM_PAYL_BITS)) {
                rtrs_err(sess->clt,
                          "RDMA immediate size (%db) not enough to encode %d buffers of size %dB\n",
                          MAX_IMM_PAYL_BITS, sg_cnt, sess->chunk_size);
                return -EINVAL;
        }
        /*
         * queue_depth must be a multiple of sg_cnt.  The !sg_cnt part is
         * redundant here, a zero sg_cnt was already rejected above.
         */
        if (unlikely(!sg_cnt || (sess->queue_depth % sg_cnt))) {
                rtrs_err(sess->clt, "Incorrect sg_cnt %d, is not multiple\n",
                          sg_cnt);
                return -EINVAL;
        }
        total_len = 0;
        /* sgi walks the descriptors, i walks the rbufs entries */
        for (sgi = 0, i = 0; sgi < sg_cnt && i < sess->queue_depth; sgi++) {
                const struct rtrs_sg_desc *desc = &msg->desc[sgi];
                u32 len, rkey;
                u64 addr;

                addr = le64_to_cpu(desc->addr);
                rkey = le32_to_cpu(desc->key);
                len  = le32_to_cpu(desc->len);

                total_len += len;

                /* A region must hold a whole number of chunks */
                if (unlikely(!len || (len % sess->chunk_size))) {
                        rtrs_err(sess->clt, "Incorrect [%d].len %d\n", sgi,
                                  len);
                        return -EINVAL;
                }
                /* Carve the region into chunk_size sized buffers */
                for ( ; len && i < sess->queue_depth; i++) {
                        sess->rbufs[i].addr = addr;
                        sess->rbufs[i].rkey = rkey;

                        len  -= sess->chunk_size;
                        addr += sess->chunk_size;
                }
        }
        /* Sanity check */
        if (unlikely(sgi != sg_cnt || i != sess->queue_depth)) {
                rtrs_err(sess->clt, "Incorrect sg vector, not fully mapped\n");
                return -EINVAL;
        }
        if (unlikely(total_len != sess->chunk_size * sess->queue_depth)) {
                rtrs_err(sess->clt, "Incorrect total_len %d\n", total_len);
                return -EINVAL;
        }

        return 0;
}
2326
/*
 * rtrs_clt_info_rsp_done() - completion handler of the info response recv
 * @cq: completion queue; its context is the rtrs_clt_con.
 * @wc: work completion of the receive.
 *
 * Validates the received info response (completion status, size, type),
 * lets process_info_rsp() fill the session's rbufs and posts the receives
 * of the session.  The session ends up in CONNECTED only if every step
 * succeeded, otherwise in CONNECTING_ERR.  The receive IU is freed in
 * either case.
 */
static void rtrs_clt_info_rsp_done(struct ib_cq *cq, struct ib_wc *wc)
{
        struct rtrs_clt_con *con = cq->cq_context;
        struct rtrs_clt_sess *sess = to_clt_sess(con->c.sess);
        struct rtrs_msg_info_rsp *msg;
        enum rtrs_clt_state state;
        struct rtrs_iu *iu;
        size_t rx_sz;
        int err;

        /* Assume failure until every check below has passed */
        state = RTRS_CLT_CONNECTING_ERR;

        /* The info response is expected on the connection with cid 0 */
        WARN_ON(con->c.cid);
        iu = container_of(wc->wr_cqe, struct rtrs_iu, cqe);
        if (unlikely(wc->status != IB_WC_SUCCESS)) {
                rtrs_err(sess->clt, "Sess info response recv failed: %s\n",
                          ib_wc_status_msg(wc->status));
                goto out;
        }
        WARN_ON(wc->opcode != IB_WC_RECV);

        /* At least the fixed part of the message must have arrived */
        if (unlikely(wc->byte_len < sizeof(*msg))) {
                rtrs_err(sess->clt, "Sess info response is malformed: size %d\n",
                          wc->byte_len);
                goto out;
        }
        ib_dma_sync_single_for_cpu(sess->s.dev->ib_dev, iu->dma_addr,
                                   iu->size, DMA_FROM_DEVICE);
        msg = iu->buf;
        if (unlikely(le16_to_cpu(msg->type) != RTRS_MSG_INFO_RSP)) {
                rtrs_err(sess->clt, "Sess info response is malformed: type %d\n",
                          le16_to_cpu(msg->type));
                goto out;
        }
        /* The message must also hold all sg_cnt announced descriptors */
        rx_sz  = sizeof(*msg);
        rx_sz += sizeof(msg->desc[0]) * le16_to_cpu(msg->sg_cnt);
        if (unlikely(wc->byte_len < rx_sz)) {
                rtrs_err(sess->clt, "Sess info response is malformed: size %d\n",
                          wc->byte_len);
                goto out;
        }
        err = process_info_rsp(sess, msg);
        if (unlikely(err))
                goto out;

        err = post_recv_sess(sess);
        if (unlikely(err))
                goto out;

        state = RTRS_CLT_CONNECTED;

out:
        rtrs_clt_update_wc_stats(con);
        rtrs_iu_free(iu, DMA_FROM_DEVICE, sess->s.dev->ib_dev, 1);
        rtrs_clt_change_state(sess, state);
}
2383
2384 static int rtrs_send_sess_info(struct rtrs_clt_sess *sess)
2385 {
2386         struct rtrs_clt_con *usr_con = to_clt_con(sess->s.con[0]);
2387         struct rtrs_msg_info_req *msg;
2388         struct rtrs_iu *tx_iu, *rx_iu;
2389         size_t rx_sz;
2390         int err;
2391
2392         rx_sz  = sizeof(struct rtrs_msg_info_rsp);
2393         rx_sz += sizeof(u64) * MAX_SESS_QUEUE_DEPTH;
2394
2395         tx_iu = rtrs_iu_alloc(1, sizeof(struct rtrs_msg_info_req), GFP_KERNEL,
2396                                sess->s.dev->ib_dev, DMA_TO_DEVICE,
2397                                rtrs_clt_info_req_done);
2398         rx_iu = rtrs_iu_alloc(1, rx_sz, GFP_KERNEL, sess->s.dev->ib_dev,
2399                                DMA_FROM_DEVICE, rtrs_clt_info_rsp_done);
2400         if (unlikely(!tx_iu || !rx_iu)) {
2401                 err = -ENOMEM;
2402                 goto out;
2403         }
2404         /* Prepare for getting info response */
2405         err = rtrs_iu_post_recv(&usr_con->c, rx_iu);
2406         if (unlikely(err)) {
2407                 rtrs_err(sess->clt, "rtrs_iu_post_recv(), err: %d\n", err);
2408                 goto out;
2409         }
2410         rx_iu = NULL;
2411
2412         msg = tx_iu->buf;
2413         msg->type = cpu_to_le16(RTRS_MSG_INFO_REQ);
2414         memcpy(msg->sessname, sess->s.sessname, sizeof(msg->sessname));
2415
2416         ib_dma_sync_single_for_device(sess->s.dev->ib_dev, tx_iu->dma_addr,
2417                                       tx_iu->size, DMA_TO_DEVICE);
2418
2419         /* Send info request */
2420         err = rtrs_iu_post_send(&usr_con->c, tx_iu, sizeof(*msg), NULL);
2421         if (unlikely(err)) {
2422                 rtrs_err(sess->clt, "rtrs_iu_post_send(), err: %d\n", err);
2423                 goto out;
2424         }
2425         tx_iu = NULL;
2426
2427         /* Wait for state change */
2428         wait_event_interruptible_timeout(sess->state_wq,
2429                                          sess->state != RTRS_CLT_CONNECTING,
2430                                          msecs_to_jiffies(
2431                                                  RTRS_CONNECT_TIMEOUT_MS));
2432         if (unlikely(READ_ONCE(sess->state) != RTRS_CLT_CONNECTED)) {
2433                 if (READ_ONCE(sess->state) == RTRS_CLT_CONNECTING_ERR)
2434                         err = -ECONNRESET;
2435                 else
2436                         err = -ETIMEDOUT;
2437                 goto out;
2438         }
2439
2440 out:
2441         if (tx_iu)
2442                 rtrs_iu_free(tx_iu, DMA_TO_DEVICE, sess->s.dev->ib_dev, 1);
2443         if (rx_iu)
2444                 rtrs_iu_free(rx_iu, DMA_FROM_DEVICE, sess->s.dev->ib_dev, 1);
2445         if (unlikely(err))
2446                 /* If we've never taken async path because of malloc problems */
2447                 rtrs_clt_change_state(sess, RTRS_CLT_CONNECTING_ERR);
2448
2449         return err;
2450 }
2451
2452 /**
2453  * init_sess() - establishes all session connections and does handshake
2454  * @sess: client session.
2455  * In case of error full close or reconnect procedure should be taken,
2456  * because reconnect or close async works can be started.
2457  */
2458 static int init_sess(struct rtrs_clt_sess *sess)
2459 {
2460         int err;
2461
2462         mutex_lock(&sess->init_mutex);
2463         err = init_conns(sess);
2464         if (err) {
2465                 rtrs_err(sess->clt, "init_conns(), err: %d\n", err);
2466                 goto out;
2467         }
2468         err = rtrs_send_sess_info(sess);
2469         if (err) {
2470                 rtrs_err(sess->clt, "rtrs_send_sess_info(), err: %d\n", err);
2471                 goto out;
2472         }
2473         rtrs_clt_sess_up(sess);
2474 out:
2475         mutex_unlock(&sess->init_mutex);
2476
2477         return err;
2478 }
2479
2480 static void rtrs_clt_reconnect_work(struct work_struct *work)
2481 {
2482         struct rtrs_clt_sess *sess;
2483         struct rtrs_clt *clt;
2484         unsigned int delay_ms;
2485         int err;
2486
2487         sess = container_of(to_delayed_work(work), struct rtrs_clt_sess,
2488                             reconnect_dwork);
2489         clt = sess->clt;
2490
2491         if (READ_ONCE(sess->state) != RTRS_CLT_RECONNECTING)
2492                 return;
2493
2494         if (sess->reconnect_attempts >= clt->max_reconnect_attempts) {
2495                 /* Close a session completely if max attempts is reached */
2496                 rtrs_clt_close_conns(sess, false);
2497                 return;
2498         }
2499         sess->reconnect_attempts++;
2500
2501         /* Stop everything */
2502         rtrs_clt_stop_and_destroy_conns(sess);
2503         msleep(RTRS_RECONNECT_BACKOFF);
2504         if (rtrs_clt_change_state(sess, RTRS_CLT_CONNECTING)) {
2505                 err = init_sess(sess);
2506                 if (err)
2507                         goto reconnect_again;
2508         }
2509
2510         return;
2511
2512 reconnect_again:
2513         if (rtrs_clt_change_state(sess, RTRS_CLT_RECONNECTING)) {
2514                 sess->stats->reconnects.fail_cnt++;
2515                 delay_ms = clt->reconnect_delay_sec * 1000;
2516                 queue_delayed_work(rtrs_wq, &sess->reconnect_dwork,
2517                                    msecs_to_jiffies(delay_ms));
2518         }
2519 }
2520
2521 static void rtrs_clt_dev_release(struct device *dev)
2522 {
2523         struct rtrs_clt *clt = container_of(dev, struct rtrs_clt, dev);
2524
2525         kfree(clt);
2526 }
2527
2528 static struct rtrs_clt *alloc_clt(const char *sessname, size_t paths_num,
2529                                   u16 port, size_t pdu_sz, void *priv,
2530                                   void  (*link_ev)(void *priv,
2531                                                    enum rtrs_clt_link_ev ev),
2532                                   unsigned int max_segments,
2533                                   unsigned int reconnect_delay_sec,
2534                                   unsigned int max_reconnect_attempts)
2535 {
2536         struct rtrs_clt *clt;
2537         int err;
2538
2539         if (!paths_num || paths_num > MAX_PATHS_NUM)
2540                 return ERR_PTR(-EINVAL);
2541
2542         if (strlen(sessname) >= sizeof(clt->sessname))
2543                 return ERR_PTR(-EINVAL);
2544
2545         clt = kzalloc(sizeof(*clt), GFP_KERNEL);
2546         if (!clt)
2547                 return ERR_PTR(-ENOMEM);
2548
2549         clt->pcpu_path = alloc_percpu(typeof(*clt->pcpu_path));
2550         if (!clt->pcpu_path) {
2551                 kfree(clt);
2552                 return ERR_PTR(-ENOMEM);
2553         }
2554
2555         uuid_gen(&clt->paths_uuid);
2556         INIT_LIST_HEAD_RCU(&clt->paths_list);
2557         clt->paths_num = paths_num;
2558         clt->paths_up = MAX_PATHS_NUM;
2559         clt->port = port;
2560         clt->pdu_sz = pdu_sz;
2561         clt->max_segments = max_segments;
2562         clt->reconnect_delay_sec = reconnect_delay_sec;
2563         clt->max_reconnect_attempts = max_reconnect_attempts;
2564         clt->priv = priv;
2565         clt->link_ev = link_ev;
2566         clt->mp_policy = MP_POLICY_MIN_INFLIGHT;
2567         strlcpy(clt->sessname, sessname, sizeof(clt->sessname));
2568         init_waitqueue_head(&clt->permits_wait);
2569         mutex_init(&clt->paths_ev_mutex);
2570         mutex_init(&clt->paths_mutex);
2571
2572         clt->dev.class = rtrs_clt_dev_class;
2573         clt->dev.release = rtrs_clt_dev_release;
2574         err = dev_set_name(&clt->dev, "%s", sessname);
2575         if (err) {
2576                 free_percpu(clt->pcpu_path);
2577                 kfree(clt);
2578                 return ERR_PTR(err);
2579         }
2580         /*
2581          * Suppress user space notification until
2582          * sysfs files are created
2583          */
2584         dev_set_uevent_suppress(&clt->dev, true);
2585         err = device_register(&clt->dev);
2586         if (err) {
2587                 free_percpu(clt->pcpu_path);
2588                 put_device(&clt->dev);
2589                 return ERR_PTR(err);
2590         }
2591
2592         clt->kobj_paths = kobject_create_and_add("paths", &clt->dev.kobj);
2593         if (!clt->kobj_paths) {
2594                 free_percpu(clt->pcpu_path);
2595                 device_unregister(&clt->dev);
2596                 return NULL;
2597         }
2598         err = rtrs_clt_create_sysfs_root_files(clt);
2599         if (err) {
2600                 free_percpu(clt->pcpu_path);
2601                 kobject_del(clt->kobj_paths);
2602                 kobject_put(clt->kobj_paths);
2603                 device_unregister(&clt->dev);
2604                 return ERR_PTR(err);
2605         }
2606         dev_set_uevent_suppress(&clt->dev, false);
2607         kobject_uevent(&clt->dev.kobj, KOBJ_ADD);
2608
2609         return clt;
2610 }
2611
2612 static void wait_for_inflight_permits(struct rtrs_clt *clt)
2613 {
2614         if (clt->permits_map) {
2615                 size_t sz = clt->queue_depth;
2616
2617                 wait_event(clt->permits_wait,
2618                            find_first_bit(clt->permits_map, sz) >= sz);
2619         }
2620 }
2621
2622 static void free_clt(struct rtrs_clt *clt)
2623 {
2624         wait_for_inflight_permits(clt);
2625         free_permits(clt);
2626         free_percpu(clt->pcpu_path);
2627         mutex_destroy(&clt->paths_ev_mutex);
2628         mutex_destroy(&clt->paths_mutex);
2629         /* release callback will free clt in last put */
2630         device_unregister(&clt->dev);
2631 }
2632
2633 /**
2634  * rtrs_clt_open() - Open a session to an RTRS server
2635  * @ops: holds the link event callback and the private pointer.
2636  * @sessname: name of the session
2637  * @paths: Paths to be established defined by their src and dst addresses
2638  * @paths_num: Number of elements in the @paths array
2639  * @port: port to be used by the RTRS session
2640  * @pdu_sz: Size of extra payload which can be accessed after permit allocation.
2641  * @reconnect_delay_sec: time between reconnect tries
2642  * @max_segments: Max. number of segments per IO request
2643  * @max_reconnect_attempts: Number of times to reconnect on error before giving
2644  *                          up, 0 for * disabled, -1 for forever
2645  *
2646  * Starts session establishment with the rtrs_server. The function can block
2647  * up to ~2000ms before it returns.
2648  *
2649  * Return a valid pointer on success otherwise PTR_ERR.
2650  */
2651 struct rtrs_clt *rtrs_clt_open(struct rtrs_clt_ops *ops,
2652                                  const char *sessname,
2653                                  const struct rtrs_addr *paths,
2654                                  size_t paths_num, u16 port,
2655                                  size_t pdu_sz, u8 reconnect_delay_sec,
2656                                  u16 max_segments,
2657                                  s16 max_reconnect_attempts)
2658 {
2659         struct rtrs_clt_sess *sess, *tmp;
2660         struct rtrs_clt *clt;
2661         int err, i;
2662
2663         clt = alloc_clt(sessname, paths_num, port, pdu_sz, ops->priv,
2664                         ops->link_ev,
2665                         max_segments, reconnect_delay_sec,
2666                         max_reconnect_attempts);
2667         if (IS_ERR(clt)) {
2668                 err = PTR_ERR(clt);
2669                 goto out;
2670         }
2671         for (i = 0; i < paths_num; i++) {
2672                 struct rtrs_clt_sess *sess;
2673
2674                 sess = alloc_sess(clt, &paths[i], nr_cpu_ids,
2675                                   max_segments);
2676                 if (IS_ERR(sess)) {
2677                         err = PTR_ERR(sess);
2678                         goto close_all_sess;
2679                 }
2680                 list_add_tail_rcu(&sess->s.entry, &clt->paths_list);
2681
2682                 err = init_sess(sess);
2683                 if (err) {
2684                         list_del_rcu(&sess->s.entry);
2685                         rtrs_clt_close_conns(sess, true);
2686                         free_sess(sess);
2687                         goto close_all_sess;
2688                 }
2689
2690                 err = rtrs_clt_create_sess_files(sess);
2691                 if (err) {
2692                         list_del_rcu(&sess->s.entry);
2693                         rtrs_clt_close_conns(sess, true);
2694                         free_sess(sess);
2695                         goto close_all_sess;
2696                 }
2697         }
2698         err = alloc_permits(clt);
2699         if (err)
2700                 goto close_all_sess;
2701
2702         return clt;
2703
2704 close_all_sess:
2705         list_for_each_entry_safe(sess, tmp, &clt->paths_list, s.entry) {
2706                 rtrs_clt_destroy_sess_files(sess, NULL);
2707                 rtrs_clt_close_conns(sess, true);
2708                 kobject_put(&sess->kobj);
2709         }
2710         rtrs_clt_destroy_sysfs_root_files(clt);
2711         rtrs_clt_destroy_sysfs_root_folders(clt);
2712         free_clt(clt);
2713
2714 out:
2715         return ERR_PTR(err);
2716 }
2717 EXPORT_SYMBOL(rtrs_clt_open);
2718
2719 /**
2720  * rtrs_clt_close() - Close a session
2721  * @clt: Session handle. Session is freed upon return.
2722  */
2723 void rtrs_clt_close(struct rtrs_clt *clt)
2724 {
2725         struct rtrs_clt_sess *sess, *tmp;
2726
2727         /* Firstly forbid sysfs access */
2728         rtrs_clt_destroy_sysfs_root_files(clt);
2729         rtrs_clt_destroy_sysfs_root_folders(clt);
2730
2731         /* Now it is safe to iterate over all paths without locks */
2732         list_for_each_entry_safe(sess, tmp, &clt->paths_list, s.entry) {
2733                 rtrs_clt_destroy_sess_files(sess, NULL);
2734                 rtrs_clt_close_conns(sess, true);
2735                 kobject_put(&sess->kobj);
2736         }
2737         free_clt(clt);
2738 }
2739 EXPORT_SYMBOL(rtrs_clt_close);
2740
2741 int rtrs_clt_reconnect_from_sysfs(struct rtrs_clt_sess *sess)
2742 {
2743         enum rtrs_clt_state old_state;
2744         int err = -EBUSY;
2745         bool changed;
2746
2747         changed = rtrs_clt_change_state_get_old(sess, RTRS_CLT_RECONNECTING,
2748                                                  &old_state);
2749         if (changed) {
2750                 sess->reconnect_attempts = 0;
2751                 queue_delayed_work(rtrs_wq, &sess->reconnect_dwork, 0);
2752         }
2753         if (changed || old_state == RTRS_CLT_RECONNECTING) {
2754                 /*
2755                  * flush_delayed_work() queues pending work for immediate
2756                  * execution, so do the flush if we have queued something
2757                  * right now or work is pending.
2758                  */
2759                 flush_delayed_work(&sess->reconnect_dwork);
2760                 err = (READ_ONCE(sess->state) ==
2761                        RTRS_CLT_CONNECTED ? 0 : -ENOTCONN);
2762         }
2763
2764         return err;
2765 }
2766
2767 int rtrs_clt_disconnect_from_sysfs(struct rtrs_clt_sess *sess)
2768 {
2769         rtrs_clt_close_conns(sess, true);
2770
2771         return 0;
2772 }
2773
2774 int rtrs_clt_remove_path_from_sysfs(struct rtrs_clt_sess *sess,
2775                                      const struct attribute *sysfs_self)
2776 {
2777         enum rtrs_clt_state old_state;
2778         bool changed;
2779
2780         /*
2781          * Continue stopping path till state was changed to DEAD or
2782          * state was observed as DEAD:
2783          * 1. State was changed to DEAD - we were fast and nobody
2784          *    invoked rtrs_clt_reconnect(), which can again start
2785          *    reconnecting.
2786          * 2. State was observed as DEAD - we have someone in parallel
2787          *    removing the path.
2788          */
2789         do {
2790                 rtrs_clt_close_conns(sess, true);
2791                 changed = rtrs_clt_change_state_get_old(sess,
2792                                                         RTRS_CLT_DEAD,
2793                                                         &old_state);
2794         } while (!changed && old_state != RTRS_CLT_DEAD);
2795
2796         if (likely(changed)) {
2797                 rtrs_clt_destroy_sess_files(sess, sysfs_self);
2798                 rtrs_clt_remove_path_from_arr(sess);
2799                 kobject_put(&sess->kobj);
2800         }
2801
2802         return 0;
2803 }
2804
2805 void rtrs_clt_set_max_reconnect_attempts(struct rtrs_clt *clt, int value)
2806 {
2807         clt->max_reconnect_attempts = (unsigned int)value;
2808 }
2809
2810 int rtrs_clt_get_max_reconnect_attempts(const struct rtrs_clt *clt)
2811 {
2812         return (int)clt->max_reconnect_attempts;
2813 }
2814
2815 /**
2816  * rtrs_clt_request() - Request data transfer to/from server via RDMA.
2817  *
2818  * @dir:        READ/WRITE
2819  * @ops:        callback function to be called as confirmation, and the pointer.
2820  * @clt:        Session
2821  * @permit:     Preallocated permit
2822  * @vec:        Message that is sent to server together with the request.
2823  *              Sum of len of all @vec elements limited to <= IO_MSG_SIZE.
2824  *              Since the msg is copied internally it can be allocated on stack.
2825  * @nr:         Number of elements in @vec.
2826  * @data_len:   length of data sent to/from server
2827  * @sg:         Pages to be sent/received to/from server.
2828  * @sg_cnt:     Number of elements in the @sg
2829  *
2830  * Return:
2831  * 0:           Success
2832  * <0:          Error
2833  *
2834  * On dir=READ rtrs client will request a data transfer from Server to client.
2835  * The data that the server will respond with will be stored in @sg when
2836  * the user receives an %RTRS_CLT_RDMA_EV_RDMA_REQUEST_WRITE_COMPL event.
2837  * On dir=WRITE rtrs client will rdma write data in sg to server side.
2838  */
2839 int rtrs_clt_request(int dir, struct rtrs_clt_req_ops *ops,
2840                      struct rtrs_clt *clt, struct rtrs_permit *permit,
2841                       const struct kvec *vec, size_t nr, size_t data_len,
2842                       struct scatterlist *sg, unsigned int sg_cnt)
2843 {
2844         struct rtrs_clt_io_req *req;
2845         struct rtrs_clt_sess *sess;
2846
2847         enum dma_data_direction dma_dir;
2848         int err = -ECONNABORTED, i;
2849         size_t usr_len, hdr_len;
2850         struct path_it it;
2851
2852         /* Get kvec length */
2853         for (i = 0, usr_len = 0; i < nr; i++)
2854                 usr_len += vec[i].iov_len;
2855
2856         if (dir == READ) {
2857                 hdr_len = sizeof(struct rtrs_msg_rdma_read) +
2858                           sg_cnt * sizeof(struct rtrs_sg_desc);
2859                 dma_dir = DMA_FROM_DEVICE;
2860         } else {
2861                 hdr_len = sizeof(struct rtrs_msg_rdma_write);
2862                 dma_dir = DMA_TO_DEVICE;
2863         }
2864
2865         do_each_path(sess, clt, &it) {
2866                 if (unlikely(READ_ONCE(sess->state) != RTRS_CLT_CONNECTED))
2867                         continue;
2868
2869                 if (unlikely(usr_len + hdr_len > sess->max_hdr_size)) {
2870                         rtrs_wrn_rl(sess->clt,
2871                                      "%s request failed, user message size is %zu and header length %zu, but max size is %u\n",
2872                                      dir == READ ? "Read" : "Write",
2873                                      usr_len, hdr_len, sess->max_hdr_size);
2874                         err = -EMSGSIZE;
2875                         break;
2876                 }
2877                 req = rtrs_clt_get_req(sess, ops->conf_fn, permit, ops->priv,
2878                                        vec, usr_len, sg, sg_cnt, data_len,
2879                                        dma_dir);
2880                 if (dir == READ)
2881                         err = rtrs_clt_read_req(req);
2882                 else
2883                         err = rtrs_clt_write_req(req);
2884                 if (unlikely(err)) {
2885                         req->in_use = false;
2886                         continue;
2887                 }
2888                 /* Success path */
2889                 break;
2890         } while_each_path(&it);
2891
2892         return err;
2893 }
2894 EXPORT_SYMBOL(rtrs_clt_request);
2895
2896 /**
2897  * rtrs_clt_query() - queries RTRS session attributes
2898  *@clt: session pointer
2899  *@attr: query results for session attributes.
2900  * Returns:
2901  *    0 on success
2902  *    -ECOMM            no connection to the server
2903  */
2904 int rtrs_clt_query(struct rtrs_clt *clt, struct rtrs_attrs *attr)
2905 {
2906         if (!rtrs_clt_is_connected(clt))
2907                 return -ECOMM;
2908
2909         attr->queue_depth      = clt->queue_depth;
2910         attr->max_io_size      = clt->max_io_size;
2911         attr->sess_kobj        = &clt->dev.kobj;
2912         strlcpy(attr->sessname, clt->sessname, sizeof(attr->sessname));
2913
2914         return 0;
2915 }
2916 EXPORT_SYMBOL(rtrs_clt_query);
2917
2918 int rtrs_clt_create_path_from_sysfs(struct rtrs_clt *clt,
2919                                      struct rtrs_addr *addr)
2920 {
2921         struct rtrs_clt_sess *sess;
2922         int err;
2923
2924         sess = alloc_sess(clt, addr, nr_cpu_ids, clt->max_segments);
2925         if (IS_ERR(sess))
2926                 return PTR_ERR(sess);
2927
2928         /*
2929          * It is totally safe to add path in CONNECTING state: coming
2930          * IO will never grab it.  Also it is very important to add
2931          * path before init, since init fires LINK_CONNECTED event.
2932          */
2933         rtrs_clt_add_path_to_arr(sess, addr);
2934
2935         err = init_sess(sess);
2936         if (err)
2937                 goto close_sess;
2938
2939         err = rtrs_clt_create_sess_files(sess);
2940         if (err)
2941                 goto close_sess;
2942
2943         return 0;
2944
2945 close_sess:
2946         rtrs_clt_remove_path_from_arr(sess);
2947         rtrs_clt_close_conns(sess, true);
2948         free_sess(sess);
2949
2950         return err;
2951 }
2952
2953 static int rtrs_clt_ib_dev_init(struct rtrs_ib_dev *dev)
2954 {
2955         if (!(dev->ib_dev->attrs.device_cap_flags &
2956               IB_DEVICE_MEM_MGT_EXTENSIONS)) {
2957                 pr_err("Memory registrations not supported.\n");
2958                 return -ENOTSUPP;
2959         }
2960
2961         return 0;
2962 }
2963
2964 static const struct rtrs_rdma_dev_pd_ops dev_pd_ops = {
2965         .init = rtrs_clt_ib_dev_init
2966 };
2967
2968 static int __init rtrs_client_init(void)
2969 {
2970         rtrs_rdma_dev_pd_init(0, &dev_pd);
2971
2972         rtrs_clt_dev_class = class_create(THIS_MODULE, "rtrs-client");
2973         if (IS_ERR(rtrs_clt_dev_class)) {
2974                 pr_err("Failed to create rtrs-client dev class\n");
2975                 return PTR_ERR(rtrs_clt_dev_class);
2976         }
2977         rtrs_wq = alloc_workqueue("rtrs_client_wq", WQ_MEM_RECLAIM, 0);
2978         if (!rtrs_wq) {
2979                 class_destroy(rtrs_clt_dev_class);
2980                 return -ENOMEM;
2981         }
2982
2983         return 0;
2984 }
2985
2986 static void __exit rtrs_client_exit(void)
2987 {
2988         destroy_workqueue(rtrs_wq);
2989         class_destroy(rtrs_clt_dev_class);
2990         rtrs_rdma_dev_pd_deinit(&dev_pd);
2991 }
2992
/* Module entry and exit points */
module_init(rtrs_client_init);
module_exit(rtrs_client_exit);